from pandas import Series, DataFrame
from scipy.stats import ttest_ind


def calcPairwisePvalues(tdf, factcol, groupcol=None, onetailed=False):
    pvals = []
    if groupcol is not None:
        for grp in tdf[groupcol].unique():
            lof = tdf[(tdf.Vclass == "LoF") & (tdf[groupcol] == grp)][factcol].tolist()
            rand = tdf[(tdf.Vclass == "Random") & (tdf[groupcol] == grp)][factcol].tolist()
            maxY = max(lof + rand)
            ts, pv = ttest_ind(lof, rand)
            # one-tailed variant (p/2 < alpha and t < 0) not implemented yet
            pvals.append(Series(data=[grp, pv, ts, "-2", maxY],
                                index=[groupcol, "pvalue", "T-statistic", "x", "y"]))
    else:
        lof = tdf[(tdf.Vclass == "LoF")][factcol].tolist()
        rand = tdf[(tdf.Vclass == "Random")][factcol].tolist()
        maxY = max(lof + rand)
        ts, pv = ttest_ind(lof, rand)
        # one-tailed variant (p/2 < alpha and t < 0) not implemented yet
        pvals.append(Series(data=[pv, ts, "-2", maxY],
                            index=["pvalue", "T-statistic", "x", "y"]))
    pvals = DataFrame(pvals)
    pvals["Pvalue"] = ["P-value: %.3g\nT-statistic: %.2f" % (x, y)
                       for x, y in pvals[["pvalue", "T-statistic"]].values]
    return pvals
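# Minimal usage sketch for calcPairwisePvalues on hypothetical toy data. The
# "Vclass" column and the "LoF"/"Random" labels are the ones the function
# expects; the "score" column name is made up for this example.
import pandas as pd

toy = pd.DataFrame({
    "Vclass": ["LoF"] * 5 + ["Random"] * 5,
    "score": [1.1, 0.9, 1.3, 1.2, 1.0, 0.4, 0.5, 0.6, 0.3, 0.5],
})
print(calcPairwisePvalues(toy, "score"))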
def linreg2_err(t, x, wleft=5, wright=5, hop=None, use_l=True, use_r=True):
    if hop is None:
        hop = 1
    zs = np.zeros(len(x))
    wm = 0 if wright < 0 else wright
    for ii in range(wleft, len(x) - wm, hop):
        ts = []
        xl = x[ii - wleft:ii]
        tl = t[ii - wleft:ii]
        xr = x[ii:ii + wright]
        tr = t[ii:ii + wright]
        if use_l:
            # fit the left window, compare its residuals to the right window's
            pl = np.polyfit(tl, xl, 1)
            residll = xl - np.polyval(pl, tl)
            stdll = np.std(residll)
            residlr = xr - np.polyval(pl, tr)
            stdlr = np.std(residlr)
            ttl, pvl = ttest_ind(residll, residlr)
            ts.append(ttl)
        if use_r:
            # fit the right window, compare its residuals to the left window's
            pr = np.polyfit(tr, xr, 1)
            residrr = xr - np.polyval(pr, tr)
            stdrr = np.std(residrr)
            residrl = xl - np.polyval(pr, tl)
            stdrl = np.std(residrl)
            ttr, pvr = ttest_ind(residrr, residrl)
            ts.append(-ttr)
        zs[ii] = np.mean(ts)
    return zs
def results_plots(name, min_n=2000):
    # pd.DataFrame.from_csv is deprecated; read_csv with index_col=0 is equivalent here
    frame = pd.read_csv(os.path.join(DATA_FOLDER, "storks_exp_rep", "out", name), index_col=0)
    frame = frame[frame.N >= min_n]

    # 0) significance tests
    behav_semi_sup = frame.AF_factor
    juv_idx = frame.status == 'Juv'
    adult_idx = frame.status == 'Adult'
    # [1, .5] * [t, p] halves the two-sided p-value, i.e. a one-tailed test
    print("semi: ", np.array([1, .5]) * ttest_ind(behav_semi_sup[juv_idx], behav_semi_sup[adult_idx]))
    behav_sup = frame.AF_true_frac
    print("sup: ", np.array([1, .5]) * ttest_ind(behav_sup[juv_idx], behav_sup[adult_idx]))

    # 1) scatter for AF_true_frac & AF_factor
    frame.plot(kind='scatter', x='AF_true_frac', y='AF_factor')
    plt.show()

    # 2) compare Juv/Adults with AF_factor
    m = frame.groupby(frame['status'])['AF_factor'].mean()
    s = frame.groupby(frame['status'])['AF_factor'].sem()
    m.plot(kind='bar', yerr=s)
    plt.show()

    # 3) compare Juv/Adults with AF_true_frac
    m = frame.groupby(frame['status'])['AF_true_frac'].mean()
    s = frame.groupby(frame['status'])['AF_true_frac'].sem()
    m.plot(kind='bar', yerr=s)
    plt.show()
def calc_ttest(data, exp_set, control_set, tags=()):
    # Welch's t-test (unequal variances) per probeset; .ix is deprecated, use .loc
    d = [st.ttest_ind(data.loc[probeset, list(exp_set.filenames)],
                      data.loc[probeset, list(control_set.filenames)],
                      equal_var=False)
         for probeset in data.index]
    rs = pandas.DataFrame(
        index=data.index, data=d,
        columns=[tm.e(tags + (("st", "t"), ("tt", "welch ttest"))),
                 tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))])
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bonf")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(
            rs.loc[:, tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))],
            method="bonferroni")[1]
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bh")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(
            rs.loc[:, tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))],
            method="fdr_bh")[1]

    # Student's t-test (equal variances) per probeset
    d = [st.ttest_ind(data.loc[probeset, list(exp_set.filenames)],
                      data.loc[probeset, list(control_set.filenames)],
                      equal_var=True)
         for probeset in data.index]
    rs[tm.e(tags + (("st", "t"), ("tt", "student ttest")))] = [v[0] for v in d]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "nominal")))] = [v[1] for v in d]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bonf")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(
            rs.loc[:, tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "nominal")))],
            method="bonferroni")[1]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bh")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(
            rs.loc[:, tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "nominal")))],
            method="fdr_bh")[1]

    # diagnostic test for heteroskedasticity
    d = [st.levene(data.loc[probeset, list(exp_set.filenames)],
                   data.loc[probeset, list(control_set.filenames)])
         for probeset in data.index]
    rs[tm.e(tags + (("tt", "levene"), ("st", "pval")))] = [z[1] for z in d]

    # omnibus test for normality (disabled)
    # d = [st.normaltest(data.loc[probeset, list(exp_set.filenames)]) for probeset in data.index]
    # rs[tm.e(tags + (("tt", "d-p omnibus"), ("st", "pval"), ("cg", "exp")))] = [z[1] for z in d]
    # d = [st.normaltest(data.loc[probeset, list(control_set.filenames)]) for probeset in data.index]
    # rs[tm.e(tags + (("tt", "d-p omnibus"), ("st", "pval"), ("cg", "ctrl")))] = [z[1] for z in d]
    return rs
def plotHist():
    TO, wmeanto = makeGaussian(20.74, 1115, 500, plot=False,
                               dir="/Users/george/Dropbox/Astronomy/Oculus/25Oct2016/IMG00069.FIT")[3:]
    SM, wmeansm = makeGaussian(20.64, 710, 885, plot=False,
                               dir="/Users/george/Dropbox/Astronomy/Oculus/25Oct2016/IMG00074.FIT")[3:]
    # per-pixel sky brightnesses (mags/arcsec^2) and the difference of their medians
    sb_TO = 20.74 - 2.5 * np.log10(TO / wmeanto)
    sb_SM = 20.64 - 2.5 * np.log10(SM / wmeansm)
    dif = np.median(sb_TO) - np.median(sb_SM)
    pval = stats.ttest_ind(SM, TO)[1]
    plt.ion()
    plt.clf()
    plt.figure(1)
    data = np.vstack([TO, SM]).T
    plt.xlim(2000, 4000)
    plt.ylim(0, 2000)
    #plt.hist(TO, bins=1000, label='Thacher Observatory', alpha=0.5, color='r')
    #plt.hist(SM, bins=1000, label='Sulfur Mountain', alpha=0.5, color='b')
    plt.axvline(x=rb.mean(TO), color='red', linewidth=2)
    plt.axvline(x=rb.mean(SM), color='red', linewidth=2)
    plt.annotate(r'$dif$=%.2f mags/arcsec' u'\u00B2' % dif, [.01, .93], horizontalalignment='left',
                 xycoords='axes fraction', fontsize='large', backgroundcolor='white')
    plt.annotate(r'$\bar{{\sigma}_T}_O$=%.2f flux/px' % rb.mean(TO), [.01, 0.86],
                 horizontalalignment='left', xycoords='axes fraction', fontsize="large",
                 color='midnightblue')
    plt.annotate(r'$\bar{{\sigma}_S}_M$=%.2f flux/px' % rb.mean(SM), [0.01, 0.79],
                 horizontalalignment='left', xycoords='axes fraction', fontsize="large",
                 color='darkgreen')
    plt.annotate(r'$p-val$=%.2E' % pval, [.01, .72], horizontalalignment='left',
                 xycoords='axes fraction', fontsize='large')
    plt.hist(data, bins=1000, label=['Thacher Observatory (TO)', 'Sulphur Mountain (SM)'],
             alpha=0.5, width=40)
    plt.title("Sky brightness")
    plt.xlabel("Flux Value")
    plt.ylabel("Frequency")
    plt.legend(loc='upper right')
    plt.show()
    # count pixels where the TO sky is not brighter than SM; the original applied
    # np.where to the scalar median difference, which cannot have been the intent
    inds, = np.where(sb_TO - sb_SM <= 0)
    pcent = len(inds)
    # ttest_ind returns the T-statistic ((estimate - hypothesized value) / standard error)
    # and the p-value (probability of the observed result if the null hypothesis is true)
    ttest = stats.ttest_ind(TO, SM)
    print(dif, pcent, ttest)
def runTTest(col, df):
    print("t-Test for %s" % col)
    d1 = df[df[col] == True]["rating"]
    d2 = df[df[col] != True]["rating"]
    print("Number of Samples: (%d, %d)" % (d1.shape[0], d2.shape[0]))
    print("Means: (%f, %f)" % (d1.mean(), d2.mean()))
    print(stats.ttest_ind(d1, d2))
def ttest_PTC_divergence_different(divergence_file, PTC_file, blast_file):
    '''
    (file, file, file) -> None
    Performs a t-test to compare the mean divergence between PTC and non-PTC
    genes, assuming unequal variance, and prints the results on screen
    '''
    divergence_data = PTC_divergence(divergence_file, PTC_file, blast_file)
    divergence = divergence_data[0]
    zero_dS = divergence_data[1]
    omega_greater_1 = divergence_data[2]

    from scipy import stats

    # make lists of divergence values for PTC and non-PTC genes
    dN_PTC = []
    dN_non_PTC = []
    dS_PTC = []
    dS_non_PTC = []
    omega_PTC = []
    omega_non_PTC = []
    for gene in divergence:
        if divergence[gene][-1] == 'yes':
            dN_PTC.append(divergence[gene][0])
            dS_PTC.append(divergence[gene][1])
            omega_PTC.append(divergence[gene][2])
        else:
            dN_non_PTC.append(divergence[gene][0])
            dS_non_PTC.append(divergence[gene][1])
            omega_non_PTC.append(divergence[gene][2])

    dN = stats.ttest_ind(dN_PTC, dN_non_PTC, equal_var=False)
    dS = stats.ttest_ind(dS_PTC, dS_non_PTC, equal_var=False)
    omega = stats.ttest_ind(omega_PTC, omega_non_PTC, equal_var=False)
    dN_ttest = (abs(round(float(dN[0]), 4)), float(dN[1]))
    dS_ttest = (abs(round(float(dS[0]), 4)), float(dS[1]))
    omega_ttest = (abs(round(float(omega[0]), 4)), float(omega[1]))

    # compute mean and standard error
    dN_stats_PTC = compute_mean_std_error(dN_PTC)
    dN_stats_non_PTC = compute_mean_std_error(dN_non_PTC)
    dS_stats_PTC = compute_mean_std_error(dS_PTC)
    dS_stats_non_PTC = compute_mean_std_error(dS_non_PTC)
    omega_stats_PTC = compute_mean_std_error(omega_PTC)
    omega_stats_non_PTC = compute_mean_std_error(omega_non_PTC)

    print('dN: t-test =\t {0},\t p-value =\t {1}'.format(dN_ttest[0], dN_ttest[1]))
    print('dS: t-test =\t {0},\t p-value =\t {1}'.format(dS_ttest[0], dS_ttest[1]))
    print('dN/dS: t-test =\t {0},\t p-value =\t {1}'.format(omega_ttest[0], omega_ttest[1]))
    print('PTC: mean dN =\t %6.4f,\t standard error =\t %6.4f' % dN_stats_PTC)
    print('nonPTC: mean dN =\t %6.4f,\t standard error =\t %6.4f' % dN_stats_non_PTC)
    print('PTC: mean dS =\t %6.4f,\t standard error =\t %6.4f' % dS_stats_PTC)
    print('nonPTC: mean dS =\t %6.4f,\t standard error =\t %6.4f' % dS_stats_non_PTC)
    print('PTC: mean dN/dS =\t %6.4f,\t standard error =\t %6.4f' % omega_stats_PTC)
    print('nonPTC: mean dN/dS =\t %6.4f,\t standard error =\t %6.4f' % omega_stats_non_PTC)
    print('{0}\t genes with dS = 0 were excluded'.format(zero_dS))
    print('{0}\t genes with dN/dS > 1 were excluded'.format(omega_greater_1))
def runCompare(self, objId, labelToAdd, expression1, expression2):
    fh = open(self._getPath("report.txt"), 'w')
    self.experiment = self.readExperiment(self.inputExperiment.get().fnPKPD)
    x1 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression1.get(), self.labelToCompare.get())]
    x2 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression2.get(), self.labelToCompare.get())]
    self.doublePrint(fh, "Values in SubGroup 1: %s" % str(x1))
    self.doublePrint(fh, "Values in SubGroup 2: %s" % str(x2))
    self.doublePrint(fh, "Testing H0: mu1=mu2")
    self.doublePrint(fh, " ")

    # equal_var must be passed by keyword: the third positional argument of
    # ttest_ind is axis, so the original ttest_ind(x1, x2, False) silently set
    # axis instead of requesting Welch's test
    try:
        [t, pval] = stats.ttest_ind(np.asarray(x1, np.double), np.asarray(x2, np.double), equal_var=True)
        self.doublePrint(fh, "T-test two independent samples (same variance): t-statistic=%f p-value=%f" % (t, pval))
    except Exception:
        pass

    try:
        [t, pval] = stats.ttest_ind(x1, x2, equal_var=False)
        self.doublePrint(fh, "T-test two independent samples (different variance, Welch's test): t-statistic=%f p-value=%f" % (t, pval))
    except Exception:
        pass

    try:
        [u, pval] = stats.mannwhitneyu(x1, x2, True)
        self.doublePrint(fh, "Mann-Whitney U test for two independent samples: u-statistic=%f p-value=%f" % (u, pval))
    except Exception:
        pass

    fh.close()
def pandas_boxplot():
    '''Example from Altman "Practical statistics for medical research"'''
    # Get the data
    inFile = 'altman_94.txt'
    url_base = 'https://raw.github.com/thomas-haslwanter/statsintro/master/Data/data_altman/'
    url = url_base + inFile
    data = np.genfromtxt(urlopen(url), delimiter=',')

    # Group them into "lean" and "obese"
    lean = pd.Series(data[data[:, 1] == 1, 0])
    obese = pd.Series(data[data[:, 1] == 0, 0])

    # Combine them into a pandas DataFrame
    df = pd.DataFrame({'lean': lean, 'obese': obese})

    # Calculate the mean value for each group
    print(df.mean())

    # Show a boxplot
    df.boxplot()
    plt.show()

    # Perform a t-test between "lean" and "obese" subjects and report the result
    # (the original discarded the return value)
    print(stats.ttest_ind(lean, obese))
from collections import defaultdict
from random import shuffle

import numpy as np
from scipy.stats import ttest_ind


def check_distance_pvals(dist_dict, group_dict, group_frac=0.5, nreps=500):
    groups = sorted(set(group_dict.values()))
    assert len(groups) == 2
    group_vals = defaultdict(list)
    for (key1, key2), dist in dist_dict.items():
        if group_dict[key1] == group_dict[key2]:
            group_vals[group_dict[key1]].append(dist)
    assert len(group_vals) == 2

    _, raw_pval = ttest_ind(*group_vals.values())

    # subsampling correction: repeatedly re-test on random subsets of each group
    nitems = int(group_frac * min(map(len, group_vals.values())))
    cor_vals = []
    for _ in range(nreps):
        for items in group_vals.values():
            shuffle(items)
        _, pval = ttest_ind(*[items[:nitems] for items in group_vals.values()])
        cor_vals.append(pval)

    odict = {
        'RawPval': raw_pval,
        'AdjPval': np.mean(cor_vals),
        'Group1Name': groups[0],
        'Group2Name': groups[1],
        'Group1Mean': np.mean(group_vals[groups[0]]),
        'Group2Mean': np.mean(group_vals[groups[1]]),
        'Group1Std': np.std(group_vals[groups[0]]),
        'Group2Std': np.std(group_vals[groups[1]]),
    }
    return odict
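# Hedged usage sketch for check_distance_pvals: a small synthetic distance
# dictionary over two groups (all names and values are made up).
import itertools
import random

random.seed(0)
members = {'G1': ['a%d' % i for i in range(5)], 'G2': ['b%d' % i for i in range(5)]}
group_dict = {m: g for g, ms in members.items() for m in ms}
dist_dict = {}
for g, mu in (('G1', 1.0), ('G2', 2.0)):
    # within-group pairwise distances; between-group pairs are ignored anyway
    for k1, k2 in itertools.combinations(members[g], 2):
        dist_dict[(k1, k2)] = random.gauss(mu, 0.2)
print(check_distance_pvals(dist_dict, group_dict, nreps=50))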
def test_vs_nonmasked(self):
    np.random.seed(1234567)
    outcome = np.random.randn(20, 4) + [0, 0, 1, 2]

    # 1-D inputs
    res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1])
    res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1])
    assert_allclose(res1, res2)

    # 2-D inputs
    res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
    res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
    assert_allclose(res1, res2)
    res1 = stats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
    res2 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
    assert_allclose(res1, res2)

    # Check default is axis=0
    res3 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:])
    assert_allclose(res2, res3)

    # Check equal_var
    res4 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=True)
    res5 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=True)
    assert_allclose(res4, res5)
    res4 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=False)
    res5 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=False)
    assert_allclose(res4, res5)
def test(self, arr1, arr2):
    p_value = 0
    if self.statistics == "auto":
        # Levene's test for equality of variances. If variances are equal,
        if stats.levene(arr1, arr2)[1] > 0.05:
            # Shapiro-Wilk test for normality. If both samples are normal,
            if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                # p = Student
                p_value = stats.ttest_ind(arr1, arr2)[1]
            else:
                # p = Mann-Whitney
                if equal(arr1, arr2):
                    p_value = 1
                else:
                    p_value = stats.mannwhitneyu(arr1, arr2)[1]
        else:
            # unequal variances: Welch's t-test (equal_var must be a keyword
            # argument; passing False positionally sets axis instead)
            p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    elif self.statistics == "student":
        p_value = stats.ttest_ind(arr1, arr2)[1]
    elif self.statistics == "welch":
        p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    elif self.statistics == "mann":
        if equal(arr1, arr2):
            p_value = 1
        else:
            p_value = stats.mannwhitneyu(arr1, arr2)[1]
    return p_value
def t_value(self, data, matched_tr_ind_list, matched_co_ind_list, treat_col_name):
    """Compute t-values for matched and unmatched samples."""
    # .ix is deprecated; .loc selects by label / boolean mask
    ps_tr, ps_co = data.loc[matched_tr_ind_list], data.loc[matched_co_ind_list]
    rand_tr, rand_co = data.loc[data[treat_col_name] == 1], data.loc[data[treat_col_name] == 0]
    del ps_tr[treat_col_name]
    del ps_co[treat_col_name]
    del rand_tr[treat_col_name]
    del rand_co[treat_col_name]
    ps_t_val_dict, rand_t_val_dict = {}, {}
    for column in ps_tr.columns:
        # t-value after matching on the propensity score
        ps_tr_array, ps_co_array = numpy.array(ps_tr[column]), numpy.array(ps_co[column])
        ps_t_val = stats.ttest_ind(ps_tr_array, ps_co_array)[0]
        ps_t_val_dict.update({column: ps_t_val})
        # t-value without matching
        rand_tr_array, rand_co_array = numpy.array(rand_tr[column]), numpy.array(rand_co[column])
        rand_t_val = stats.ttest_ind(rand_tr_array, rand_co_array)[0]
        rand_t_val_dict.update({column: rand_t_val})
    return ps_t_val_dict, rand_t_val_dict
def read_result2():
    result1 = pickle.load(open('data/result/lab2/case1.obj', 'rb'))
    result2 = pickle.load(open('data/result/lab2/case2.obj', 'rb'))
    result3 = pickle.load(open('data/result/lab2/case3.obj', 'rb'))
    print(np.mean(result1), np.var(result1))
    print(np.mean(result2), np.var(result2), stats.ttest_ind(result1, result2))
    print(np.mean(result3), np.var(result3), stats.ttest_ind(result2, result3))
def significant(array1, array2):
    try:
        arr1 = np.array(array1)
        arr2 = np.array(array2)
        print(stats.ttest_ind(arr1, arr2)[1])
        return stats.ttest_ind(arr1, arr2)[1] < 0.1
    except Exception:
        print('PROBLEM!')
        return None
def do_t_tests():
    two_sample = stats.ttest_ind(sec_rating, not_rating)
    print("The t-statistic is %.3f and the p-value is %.3f." % two_sample)

    # assuming unequal population variances: the original repeated the same call,
    # so the "unequal variances" result was identical; equal_var=False is required
    two_sample_diff_var = stats.ttest_ind(sec_rating, not_rating, equal_var=False)
    print("If we assume unequal variances then the t-statistic is %.3f and the p-value is %.3f."
          % two_sample_diff_var)
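# Quick sanity check on synthetic data that equal_var=False actually switches
# scipy to Welch's t-test: with very unequal variances and sample sizes the
# two p-values diverge noticeably. Everything here is made up for illustration.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
g1 = rng.normal(0.0, 1.0, size=10)
g2 = rng.normal(0.5, 5.0, size=100)
print(stats.ttest_ind(g1, g2))                   # pooled-variance Student's t
print(stats.ttest_ind(g1, g2, equal_var=False))  # Welch's t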
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("top_data_dir")
    parser.add_argument("performance_dir")
    parser.add_argument("--basemodel", "-bm", default="raw_f2exp")
    parser.add_argument("--expansion_model", "-em", default="raw_f2exp_fbDocs:10")
    parser.add_argument("--threshold", "-t", type=float, default=0.6)
    args = parser.parse_args()

    datasets = load_data(args.top_data_dir)
    for collection_name in datasets:
        for year in datasets[collection_name].years:
            base = []
            expansion = []
            probs = []
            performance_file = os.path.join(args.performance_dir, year.name)
            performances = json.load(open(performance_file))
            print("for year %s" % year)
            for day in datasets[collection_name]._prediction[year]:
                if year == Year.y2016:
                    day_string = "201608%s" % day.zfill(2)
                elif year == Year.y2015:
                    day_string = "201507%s" % day.zfill(2)
                elif year == Year.y2017:
                    if int(day) < 10:
                        day_string = "201708%s" % day.zfill(2)
                    else:
                        day_string = "201707%s" % day.zfill(2)
                else:
                    raise NotImplementedError("year %s is not implemented!" % year.name)
                for qid in datasets[collection_name]._prediction[year][day]:
                    try:
                        base_performance = performances[args.basemodel][qid][day_string]
                    except KeyError:
                        continue
                    else:
                        if datasets[collection_name].is_silent_day(year, day, qid):
                            base.append(.0)
                            expansion.append(.0)
                        else:
                            base.append(base_performance)
                            query_day_prob = datasets[collection_name].get_prob(year, day, qid)
                            probs.append(query_day_prob)
                            if query_day_prob > args.threshold:
                                expansion.append(base_performance)
                            else:
                                expansion.append(performances[args.expansion_model][qid][day_string])
            print("There are %d pairs" % len(base))
            base_avg = sum(base) / len(base)
            expansion_avg = sum(expansion) / len(expansion)
            print("%s: %f, %s: %f" % (args.basemodel, base_avg, args.expansion_model, expansion_avg))
            # print(probs)
            print(ttest_ind(base, expansion))
            print("-" * 20)
def motifStatsRandMarix(data, motifSize=3, degree=10, usetotal=False):
    """Outputs text file with stats on the motifs in data"""
    filename = "result/t_test_Deg-{0}_Size-{1}.txt".format(degree, motifSize)
    with open(filename, 'w') as f:
        f.write("Student's T test comparing both AD/NL/MCI to Random.\n\n")
        for corr in ['corr']:
            title = "P-values for " + corr + " data set compared to random generated graphs\n"
            f.write(title)

            data[('MCIR', corr)] = genRandomGraphs(data[('MCI', corr)], degree, len(data[('MCI', corr)]))
            data[('ADR', corr)] = genRandomGraphs(data[('AD', corr)], degree, len(data[('AD', corr)]))
            data[('NLR', corr)] = genRandomGraphs(data[('NL', corr)], degree, len(data[('NL', corr)]))

            motifsNL = findMotifs(data, ('NL', corr), motifSize, degree, usetotal)
            motifsMCI = findMotifs(data, ('MCI', corr), motifSize, degree, usetotal)
            motifsAD = findMotifs(data, ('AD', corr), motifSize, degree, usetotal)
            motifsNLR = findMotifs(data, ('NLR', corr), motifSize, degree, usetotal)
            motifsMCIR = findMotifs(data, ('MCIR', corr), motifSize, degree, usetotal)
            motifsADR = findMotifs(data, ('ADR', corr), motifSize, degree, usetotal)

            allMotifs = list(set(motifsNL.keys()) & set(motifsAD.keys()) & set(motifsMCI.keys())
                             & set(motifsNLR.keys()) & set(motifsADR.keys()) & set(motifsMCIR.keys()))
            allMotifs.sort()

            f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}{6:>15}{7:>15}{8:>15}{9:>15}\n".format(
                'MOTIF ID', 'NL', 'MCI', 'AD', 'NLR Mean', 'MCIR Mean', 'ADR Mean',
                'NLR Std', 'MCIR Std', 'ADR Std'))

            motifStats = []
            for key in allMotifs:
                tMCI, probMCI = stats.ttest_ind(motifsMCI[key], motifsMCIR[key])
                tAD, probAD = stats.ttest_ind(motifsAD[key], motifsADR[key])
                tNL, probNL = stats.ttest_ind(motifsNL[key], motifsNLR[key])
                motifStats.append((key, probNL, probMCI, probAD))

            # sort by the smallest p-value; min(x) would wrongly include the motif ID
            motifStats.sort(key=lambda x: min(x[1:]))

            for key, probNL, probMCI, probAD in motifStats:
                normRMean = motifsNLR[key].mean()
                mciRMean = motifsMCIR[key].mean()
                adRMean = motifsADR[key].mean()
                normRVar = motifsNLR[key].std()
                mciRVar = motifsMCIR[key].std()
                adRVar = motifsADR[key].std()
                if probMCI < 0.01 or probAD < 0.01 or probNL < 0.01:
                    star = "**"
                elif probMCI < 0.1 or probAD < 0.1 or probNL < 0.1:
                    star = "*"
                else:
                    star = ""
                line = (star + "{0:>" + str(10 - len(star))
                        + "}{1:>15.3}{2:>15.3}{3:>15.3}{4:>15.3}{5:>15.3}{6:>15.3}{7:>15.3}{8:>15.3}{9:>15.3}\n")
                f.write(line.format(str(int(key)), probNL, probMCI, probAD,
                                    normRMean, mciRMean, adRMean, normRVar, mciRVar, adRVar))
            f.write("\n\n")
def generate_sequence_gene_expression_statistics(self, show_species_charts=True, show_chart=True):
    i = -1
    if self.multiple_networks:
        for nw_ge_file in glob.glob(self.output_silix_nw_exp_data_folder_path + '/*.txt'):
            i += 1
            mapping_data = np.genfromtxt(nw_ge_file, delimiter=',', dtype=str)
            if len(mapping_data) > 0:
                print('Network: ', i, mapping_data.shape)
                x = np.array(mapping_data[:, 2], dtype=float)
                y = np.array(mapping_data[:, 3], dtype=float)
                ca_stat = ca_pvalue = spike_stat = spike_pvalue = ind_stat = ind_pvalue = 0
                if not np.all(x == 0):
                    ca_stat, ca_pvalue = stats.ttest_1samp(x[x != 0], 0)
                    spike_stat, spike_pvalue = stats.ttest_1samp(y[y != 0], 0)
                    ind_stat, ind_pvalue = stats.ttest_ind(x[x != 0], y[y != 0], equal_var=False)
                nw_number = int(re.findall(r'\d+', nw_ge_file)[0])
                nw_statistics = [nw_number,
                                 x[x != 0].mean(), x[x != 0].var(), x[x != 0].std(),
                                 y[y != 0].mean(), y[y != 0].var(), y[y != 0].std(),
                                 ca_stat, ca_pvalue, spike_stat, spike_pvalue, ind_stat, ind_pvalue]
                self.network_gene_expressions.append(nw_statistics)
    else:
        mapping_data = np.genfromtxt(self.output_silix_nw_exp_data_folder_path + self.silix_nw_exp_data_filename,
                                     delimiter=',', dtype=str)
        if len(mapping_data) > 0:
            print('Network: ', mapping_data.shape)
            x = np.array(mapping_data[:, 2], dtype=float)
            y = np.array(mapping_data[:, 3], dtype=float)
            ca_stat = ca_pvalue = spike_stat = spike_pvalue = ind_stat = ind_pvalue = 0
            if not np.all(x == 0):
                ca_stat, ca_pvalue = stats.ttest_1samp(x[x != 0], 0)
                spike_stat, spike_pvalue = stats.ttest_1samp(y[y != 0], 0)
                ind_stat, ind_pvalue = stats.ttest_ind(x[x != 0], y[y != 0], equal_var=False)
            nw_statistics = [0,
                             x[x != 0].mean(), x[x != 0].var(), x[x != 0].std(),
                             y[y != 0].mean(), y[y != 0].var(), y[y != 0].std(),
                             ca_stat, ca_pvalue, spike_stat, spike_pvalue, ind_stat, ind_pvalue]
            self.network_gene_expressions.append(nw_statistics)

    # convert list into array
    self.network_gene_expressions = np.asarray(self.network_gene_expressions)

    # save network gene expression statistics to a csv file
    gene_expression_statistics_file = self.output_silix_nw_exp_data_folder_path + 'gene_expression_statistics.csv'
    with open(gene_expression_statistics_file, 'w') as f_handle:
        f_handle.write('Network, 9mM CA Mean, 9mM CA Var, 9mM CA SD, Spike Mean, Spike Var, Spike SD, '
                       '9mM CA ttest-stat, 9mM CA ttest-pvalue, Spike ttest-stat, Spike ttest-pvalue, '
                       'Ind ttest-stat, Ind ttest-pvalue \n')
        np.savetxt(f_handle, self.network_gene_expressions, delimiter=',')

    if show_species_charts:
        self.generate_species_wise_gene_expression_statistics()
    if self.multiple_networks and show_chart:
        self.plot_all_nw_gene_expr_stats_chart()
    elif show_chart and not self.multiple_networks:
        self.plot_single_network_gene_expr_stats_chart()
def ttest(exp1, exp2, exp3):
    print("=== exp1 vs exp2")
    ts, pvalue = stats.ttest_ind(exp1, exp2, equal_var=False)
    print("p value =", pvalue)
    print("=== exp1 vs exp3")
    ts, pvalue = stats.ttest_ind(exp1, exp3, equal_var=False)
    print("p value =", pvalue)
    print("=== exp2 vs exp3")
    ts, pvalue = stats.ttest_ind(exp2, exp3, equal_var=False)
    print("p value =", pvalue)
def motifStats(data, motifSize=3, degree=10, usetotal=False):
    """Outputs text file with stats on the motifs in data"""
    filename = "result/t_test_Deg-{0}_Size-{1}.txt".format(degree, motifSize)
    with open(filename, 'w') as f:
        f.write("Student's T test comparing both MCI and AD to NL.\n\n")
        for corr in ('corr', 'lcorr', 'lacorr'):
            title = "P-values for " + corr + " data set compared to normal patients\n"
            f.write(title)

            motifsNL = findMotifs(data, ('NL', corr), motifSize, degree, usetotal)
            motifsMCI = findMotifs(data, ('MCI', corr), motifSize, degree, usetotal)
            motifsAD = findMotifs(data, ('AD', corr), motifSize, degree, usetotal)

            allMotifs = list(set(motifsNL.keys()) & set(motifsAD.keys()) & set(motifsMCI.keys()))

            f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}{6:>15}{7:>15}{8:>15}\n".format(
                'MOTIF ID', 'MCI', 'AD', 'Norm Mean', 'MCI Mean', 'AD Mean',
                'NORM Std', 'MCI Std', 'AD Std'))

            motifStats = []
            for key in allMotifs:
                tMCI, probMCI = stats.ttest_ind(motifsMCI[key], motifsNL[key])
                tAD, probAD = stats.ttest_ind(motifsAD[key], motifsNL[key])
                motifStats.append((key, probMCI, probAD))

            # sort by the smallest p-value; min(x) would wrongly include the motif ID
            motifStats.sort(key=lambda x: min(x[1:]))

            for key, probMCI, probAD in motifStats:
                normMean = motifsNL[key].mean()
                mciMean = motifsMCI[key].mean()
                adMean = motifsAD[key].mean()
                normVar = motifsNL[key].std()
                mciVar = motifsMCI[key].std()
                adVar = motifsAD[key].std()
                if probMCI < 0.01 or probAD < 0.01:
                    star = "**"
                elif probMCI < 0.1 or probAD < 0.1:
                    star = "*"
                else:
                    star = ""
                line = (star + "{0:>" + str(10 - len(star))
                        + "}{1:>15.3}{2:>15.3}{3:>15.3}{4:>15.3}{5:>15.3}{6:>15.3}{7:>15.3}{8:>15.3}\n")
                f.write(line.format(str(int(key)), probMCI, probAD,
                                    normMean, mciMean, adMean, normVar, mciVar, adVar))
            f.write("\n\n")
def return_test_results(self, arr1, arr2):
    test_name = ""
    p_value = 0
    t_value = 0
    levene = stats.levene(arr1, arr2)[1]
    if self.statistics == "auto":
        # Levene's test for equality of variances. If variances are equal,
        if levene > 0.05:
            # Shapiro-Wilk test for normality. If both samples are normal,
            if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                # p = Student
                test_name = "Student"
                result = stats.ttest_ind(arr1, arr2)
                t_value = result[0]
                p_value = result[1]
            else:
                # p = Mann-Whitney
                test_name = "Mann"
                if equal(arr1, arr2):
                    t_value = None
                    p_value = 1
                else:
                    result = stats.mannwhitneyu(arr1, arr2)
                    t_value = result[0]
                    p_value = result[1]
        else:
            test_name = "Welch"
            # equal_var must be a keyword argument; passing False positionally sets axis
            result = stats.ttest_ind(arr1, arr2, equal_var=False)
            t_value = result[0]
            p_value = result[1]
    elif self.statistics == "student":
        test_name = "Student"
        result = stats.ttest_ind(arr1, arr2)
        t_value = result[0]
        p_value = result[1]
    elif self.statistics == "welch":
        test_name = "Welch"
        result = stats.ttest_ind(arr1, arr2, equal_var=False)
        t_value = result[0]
        p_value = result[1]
    elif self.statistics == "mann":
        test_name = "Mann"
        if equal(arr1, arr2):
            t_value = None
            p_value = 1
        else:
            result = stats.mannwhitneyu(arr1, arr2)
            t_value = result[0]
            p_value = result[1]
    df = len(arr1) + len(arr2) - 2
    return [test_name, t_value, p_value, df, levene]
def read_result():
    fsc0 = pickle.load(open('data/result/fsc1.obj', 'rb'))
    fsc25 = pickle.load(open('data/result/fsc2.obj', 'rb'))
    fsc50 = pickle.load(open('data/result/fsc3.obj', 'rb'))
    fsc75 = pickle.load(open('data/result/fsc4.obj', 'rb'))
    fsc100 = pickle.load(open('data/result/fsc5.obj', 'rb'))
    print(np.mean(fsc0), np.var(fsc0))
    print(np.mean(fsc25), np.var(fsc25), stats.ttest_ind(fsc0, fsc25))
    print(np.mean(fsc50), np.var(fsc50), stats.ttest_ind(fsc25, fsc50))
    print(np.mean(fsc75), np.var(fsc75), stats.ttest_ind(fsc50, fsc75))
    # the original printed np.var(fsc75) here, a copy-paste slip
    print(np.mean(fsc100), np.var(fsc100), stats.ttest_ind(fsc75, fsc100))
def table_post_processing(results_collector):
    stats_collector = []
    print('\n Summary of the analysis:')
    for line in results_collector:
        class_name, _time_stamps = classify(line[0])
        stats_collector.append([class_name, _time_stamps, line[6], line[7]])
    stats_collector = np.array(stats_collector)
    for class_name in classes:
        class_set_filter = stats_collector[:, 0] == class_name
        if any(class_set_filter):
            class_set = stats_collector[class_set_filter, :]
            print(class_name)
            final_stats_collector_x = []
            final_stats_collector_y = []
            final_stats_collector_e = []
            raw_times = {}
            for time_stamp in time_stamp_coll:
                time_stamp_filter = class_set[:, 1] == time_stamp
                if any(time_stamp_filter):
                    time_stamp_set = class_set[time_stamp_filter, :]
                    mean = np.nanmean(time_stamp_set[:, 2].astype(np.float64))
                    # 95% confidence half-width of the mean
                    err = np.nanstd(time_stamp_set[:, 2].astype(np.float64)) / \
                        np.sqrt(len(time_stamp_set[:, 2])) * 1.96
                    raw_times[time_stamp] = rm_nans(time_stamp_set[:, 2].astype(np.float64))
                    print('\t time: %s, mean: %s, err: %s' % (time_stamp, mean, err))
                    final_stats_collector_x.append(time_stamps[time_stamp])
                    final_stats_collector_y.append(mean)
                    final_stats_collector_e.append(err)
            time_translator = dict([(item, i) for i, item in enumerate(sorted(raw_times.keys()))])
            samples_n = len(raw_times.keys())
            print(samples_n)
            # allocate the pairwise p-value matrix; np.array((n, n)) only built a 2-vector
            p_val_array = np.zeros((samples_n, samples_n))
            for time1, time2 in combinations(sorted(raw_times.keys()), 2):
                print(time1, time2)
                print(time_translator[time1], time_translator[time2])
                print(ttest_ind(raw_times[time1], raw_times[time2]))
                _, p_val = ttest_ind(raw_times[time1], raw_times[time2])
                p_val_array[time_translator[time1], time_translator[time2]] = p_val
            print(p_val_array)
            plt.errorbar(final_stats_collector_x, final_stats_collector_y,
                         final_stats_collector_e, label=class_name)
    plt.legend()
    plt.show()
def determine_significance(mesa1, mesa2):
    """
    Determines if two sets of values are statistically significant.

    In the best case, we can determine a normal distribution and equal
    variance. Once determined, we can use the independent t-test function if
    the values are of equal variance. If we have normal data but the variance
    is unequal, Welch's t-test is used.

    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    In the case where we cannot determine normality, the Mann-Whitney u-test
    is preferred, but this test is only effective when there are more than 20
    samples.

    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal Non_normal Unknown')
    normality = Distribution.Normal
    try:
        k2, normal = stats.normaltest(mesa1)
        # FIXME: Unhardcode
        if normal < NORMAL_CI:
            normality = Distribution.Non_normal
        k2, normal = stats.normaltest(mesa2)
        if normal < NORMAL_CI:
            normality = Distribution.Non_normal
    except ValueError:
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, normality == Distribution.Normal, "t-test" if equal_variance else "Welch's")
    elif args.mannwhitney:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        return (p, len(mesa1) < 20 or len(mesa2) < 20, "Mann-Whitney")

    if normality == Distribution.Normal:
        error_handler = 'raise'
        if np.var(mesa1) == 0 and equal_variance:
            error_handler = 'ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, "t-test" if equal_variance else "Welch's")
    else:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")
def main():
    f27_scan = open('sim_scan27.txt', 'r')
    f27_table = open('sim_table27.txt', 'r')
    f35932_scan = open('sim_scan35932.txt', 'r')
    f35932_table = open('sim_table35932.txt', 'r')

    ntests = 10
    scan27 = [0 for i in range(ntests)]
    table27 = [0 for i in range(ntests)]
    scan35932 = [0 for i in range(ntests)]
    table35932 = [0 for i in range(ntests)]

    files = [f27_scan, f27_table, f35932_scan, f35932_table]
    arrs = [scan27, table27, scan35932, table35932]
    for i in range(ntests):
        for j in range(4):
            line = files[j].readline()
            if line[len(line) - 1] == '\n':
                line = line[:len(line) - 1]
            arrs[j][i] = float(line)
    for j in range(4):
        files[j].close()

    _, p27 = stats.ttest_ind(scan27, table27, equal_var=False)
    mean27scan = stats.tmean(scan27)
    mean27table = stats.tmean(table27)
    var27scan = stats.tvar(scan27)
    var27table = stats.tvar(table27)

    _, p35932 = stats.ttest_ind(scan35932, table35932, equal_var=False)
    mean35932scan = stats.tmean(scan35932)
    mean35932table = stats.tmean(table35932)
    var35932scan = stats.tvar(scan35932)
    var35932table = stats.tvar(table35932)

    f = open('sim_results_compare_scan_table.txt', 'w')
    f.write('27\n')
    f.write('scan mean: ' + str(mean27scan) + '\n')
    f.write('scan var: ' + str(var27scan) + '\n')
    f.write('table mean: ' + str(mean27table) + '\n')
    f.write('table var: ' + str(var27table) + '\n')
    f.write('p-value: ' + str(p27) + '\n\n')
    f.write('35932\n')
    f.write('scan mean: ' + str(mean35932scan) + '\n')
    f.write('scan var: ' + str(var35932scan) + '\n')
    f.write('table mean: ' + str(mean35932table) + '\n')
    f.write('table var: ' + str(var35932table) + '\n')
    f.write('p-value: ' + str(p35932) + '\n')
    f.close()
def main(pkl_list, name_list, cut=sys.maxsize):
    pickles = plot_util.load_pickles(name_list, pkl_list)
    best_dict, idx_dict, keys = plot_util.get_best_dict(name_list, pickles, cut=cut)

    for k in keys:
        sys.stdout.write("%10s: %s experiment(s)\n" % (k, len(best_dict[k])))

    sys.stdout.write("Unpaired t-tests-----------------------------------------------------\n")
    # TODO: replace by itertools
    for idx, k in enumerate(keys):
        if len(keys) > 1:
            for j in keys[idx + 1:]:
                t_true, p_true = stats.ttest_ind(best_dict[k], best_dict[j])
                rounded_t_true, rounded_p_true = stats.ttest_ind(numpy.round(best_dict[k], 3),
                                                                 numpy.round(best_dict[j], 3))
                sys.stdout.write("%10s vs %10s\n" % (k, j))
                sys.stdout.write("Standard independent 2 sample test, equal population variance\n")
                sys.stdout.write(" " * 24 + " T: %10.5e, p-value: %10.5e (%5.3f%%) \n" %
                                 (t_true, p_true, p_true * 100))
                sys.stdout.write("Rounded: ")
                sys.stdout.write(" T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                 (rounded_t_true, rounded_p_true, rounded_p_true * 100))
                if tuple(map(int, scipy.__version__.split("."))) >= (0, 11, 0):
                    t_false, p_false = stats.ttest_ind(best_dict[k], best_dict[j], equal_var=False)
                    rounded_t_false, rounded_p_false = stats.ttest_ind(numpy.round(best_dict[k], 3),
                                                                       numpy.round(best_dict[j], 3),
                                                                       equal_var=False)
                    sys.stdout.write("Welch's t-test, no equal population variance\n")
                    sys.stdout.write(" " * 24)
                    sys.stdout.write(": T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                     (t_false, p_false, p_false * 100))
                    sys.stdout.write("Rounded: ")
                    sys.stdout.write(": T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                     (rounded_t_false, rounded_p_false, rounded_p_false * 100))
                sys.stdout.write("\n")

    sys.stdout.write("Best Value-----------------------------------------------------------\n")
    for k in keys:
        sys.stdout.write("%10s: %10.5f (min: %10.5f, max: %10.5f, std: %5.3f)\n" %
                         (k, float(numpy.mean(best_dict[k])), float(numpy.min(best_dict[k])),
                          numpy.max(best_dict[k]), float(numpy.std(best_dict[k]))))

    sys.stdout.write("Needed Trials--------------------------------------------------------\n")
    for k in keys:
        sys.stdout.write("%10s: %10.5f (min: %10.5f, max: %10.5f, std: %5.3f)\n" %
                         (k, float(numpy.mean(idx_dict[k])), float(numpy.min(idx_dict[k])),
                          numpy.max(idx_dict[k]), float(numpy.std(idx_dict[k]))))

    sys.stdout.write("------------------------------------------------------------------------\n")
def runCompare(self, objId1, objId2, label1, label2, paired, expression1, expression2):
    fh = open(self._getPath("report.txt"), 'w')
    self.experiment1 = self.readExperiment(self.inputExperiment1.get().fnPKPD)
    self.experiment2 = self.readExperiment(self.inputExperiment2.get().fnPKPD)
    label2ToUse = self.label1.get() if self.label2.get() == "" else self.label2.get()
    if self.paired:
        x1 = []
        x2 = []
        for sampleName, sample in self.experiment1.getSubGroup(self.expression1.get()).items():
            x1.append(float(sample.descriptors[self.label1.get()]))
            if sampleName in self.experiment2.samples:
                x2.append(float(self.experiment2.samples[sampleName].descriptors[label2ToUse]))
            else:
                # raising a string literal is invalid; raise a proper exception
                raise Exception("Cannot find sample %s in Experiment 2" % sample.sampleName)
    else:
        expression2ToUse = self.expression1.get() if self.expression2.get() == "" else self.expression2.get()
        x1 = [float(x) for x in self.experiment1.getSubGroupLabels(self.expression1.get(), self.label1.get())]
        x2 = [float(x) for x in self.experiment2.getSubGroupLabels(expression2ToUse, label2ToUse)]

    self.doublePrint(fh, "Values in SubGroup 1: %s" % str(x1))
    self.doublePrint(fh, "Values in SubGroup 2: %s" % str(x2))
    self.doublePrint(fh, "Testing H0: mu1=mu2")
    self.doublePrint(fh, " ")

    try:
        if self.paired:
            [t, pval] = stats.ttest_rel(np.asarray(x1, np.double), np.asarray(x2, np.double))
            self.doublePrint(fh, "T-test two paired samples: t-statistic=%f p-value=%f" % (t, pval))
        else:
            # equal_var must be a keyword argument; the third positional argument is axis
            [t, pval] = stats.ttest_ind(np.asarray(x1, np.double), np.asarray(x2, np.double),
                                        equal_var=True)
            self.doublePrint(fh, "T-test two independent samples (same variance): t-statistic=%f p-value=%f" % (t, pval))
    except Exception:
        pass

    if not self.paired:
        try:
            [t, pval] = stats.ttest_ind(x1, x2, equal_var=False)
            self.doublePrint(fh, "T-test two independent samples (different variance, Welch's test): t-statistic=%f p-value=%f" % (t, pval))
        except Exception:
            pass

    try:
        if self.paired:
            [w, pval] = stats.wilcoxon(x1, x2, correction=True)
            self.doublePrint(fh, "Wilcoxon signed rank test for two paired samples: w-statistic=%f p-value=%f" % (w, pval))
        else:
            [u, pval] = stats.mannwhitneyu(x1, x2, True)
            self.doublePrint(fh, "Mann-Whitney U test for two independent samples: u-statistic=%f p-value=%f" % (u, pval))
    except Exception:
        pass

    fh.close()
def detect_hemizygous_markers(pop_file, coverage_file, nb_sample_required=0):
    sample2pop, pop2sample = read_pop_file(pop_file)
    if len(pop2sample) != 2:
        logging.critical('Hemizygous markers can only be searched between two sets of samples. '
                         'Edit your population file to have two populations')
        return -1
    pop1, pop2 = pop2sample.keys()
    samples_pop1 = pop2sample.get(pop1)
    samples_pop2 = pop2sample.get(pop2)
    all_markers, all_samples, all_samples_to_norm_coverage = get_normalize_coverage(coverage_file,
                                                                                    nb_sample_required)
    sample_errors = set(sample2pop.keys()).difference(set(all_samples))
    if len(sample_errors) > 0:
        logging.critical('%s samples (%s) from the population file not found in the coverage file' %
                         (len(sample_errors), ', '.join(sample_errors)))
        return -2
    header = ["#consensus", "mean_%s" % pop1, "mean_%s" % pop2, "fold_change",
              "t_test_%s_eq_2X_%s" % (pop1, pop2), "t_test_%s_eq_%s" % (pop1, pop2)]
    all_lines = [' '.join(header)]
    for i, marker in enumerate(all_markers):
        out = [marker]
        pop1_values = []
        pop2_values = []
        for sample in samples_pop1:
            pop1_values.append(all_samples_to_norm_coverage.get(sample)[i])
        for sample in samples_pop2:
            pop2_values.append(all_samples_to_norm_coverage.get(sample)[i])
        pop1_nvalues = numpy.array(pop1_values)
        pop1_nvalues_2 = pop1_nvalues * 2
        pop2_nvalues = numpy.array(pop2_values)
        # compare the normalized values; for a hemizygous marker they should not be equal
        t_stat_eq, pvalue_eq = stats.ttest_ind(pop1_nvalues, pop2_nvalues)
        # compare pop1 values doubled with pop2 values; these should be equal
        t_stat_2X, pvalue_2X = stats.ttest_ind(pop1_nvalues_2, pop2_nvalues)
        fold = pop2_nvalues.mean() / pop1_nvalues.mean()
        if pvalue_eq < .05 and pvalue_2X > 0.5 and 1.8 < fold < 2.2:
            out.append(str(pop1_nvalues.mean()))
            out.append(str(pop2_nvalues.mean()))
            out.append(str(fold))
            out.append(str(pvalue_2X))
            out.append(str(pvalue_eq))
            all_lines.append(' '.join(out))
    return '\n'.join(all_lines)
def stats(self, x, y):
    if not self.diagonal:
        # drop the diagonal entries before flattening
        xflatten = np.delete(x, [i * (x.shape[0] + 1) for i in range(x.shape[0])])
        yflatten = np.delete(y, [i * (y.shape[0] + 1) for i in range(y.shape[0])])
        p = np.corrcoef(xflatten, yflatten)
        utils.printf('Pearson\'s correlation:\n{}'.format(p))
        utils.printf('Z-Test:{}'.format(ztest(xflatten, yflatten)))
        utils.printf('T-Test:{}'.format(ttest_ind(xflatten, yflatten)))
    else:
        p = np.corrcoef(x, y)
        utils.printf('Pearson\'s correlation:\n{}'.format(p))
        utils.printf('Z-Test:{}'.format(ztest(x, y)))
        utils.printf('T-Test:{}'.format(ttest_ind(x, y)))
# if we leave in Aid Response all the distributions will test as the same;
# comment out the next line to test with all Inc Types
row.pop(1)
# convert from strings
floats = []
for count in row:
    floats.append(float(count))
# keep track of the values
hoodCounts.append(floats)

for index in range(len(hoodCounts)):
    # get the current Neighborhood
    compareHood = hoodNames[index]
    compareSet = hoodCounts[index]
    for next in range(len(hoodCounts)):
        # compare to each other neighborhood
        if index != next:
            testHood = hoodNames[next]
            testSet = hoodCounts[next]
            # run a t-test on the incident distributions
            pVals = stats.ttest_ind(compareSet, testSet, equal_var=False)
            if pVals[1] < 0.05:
                print(testHood)
                print(compareHood)
                print(pVals)
import numpy as np
from scipy import stats

N = 10
a = np.random.randn(N) + 2
b = np.random.randn(N)

# pooled standard deviation (equal sample sizes)
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)
s = np.sqrt((var_a + var_b) / 2)

# hand-rolled two-sample t-statistic and two-sided p-value
t = (a.mean() - b.mean()) / (s * np.sqrt(2.0 / N))
df = 2 * N - 2
p = 1 - stats.t.cdf(t, df=df)
print("t:\t", t, "p:\t", 2 * p)

# should match scipy's result
t2, p2 = stats.ttest_ind(a, b)
print("t2:\t", t2, "p2:\t", p2)
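# The same comparison for unequal variances: a hand-rolled Welch statistic with
# the Welch-Satterthwaite degrees of freedom should match
# stats.ttest_ind(..., equal_var=False). A minimal sketch reusing the arrays
# a and b defined above.
se2_a = var_a / N
se2_b = var_b / N
t_w = (a.mean() - b.mean()) / np.sqrt(se2_a + se2_b)
df_w = (se2_a + se2_b) ** 2 / (se2_a ** 2 / (N - 1) + se2_b ** 2 / (N - 1))
p_w = 2 * (1 - stats.t.cdf(abs(t_w), df=df_w))
print("welch t:\t", t_w, "p:\t", p_w)
print(stats.ttest_ind(a, b, equal_var=False))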
# Import independent two-sample t-test
from scipy.stats import ttest_ind

# Divide `df.brain_vol` by `df.skull_vol`
df['adj_brain_vol'] = df.brain_vol / df.skull_vol

# Select brain measures by Alzheimers group
brain_alz = df.loc[df.alzheimers == True, 'adj_brain_vol']
brain_typ = df.loc[df.alzheimers == False, 'adj_brain_vol']

# Evaluate null hypothesis
results = ttest_ind(brain_alz, brain_typ)
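# The result is a named tuple with .statistic and .pvalue fields; a short
# follow-up to report it (the 0.05 threshold here is an illustrative choice):
print('t = %.3f, p = %.3g' % (results.statistic, results.pvalue))
if results.pvalue < 0.05:
    print('Reject the null: group means differ')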
        break

    client_seconds = time.perf_counter() - start_seconds
    print(f'client-time\t{query_index}\t{client_seconds}\t{server_seconds}')

    # Run additional profiling queries to collect profile data, but only if
    # test times appeared to be different. We have to do it after normal runs
    # because otherwise it will affect test statistics too much.
    if len(all_server_times) != 2:
        continue

    if len(all_server_times[0]) < 3:
        # Don't fail if for some reason there are not enough measurements.
        continue

    pvalue = stats.ttest_ind(all_server_times[0], all_server_times[1],
                             equal_var=False).pvalue
    median = [statistics.median(t) for t in all_server_times]
    # Keep this consistent with the value used in report. Should eventually move
    # to (median[1] - median[0]) / min(median), which is compatible with "times"
    # difference we use in report (max(median) / min(median)).
    relative_diff = (median[1] - median[0]) / median[0]
    print(f'diff\t{query_index}\t{median[0]}\t{median[1]}\t{relative_diff}\t{pvalue}')
    if abs(relative_diff) < ignored_relative_change or pvalue > 0.05:
        continue

    # Perform profile runs for fixed amount of time. Don't limit the number
    # of runs, because we also have short queries.
    profile_start_seconds = time.perf_counter()
count2 = count2 - 1

print('After Sampling:\n')
print(sampled_native_df.describe())
print(sampled_singularity_df.describe())

#df = native_df.merge(singularity_df, how='left')
#print(df.describe())

print('p-value:\t 0.05\n')
print('degrees of freedom:\t ~60\n')
print('Critical t-val:\t 2.0\n')
t_val_rel = stats.ttest_rel(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()
native_df.plot(kind='hist', y='Native Runtime (Seconds)', color='red', ax=ax)
singularity_df.plot(kind='hist', y='Singularity Runtime (Seconds)', color='blue', ax=ax)
plt.savefig('2gpu_Histogram.png')
plt.savefig('2gpu_Histogram.eps')

plt.figure(2)
ax2 = plt.gca()
sampled_native_df.plot(kind='hist',
# ## Use similarity across conditions as the 4th dimension ##########################################
print("Compute similarity via ttest.")
condition_names = list(np.unique(haxby_labels))
n_conds = len(condition_names)
# integer division: np.zeros needs an int shape (plain / yields a float)
n_compares = n_conds * (n_conds - 1) // 2

p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1]))
comparison_text = []
comparison_img = []
idx = 0
for i, cond in enumerate(condition_names):
    for j, cond2 in enumerate(condition_names[i + 1:]):
        print("Computing ttest for %s vs. %s." % (cond, cond2))
        _, p_vector = stats.ttest_ind(
            masked_fmri_vectors[haxby_labels == cond, :],
            masked_fmri_vectors[haxby_labels == cond2, :],
            axis=0)

        # Normalize and log-transform
        p_vector /= p_vector.max()  # normalize
        p_vector = -np.log10(p_vector)
        p_vector[np.isnan(p_vector)] = 0.
        p_vector[p_vector > 10.] = 10.

        p_img = epi_masker.inverse_transform(p_vector)
        comparison_img.append(p_img)
        comparison_text.append('%s vs. %s' % (cond, cond2))
        p_vectors[idx, :] = p_vector
        idx += 1

# ## Convert similarities into a single subject image (like a time-course) ################
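# Self-contained sketch of the same -log10(p) construction for one synthetic
# condition pair, with made-up sample counts and voxel dimension (no masker or
# fMRI data needed):
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
vec_a = rng.normal(0.0, 1.0, size=(20, 50))  # 20 samples x 50 voxels, condition A
vec_b = rng.normal(0.3, 1.0, size=(20, 50))  # condition B, slightly shifted
_, p_vector = stats.ttest_ind(vec_a, vec_b, axis=0)
p_vector /= p_vector.max()
p_vector = -np.log10(p_vector)
p_vector[np.isnan(p_vector)] = 0.
p_vector[p_vector > 10.] = 10.
print(p_vector.round(2))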
def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', '--infile', required=True, help='Tabular file.') parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.') parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;") parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help= "Whether a continuity correction (1/2.) should be taken into account.") parser.add_argument( "--equal_var", action="store_true", default=False, help= "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance." ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.") parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used") parser.add_argument( "--bias", action="store_true", default=False, help="if false,then the calculations are corrected for statistical bias" ) parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored") parser.add_argument("--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored") parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored") parser.add_argument( "--printextras", action="store_true", default=False, help= "If True, if there are extra points a warning is raised saying how many of those points there are" ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help= "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs." ) parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ") parser.add_argument( "--axis", type=int, default=0, help= "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)" ) parser.add_argument( "--n", type=int, default=0, help= "the number of trials. This is ignored if x gives both the number of successes and failures" ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help= "The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5" ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds") parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end." ) parser.add_argument( "--lambda_", type=float, default=1.0, help= "lambda_ gives the power in the Cressie-Read power divergence statistic" ) parser.add_argument( "--imbda", type=float, default=0, help= "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument." ) parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e") parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, 'w+') test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols != None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(';'): barlett_samples.append(map(int, sample.split(','))) if args.sample_one_cols != None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(',') if args.sample_two_cols != None: sample_two_cols = args.sample_two_cols.split(',') sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split('\t') if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(get_value(cols, index)) if sample2 == 1: for index in sample_two_cols: sample_two.append(get_value(cols, index)) if test_id.strip() == 'describe': size, min_max, mean, uv, bs, bk = stats.describe( map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == 'mode': vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == 'nanmean': m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == 'nanmedian': m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == 'kurtosistest': z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == 'variation': ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == 'itemfreq': freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ','.join(map(str, list)) cols.append(elements) elif test_id.strip() == 'nanmedian': m = 
        stats.nanmedian(map(float, sample_one))
        cols.append(m)
    elif test_id.strip() == 'variation':
        ra = stats.variation(map(float, sample_one))
        cols.append(ra)
    elif test_id.strip() == 'boxcox_llf':
        llf = stats.boxcox_llf(imbda, map(float, sample_one))
        cols.append(llf)
    elif test_id.strip() == 'tiecorrect':
        fa = stats.tiecorrect(map(float, sample_one))
        cols.append(fa)
    elif test_id.strip() == 'rankdata':
        r = stats.rankdata(map(float, sample_one), method=args.md)
        cols.append(r)
    elif test_id.strip() == 'nanstd':
        s = stats.nanstd(map(float, sample_one), bias=args.bias)
        cols.append(s)
    elif test_id.strip() == 'anderson':
        A2, critical, sig = stats.anderson(map(float, sample_one),
                                           dist=args.dist)
        cols.append(A2)
        for critical_value in critical:
            cols.append(critical_value)
        cols.append(',')
        for significance_level in sig:
            cols.append(significance_level)
    elif test_id.strip() == 'binom_test':
        p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
        cols.append(p_value)
    elif test_id.strip() == 'gmean':
        gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
        cols.append(gm)
    elif test_id.strip() == 'hmean':
        hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
        cols.append(hm)
    elif test_id.strip() == 'kurtosis':
        k = stats.kurtosis(map(float, sample_one), axis=args.axis,
                           fisher=args.fisher, bias=args.bias)
        cols.append(k)
    elif test_id.strip() == 'moment':
        n_moment = stats.moment(map(float, sample_one), moment=args.n)
        cols.append(n_moment)
    elif test_id.strip() == 'normaltest':
        k2, p_value = stats.normaltest(map(float, sample_one))
        cols.append(k2)
        cols.append(p_value)
    elif test_id.strip() == 'skew':
        skewness = stats.skew(map(float, sample_one), bias=args.bias)
        cols.append(skewness)
    elif test_id.strip() == 'skewtest':
        z_value, p_value = stats.skewtest(map(float, sample_one))
        cols.append(z_value)
        cols.append(p_value)
    elif test_id.strip() == 'sem':
        s = stats.sem(map(float, sample_one), ddof=args.ddof)
        cols.append(s)
    elif test_id.strip() == 'zscore':
        z = stats.zscore(map(float, sample_one), ddof=args.ddof)
        for zscore_value in z:
            cols.append(zscore_value)
    elif test_id.strip() == 'signaltonoise':
        s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
        cols.append(s2n)
    elif test_id.strip() == 'percentileofscore':
        p = stats.percentileofscore(map(float, sample_one), score=args.score,
                                    kind=args.kind)
        cols.append(p)
    elif test_id.strip() == 'bayes_mvs':
        c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                               alpha=args.alpha)
        cols.append(c_mean)
        cols.append(c_var)
        cols.append(c_std)
    elif test_id.strip() == 'sigmaclip':
        c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                         low=args.m, high=args.n)
        cols.append(c)
        cols.append(c_low)
        cols.append(c_up)
    elif test_id.strip() == 'kstest':
        d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf,
                                  N=args.N, alternative=args.alternative,
                                  mode=args.mode)
        cols.append(d)
        cols.append(p_value)
    elif test_id.strip() == 'chi2_contingency':
        chi2, p, dof, ex = stats.chi2_contingency(
            map(float, sample_one), correction=args.correction,
            lambda_=args.lambda_)
        cols.append(chi2)
        cols.append(p)
        cols.append(dof)
        cols.append(ex)
    elif test_id.strip() == 'tmean':
        if nf == 0 and mf == 0:
            mean = stats.tmean(map(float, sample_one))
        else:
            mean = stats.tmean(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
        cols.append(mean)
    elif test_id.strip() == 'tmin':
        if mf == 0:
            min_value = stats.tmin(map(float, sample_one))
        else:
            min_value = stats.tmin(map(float, sample_one), lowerlimit=mf,
                                   inclusive=args.inclusive)
        cols.append(min_value)
    elif test_id.strip() == 'tmax':
        if nf == 0:
            max_value = stats.tmax(map(float, sample_one))
        else:
            max_value = stats.tmax(map(float, sample_one), upperlimit=nf,
                                   inclusive=args.inclusive)
        cols.append(max_value)
    elif test_id.strip() == 'tvar':
        if nf == 0 and mf == 0:
            var = stats.tvar(map(float, sample_one))
        else:
            var = stats.tvar(map(float, sample_one), (mf, nf),
                             (inclusive1, inclusive2))
        cols.append(var)
    elif test_id.strip() == 'tstd':
        if nf == 0 and mf == 0:
            std = stats.tstd(map(float, sample_one))
        else:
            std = stats.tstd(map(float, sample_one), (mf, nf),
                             (inclusive1, inclusive2))
        cols.append(std)
    elif test_id.strip() == 'tsem':
        if nf == 0 and mf == 0:
            s = stats.tsem(map(float, sample_one))
        else:
            s = stats.tsem(map(float, sample_one), (mf, nf),
                           (inclusive1, inclusive2))
        cols.append(s)
    elif test_id.strip() == 'scoreatpercentile':
        if nf == 0 and mf == 0:
            s = stats.scoreatpercentile(
                map(float, sample_one), map(float, sample_two),
                interpolation_method=args.interpolation)
        else:
            s = stats.scoreatpercentile(
                map(float, sample_one), map(float, sample_two), (mf, nf),
                interpolation_method=args.interpolation)
        for score in s:
            cols.append(score)
    elif test_id.strip() == 'relfreq':
        if nf == 0 and mf == 0:
            rel, low_range, binsize, ex = stats.relfreq(
                map(float, sample_one), args.b)
        else:
            rel, low_range, binsize, ex = stats.relfreq(
                map(float, sample_one), args.b, (mf, nf))
        for rel_value in rel:
            cols.append(rel_value)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == 'binned_statistic':
        if nf == 0 and mf == 0:
            st, b_edge, b_n = stats.binned_statistic(
                map(float, sample_one), map(float, sample_two),
                statistic=args.statistic, bins=args.b)
        else:
            st, b_edge, b_n = stats.binned_statistic(
                map(float, sample_one), map(float, sample_two),
                statistic=args.statistic, bins=args.b, range=(mf, nf))
        cols.append(st)
        cols.append(b_edge)
        cols.append(b_n)
    elif test_id.strip() == 'threshold':
        if nf == 0 and mf == 0:
            o = stats.threshold(map(float, sample_one), newval=args.new)
        else:
            o = stats.threshold(map(float, sample_one), mf, nf,
                                newval=args.new)
        for thresholded_value in o:
            cols.append(thresholded_value)
    elif test_id.strip() == 'trimboth':
        o = stats.trimboth(map(float, sample_one),
                           proportiontocut=args.proportiontocut)
        for trimmed_value in o:
            cols.append(trimmed_value)
    elif test_id.strip() == 'trim1':
        t1 = stats.trim1(map(float, sample_one),
                         proportiontocut=args.proportiontocut, tail=args.tail)
        for trimmed_value in t1:
            cols.append(trimmed_value)
    elif test_id.strip() == 'histogram':
        if nf == 0 and mf == 0:
            hi, low_range, binsize, ex = stats.histogram(
                map(float, sample_one), args.b)
        else:
            hi, low_range, binsize, ex = stats.histogram(
                map(float, sample_one), args.b, (mf, nf))
        cols.append(hi)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == 'cumfreq':
        if nf == 0 and mf == 0:
            cum, low_range, binsize, ex = stats.cumfreq(
                map(float, sample_one), args.b)
        else:
            cum, low_range, binsize, ex = stats.cumfreq(
                map(float, sample_one), args.b, (mf, nf))
        cols.append(cum)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == 'boxcox_normmax':
        if nf == 0 and mf == 0:
            ma = stats.boxcox_normmax(map(float, sample_one))
        else:
            ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                      method=args.method)
        cols.append(ma)
    elif test_id.strip() == 'boxcox':
        if imbda == 0:
            box, ma, ci = stats.boxcox(map(float, sample_one),
                                       alpha=args.alpha)
            cols.append(box)
            cols.append(ma)
            cols.append(ci)
        else:
            box = stats.boxcox(map(float, sample_one), imbda,
                               alpha=args.alpha)
            cols.append(box)
    elif test_id.strip() == 'histogram2':
        h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
        for bin_count in h2:
            cols.append(bin_count)
    elif test_id.strip() == 'ranksums':
        z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                              map(float, sample_two))
        cols.append(z_statistic)
        cols.append(p_value)
    elif test_id.strip() == 'ttest_1samp':
        t, prob = stats.ttest_1samp(map(float, sample_one),
                                    map(float, sample_two))
        for t_value in t:
            cols.append(t_value)
        for prob_value in prob:
            cols.append(prob_value)
    elif test_id.strip() == 'ansari':
        AB, p_value = stats.ansari(map(float, sample_one),
                                   map(float, sample_two))
        cols.append(AB)
        cols.append(p_value)
    elif test_id.strip() == 'linregress':
        slope, intercept, r_value, p_value, stderr = stats.linregress(
            map(float, sample_one), map(float, sample_two))
        cols.append(slope)
        cols.append(intercept)
        cols.append(r_value)
        cols.append(p_value)
        cols.append(stderr)
    elif test_id.strip() == 'pearsonr':
        cor, p_value = stats.pearsonr(map(float, sample_one),
                                      map(float, sample_two))
        cols.append(cor)
        cols.append(p_value)
    elif test_id.strip() == 'pointbiserialr':
        r, p_value = stats.pointbiserialr(map(float, sample_one),
                                          map(float, sample_two))
        cols.append(r)
        cols.append(p_value)
    elif test_id.strip() == 'ks_2samp':
        d, p_value = stats.ks_2samp(map(float, sample_one),
                                    map(float, sample_two))
        cols.append(d)
        cols.append(p_value)
    elif test_id.strip() == 'mannwhitneyu':
        mw_stats_u, p_value = stats.mannwhitneyu(
            map(float, sample_one), map(float, sample_two),
            use_continuity=args.mwu_use_continuity)
        cols.append(mw_stats_u)
        cols.append(p_value)
    elif test_id.strip() == 'zmap':
        z = stats.zmap(map(float, sample_one), map(float, sample_two),
                       ddof=args.ddof)
        for zmap_value in z:
            cols.append(zmap_value)
    elif test_id.strip() == 'ttest_ind':
        t_statistic, p_value = stats.ttest_ind(map(float, sample_one),
                                               map(float, sample_two),
                                               equal_var=args.equal_var)
        cols.append(t_statistic)
        cols.append(p_value)
    elif test_id.strip() == 'ttest_rel':
        t, prob = stats.ttest_rel(map(float, sample_one),
                                  map(float, sample_two), axis=args.axis)
        cols.append(t)
        cols.append(prob)
    elif test_id.strip() == 'mood':
        z, p_value = stats.mood(map(float, sample_one),
                                map(float, sample_two), axis=args.axis)
        cols.append(z)
        cols.append(p_value)
    elif test_id.strip() == 'shapiro':
        W, p_value, a = stats.shapiro(map(float, sample_one),
                                      map(float, sample_two), args.reta)
        cols.append(W)
        cols.append(p_value)
        for a_value in a:
            cols.append(a_value)
    elif test_id.strip() == 'kendalltau':
        k, p_value = stats.kendalltau(map(float, sample_one),
                                      map(float, sample_two),
                                      initial_lexsort=args.initial_lexsort)
        cols.append(k)
        cols.append(p_value)
    elif test_id.strip() == 'entropy':
        s = stats.entropy(map(float, sample_one), map(float, sample_two),
                          base=args.base)
        cols.append(s)
    elif test_id.strip() == 'spearmanr':
        if sample2 == 1:
            rho, p_value = stats.spearmanr(map(float, sample_one),
                                           map(float, sample_two))
        else:
            rho, p_value = stats.spearmanr(map(float, sample_one))
        cols.append(rho)
        cols.append(p_value)
    elif test_id.strip() == 'wilcoxon':
        if sample2 == 1:
            T, p_value = stats.wilcoxon(map(float, sample_one),
                                        map(float, sample_two),
                                        zero_method=args.zero_method,
                                        correction=args.correction)
        else:
            T, p_value = stats.wilcoxon(map(float, sample_one),
                                        zero_method=args.zero_method,
                                        correction=args.correction)
        cols.append(T)
        cols.append(p_value)
    elif test_id.strip() == 'chisquare':
        if sample2 == 1:
            rho, p_value = stats.chisquare(map(float, sample_one),
                                           map(float, sample_two),
                                           ddof=args.ddof)
        else:
            rho, p_value = stats.chisquare(map(float, sample_one),
                                           ddof=args.ddof)
        cols.append(rho)
        cols.append(p_value)
    elif test_id.strip() == 'power_divergence':
        if sample2 == 1:
            stat, p_value = stats.power_divergence(map(float, sample_one),
                                                   map(float, sample_two),
                                                   ddof=args.ddof,
                                                   lambda_=args.lambda_)
        else:
            stat, p_value = stats.power_divergence(map(float, sample_one),
                                                   ddof=args.ddof,
                                                   lambda_=args.lambda_)
        cols.append(stat)
        cols.append(p_value)
    elif test_id.strip() == 'theilslopes':
        if sample2 == 1:
            mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                 map(float, sample_two),
                                                 alpha=args.alpha)
        else:
            mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                 alpha=args.alpha)
        cols.append(mpe)
        cols.append(met)
        cols.append(lo)
        cols.append(up)
    elif test_id.strip() == 'combine_pvalues':
        if sample2 == 1:
            stat, p_value = stats.combine_pvalues(
                map(float, sample_one), method=args.med,
                weights=map(float, sample_two))
        else:
            stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                  method=args.med)
        cols.append(stat)
        cols.append(p_value)
    elif test_id.strip() == 'obrientransform':
        ob = stats.obrientransform(*b_samples)
        for transformed_sample in ob:
            elements = ','.join(map(str, transformed_sample))
            cols.append(elements)
    elif test_id.strip() == 'f_oneway':
        f_value, p_value = stats.f_oneway(*b_samples)
        cols.append(f_value)
        cols.append(p_value)
    elif test_id.strip() == 'kruskal':
        h, p_value = stats.kruskal(*b_samples)
        cols.append(h)
        cols.append(p_value)
    elif test_id.strip() == 'friedmanchisquare':
        fr, p_value = stats.friedmanchisquare(*b_samples)
        cols.append(fr)
        cols.append(p_value)
    elif test_id.strip() == 'fligner':
        xsq, p_value = stats.fligner(center=args.center,
                                     proportiontocut=args.proportiontocut,
                                     *b_samples)
        cols.append(xsq)
        cols.append(p_value)
    elif test_id.strip() == 'bartlett':
        T, p_value = stats.bartlett(*b_samples)
        cols.append(T)
        cols.append(p_value)
    elif test_id.strip() == 'levene':
        w, p_value = stats.levene(center=args.center,
                                  proportiontocut=args.proportiontocut,
                                  *b_samples)
        cols.append(w)
        cols.append(p_value)
    elif test_id.strip() == 'median_test':
        stat, p_value, m, table = stats.median_test(
            ties=args.ties, correction=args.correction,
            lambda_=args.lambda_, *b_samples)
        cols.append(stat)
        cols.append(p_value)
        cols.append(m)
        cols.append(table)
        for table_row in table:
            elements = ','.join(map(str, table_row))
            cols.append(elements)
    outfile.write('%s\n' % '\t'.join(map(str, cols)))
outfile.close()
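# --- Added usage sketch (not part of the original tool): a minimal, hedged
# illustration of the dispatch pattern above for a single one-sample test id.
# `sample_one`, `test_id`, and `cols` mirror the tool's names; the input
# values below are invented for illustration.
from scipy import stats

sample_one = ['1.2', '0.8', '1.5', '0.9', '1.1', '1.4', '0.7', '1.3']
cols = []
test_id = 'normaltest'
if test_id.strip() == 'normaltest':
    k2, p_value = stats.normaltest([float(v) for v in sample_one])
    cols.append(k2)
    cols.append(p_value)
print('\t'.join(map(str, cols)))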
clmtempf = ma.filled(clmtempf, fill_value=0.)
clmtropfa = ma.masked_where(maitoatrop <= 0, clmtropfa)
clmtempfa = ma.masked_where(maitoatemp <= 0, clmtempfa)
clmtropfa = ma.filled(clmtropfa, fill_value=0.)
clmtempfa = ma.filled(clmtempfa, fill_value=0.)

clmhis = clmtropf + clmtempf
clmfuture = clmtropfa + clmtempfa
clmhis = ma.masked_where(clmhis[:, :] <= 0, clmhis)
clmfuture = ma.masked_where(clmfuture[:, :] <= 0, clmfuture)

clmhist = clmtrop + clmtemp
clmfutt = clmtropa + clmtempa

# Welch's t-test between historical and future climate yields, per grid cell
tc, pTc = ttest_ind(clmhist, clmfutt, axis=0, equal_var=False)
tc = N.flipud(tc)
pTc = N.flipud(pTc)

yieldclm = clmfuture - clmhis
yieldclm = ma.masked_where(yieldclm == 0., yieldclm)
# mask cells where the difference is not significant at the 10% level
yieldclm1 = ma.masked_where(pTc[:, :] > 0.1, yieldclm)

yieldf = N.zeros((10, 360, 720))
yieldf2 = N.zeros((10, 360, 720))
yieldfa = N.zeros((10, 360, 720))
yieldf2a = N.zeros((10, 360, 720))
yieldfb = N.zeros((10, 360, 720))
yieldf2b = N.zeros((10, 360, 720))
# Plotting by genotype and sex
gensex_fig = plt.figure(2)
mouse_mask_table[['MaskVolume', 'Genotype', 'Sex']].boxplot(by=['Genotype', 'Sex'])
plt.ylabel('$mm^3$')
plt.xlabel('')  # clear the automatic grouped-by axis label
plt.savefig(os.path.join(analysis_path, 'Boxplot_MaskVolumes_ByGenotypeSex'))
# plt.show()

# p-value calculation; assuming equal variances for now
cat2 = mouse_mask_table[mouse_mask_table['Genotype'] == 'KO']
print(ttest_ind(
    mouse_mask_table[mouse_mask_table['Genotype'] == 'WT']['MaskVolume'],
    cat2['MaskVolume'],
    equal_var=True))

mouse_mask_table.to_csv(
    os.path.join(analysis_path, 'Mouse_maskvolume_table.csv'))


## Function to compute volumes for image
def image2volumetable(image_path, voxel_volume):
    # Compute voxel numbers and volumes and output to table
    mouse_mask_image = nib.load(image_path)
    mouse_mask_image_array = mouse_mask_image.get_fdata()
    [mouse_volume_integer, mouse_voxel_number] = np.unique(
        np.int64(np.round(mouse_mask_image_array)), return_counts=True)
    mouse_volume = mouse_voxel_number * voxel_volume
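# --- Added sketch: the comment above fixes equal_var=True "for now"; this is
# a hedged side-by-side of Student's and Welch's t-tests on invented volumes,
# not the study's data.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
wt_volumes = rng.normal(420.0, 15.0, size=12)  # hypothetical WT mask volumes (mm^3)
ko_volumes = rng.normal(400.0, 30.0, size=9)   # hypothetical KO mask volumes (mm^3)
print(ttest_ind(wt_volumes, ko_volumes, equal_var=True))   # Student's t-test
print(ttest_ind(wt_volumes, ko_volumes, equal_var=False))  # Welch's t-test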
# descriptive statistics for rest of world sales
df["Other_Sales"].describe()

# In[6]:

# descriptive statistics for global sales
df["Global_Sales"].describe()

# In[11]:

# T-test for North American sales
stats.ttest_ind(df["Global_Sales"], df["NA_Sales"])

# In[12]:

# T-test for European sales
stats.ttest_ind(df["Global_Sales"], df["EU_Sales"])

# In[13]:

# T-test for Japanese sales
stats.ttest_ind(df["Global_Sales"], df["JP_Sales"])
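# --- Added note (sketch): Global_Sales contains the regional columns, so the
# two samples in each test above are not independent; a paired test on the
# same rows may be the better fit. The toy frame below stands in for df and
# is invented for illustration.
import pandas as pd
from scipy import stats

toy = pd.DataFrame({"Global_Sales": [1.0, 2.5, 0.7, 3.2, 1.8],
                    "NA_Sales": [0.4, 1.1, 0.3, 1.6, 0.9]})
print(stats.ttest_rel(toy["Global_Sales"], toy["NA_Sales"]))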
del current
currentInds = currentInds.reset_index(drop=True)

# test if dataframe is empty to continue to next drug
if currentInds.empty:
    continue
else:
    # separate the concentrations
    conc = np.unique(currentInds['concentration'])
    for dose in conc:
        test = []
        to_test = currentInds['concentration'] == dose
        testing = currentInds[to_test]
        for feature in currentInds.columns[0:-3]:
            test.append(stats.ttest_ind(testing[feature],
                                        controlMeans[rep][feature]))
        # make the p-values into a list
        ps = [test[i][1] for i in range(len(test))]
        ps.append(drug)
        ps.append(dose)
        temp = pd.DataFrame(ps).transpose()
        pVals[rep] = pVals[rep].append(temp)
        del temp, to_test, testing
del currentInds

# add in features
pVals[rep].columns = feats
pVals[rep] = pVals[rep].reset_index(drop=True)

# import module for multiple comparison correction
def q5():
    # Return the result of question 5 here.
    return False


# In[85]:

atletas = ["BRA", "USA", "CAN"]
amostra = athletes[athletes["nationality"].isin(atletas)]
brasileiros = athletes[athletes["nationality"].isin(["BRA"])]["height"]
americanos = athletes[athletes["nationality"].isin(["USA"])]["height"]
sct.ttest_ind(americanos.dropna(), brasileiros.dropna(), equal_var=False)

# ## Question 6
#
# Repeat the procedure from question 5, but now between the heights of `bra`
# and `can`. Can we now state that the means are statistically equal? Answer
# with a boolean (`True` or `False`).

# In[11]:

def q6():
    # Return the result of question 6 here.
    return True


# In[88]:
def compara_media(str_1, str_2):
    v1 = athletes[athletes["nationality"].isin([str(str_1)])]["height"]
    v2 = athletes[athletes["nationality"].isin([str(str_2)])]["height"]
    return sct.ttest_ind(v1.dropna(), v2.dropna(), equal_var=False)
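# --- Added usage sketch: a stand-in `athletes` frame with the two columns
# compara_media reads; the rows are invented, and in the notebook the real
# frame comes from the earlier cells.
import pandas as pd
import scipy.stats as sct

athletes = pd.DataFrame({"nationality": ["BRA", "BRA", "BRA", "CAN", "CAN", "CAN"],
                         "height": [1.75, 1.80, 1.72, 1.78, 1.82, 1.85]})
print(compara_media("BRA", "CAN"))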
# print("df2.income.sum() : ", df2.income.sum()) # # print("+++++++++중앙값 구하기++++++++++") # print("df2.income.median() : ", df2.income.median()) # # print("+++++++++기초통계량 요약해서 출력하기++++++++++") # print("df2.describe() : ", df2.describe()) # print("df2.income.describe() : ", df2.income.describe()) # # print("df2.sex.value_counts() : ", df2.sex.value_counts()) # print("df2.groupby(df2.sex).mean()", df2.groupby(df2.sex).mean()) male = df2.income[df2.sex == 'm'] female = df2.income[df2.sex == 'f'] ttest_result = stats.ttest_ind(male, female) print(ttest_result) print("ttest_result[0]", ttest_result[0]) print("ttest_result[1]", ttest_result[1]) if ttest_result[1] > 0.05: print(f'p-value는 {ttest_result[1]}로 95% 수준에서 유의하지않음') else: print(f'p-value는 {ttest_result[1]}로 95% 수준에서 유의함') corr = df2.corr(method='spearman') print("corr:", corr) income_stress_corr = df2.income.corr(df2.stress) print("income_stress_corr:", income_stress_corr)
def return_stats(
        stock='jpm',
        commission=2,
        money=100000,
        # inc=10, - can read this argument and change code below if doing absolute share-based
        # original_shares=100, - can read this argument and change code below if doing absolute share-based
        policies=[hold, random_action, rule_based, ols, qlearner]):
    '''
    Enacts every strategy and provides summary statistics and graphs.

    Inputs
        stock: ticker symbol of the stock to trade
        commission: cost per trade
        money: original cash held
        policies: list of trading policies to evaluate
    Output
        None. Provides numerous summary statistics and visualizations.
    '''
    original_money = money

    # generate stock table
    stock_table = read_stock(stock, start, end)

    # note stock name
    stock_name = stock.upper()

    # approximate 50/50 split in money-stock
    original_shares = round(money / 2 / stock_table.values[0])

    # recalculate money accordingly
    money -= (stock_table.values[0] * original_shares)

    # make share increment about 1% of original share holdings
    inc = m.ceil(original_shares / 100)

    # generate results
    results = {
        policy.__name__: policy(stock_table,
                                money=money,
                                inc=inc,
                                original_shares=original_shares,
                                commission=commission)
        for policy in policies
    }

    # plot qtables only for qlearner (or any other strategies with a Q table)
    for policy in policies:
        if results[policy.__name__]['qtable'] is not None:
            # don't try to plot Q tables for benchmark strategies;
            # get state history, quantile length and qtable for the
            # normalization and averaging function
            state_history = results[policy.__name__]['state_history']
            quantile_length = len(results[policy.__name__]['BB_quantiles'])
            qtab = results[policy.__name__]['qtable']

            qtab_bb = weighted_average_and_normalize(qtab, state_history, 0,
                                                     quantile_length)
            # reverse order of rows for visualization purposes - now the
            # biggest value will be on top
            qtab_bb = qtab_bb.iloc[::-1]
            # define index as bb quantiles, reversing quantile order in kind
            # so the biggest value is first
            qtab_bb.index = np.round(
                np.flip(np.array(results[policy.__name__]['BB_quantiles'])), 5)

            # plot BB heatmap
            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_bb, cmap='Blues')
            plt.title('Bollinger Band % Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_bb.index))],
                             xmin=0, xmax=10, linewidth=10, color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

            # marginalize over SMA
            # TODO - determine if this mean was taken correctly
            qtab_sma = weighted_average_and_normalize(qtab, state_history, 1,
                                                      quantile_length)
            qtab_sma = qtab_sma.iloc[::-1]
            qtab_sma.index = np.round(
                np.flip(np.array(results[policy.__name__]['SMA_quantiles'])), 5)
            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_sma, cmap='Blues')
            plt.title('SMA Percentage Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_sma.index))],
                             xmin=0, xmax=10, linewidth=10, color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

            # marginalize over MRDR
            # TODO - determine if this mean was taken correctly
            qtab_mrdr = weighted_average_and_normalize(qtab, state_history, 2,
                                                       quantile_length)
            qtab_mrdr = qtab_mrdr.iloc[::-1]
            qtab_mrdr.index = np.round(
                np.flip(np.array(results[policy.__name__]['MRDR_quantiles'])), 5)
            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_mrdr, cmap='Blues')
            plt.title('Market Relative Daily Return Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_mrdr.index))],
                             xmin=0, xmax=10, linewidth=10, color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

    # get markov transition models
    for policy in policies:
        plt.figure(figsize=(6, 3))
        plt.title('Transition Matrix For ' + policy.__name__, size=16)
        mkv = results[policy.__name__]['markov']
        fig = heatmap(mkv, annot=True, annot_kws={'size': 14}, cmap='Greens',
                      cbar=False)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14, rotation=0)
        plt.gca().set(xlabel='Current Trading Day', ylabel='Last Trading Day')
        plt.gca().tick_params(axis='x', bottom=False, left=False)
        plt.gca().tick_params(axis='y', bottom=False, left=False)
        plt.gca().hlines([1, 2], xmin=0, xmax=10, linewidth=10, color='white')
        plt.show(fig)

    # plot daily portfolio values
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['final_vals'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Portfolio Value ($)", fontsize=20)
    plt.title("Daily Portfolio Values For Different Trading Strategies: " +
              stock.upper(), fontsize=25)
    plt.show()

    # plot daily cash values
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['cash'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Cash Held ($)", fontsize=20)
    plt.title("Daily Cash Held For Different Trading Strategies: " +
              stock.upper(), fontsize=25)
    plt.show()

    # plot daily shares
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['shares'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Shares Held", fontsize=20)
    plt.title("Daily Share Holdings For Different Trading Strategies: " +
              stock_name, fontsize=25)
    plt.show()

    # plot daily portfolio values with buy/sell/hold markers
    for i, policy in enumerate(policies):
        dic = results[policy.__name__]
        if dic['state_history'] is not None:
            print("States History for " + policy.__name__ + " is: ",
                  dic['state_history'])
        del dic['state_history']
        del dic['qtable']
        del dic['markov']
        try:
            del dic['BB_quantiles']
            del dic['SMA_quantiles']
            del dic['MRDR_quantiles']
        except KeyError:
            pass
        df = pd.DataFrame(dic)
        plt.figure(figsize=(14, 8))
        plt.plot([], label="BUY", color="orange", marker='o')
        plt.plot([], label="SELL", color="black", marker='o')
        plt.plot([], label="HOLD", color="red", marker='o')
        buy_df = df[df.actions == 'BUY']
        sell_df = df[df.actions == 'SELL']
        hold_df = df[df.actions == 'HOLD']
        plt.plot(results[policy.__name__]['final_vals'], label=policy.__name__)
        plt.scatter(buy_df.index, buy_df['final_vals'], color='orange',
                    marker='^', s=10)
        plt.scatter(sell_df.index, sell_df['final_vals'], color='black',
                    marker='v', s=10)
        plt.scatter(hold_df.index, hold_df['final_vals'], color='red',
                    marker='s', s=10)
        plt.xlabel("Date", fontsize=20)
        plt.ylabel("Portfolio Value ($)", fontsize=20)
        plt.title("Daily Portfolio Values For Trading Strategies of " +
                  policy.__name__ + " for stock : " + stock.upper(),
                  fontsize=25)
        plt.legend()
        plt.show()

    # display percentages
    # TODO: display(res) has no display() function. Fix bug.
    for policy in policies:
        print('For ' + stock_name + ',', policy.__name__,
              'action proportions were:')
        res = results[policy.__name__]['actions'].value_counts()
        res = res / res.sum()
        print(res)
        print('\n')
        print('For ' + stock_name + ',', policy.__name__,
              'average return based on action was:')
        res = returns(results[policy.__name__]['final_vals']).groupby(
            results[policy.__name__]['actions']).mean()
        print(res)
        print('\n')

    # calculate final returns
    for policy in policies:
        print('Final portfolio value under', policy.__name__,
              'strategy for ' + stock_name + ':',
              round(results[policy.__name__]['final_vals'].values[-1], 0))
        print('\n')

    # calculate final percentage of money invested in stock
    for policy in policies:
        print('Final percentage of money invested in stock under',
              policy.__name__, 'strategy for ' + stock_name + ':',
              str(round(100 * (1 - (results[policy.__name__]['cash'].values[-1] /
                                    results[policy.__name__]['final_vals'].values[-1])),
                        1)) + '%')
        print('\n')

    # calculate returns
    rets = {
        policy: returns(results[policy.__name__]['final_vals'])
        for policy in policies
    }

    # generate risk-free return for Sharpe ratio - five-year treasury yield
    rfs = returns(read_stock('^FVX'))

    # find common indices between stock tables and treasury yields
    rfn = set(stock_table.index).intersection(set(rfs.index))

    # now reindex
    rfr = rfs.loc[rfn]
    rfi = rfr.index

    # generate baseline return for information ratio - S&P 500
    bls = returns(read_stock('^GSPC')).values

    # print summary stats for daily returns
    for policy in policies:
        nm = policy.__name__

        # mean daily return
        print('Mean daily return under', nm, 'for', stock_name + ':',
              str(round(np.mean(rets[policy], axis=0), 5)))

        # standard deviation of daily return
        print('Standard deviation of daily return under', nm, 'for',
              stock_name + ':', round(np.std(rets[policy], axis=0), 3))

        # information ratio of daily return
        checkhist(rets[policy].values, bls)
        pr = np.mean(rets[policy].values)
        br = np.mean(bls)
        te = np.std(rets[policy].values - bls)
        ir = round((pr - br) / te * np.sqrt(len(bls)), 2)
        print('Information Ratio against S&P 500 under', nm, 'strategy for',
              stock_name + ':', ir)

        # Sharpe ratio of daily return
        # need to correct dates to line up with risk-free return
        dat = rets[policy].loc[rfi].values
        checkhist(dat, rfr)
        rp = np.mean(dat)
        br = np.mean(rfr)
        sd = np.std(rfr - dat)
        sr = round((rp - br) / sd * np.sqrt(len(rfr)), 2)
        print('Sharpe Ratio against five-year treasury yield under', nm,
              'strategy for', stock_name + ':', sr)
        print('Note: only used dates when five-year treasury yields were '
              'available in calculating RFR for Sharpe Ratio')
        print('\n')

    for policy1 in policies:
        # filter to dates with five-year treasury yields available
        p1 = rets[policy1].loc[rfi].values
        n1 = policy1.__name__

        # independent samples t-test vs. risk-free return
        checkhist(p1, rfr)
        t = ttest_ind(p1, rfr, equal_var=True)
        gr = t[0] > 0
        n2 = 'rfr'
        p = round(t[1], 3) / 2  # make one-sided
        if gr:
            print('T-test for difference of mean returns in', n1, 'and', n2,
                  'finds', n1, '>', n2, 'with p-value', round(p, 3))
        else:
            print('T-test for difference of mean returns in', n2, 'and', n1,
                  'finds', n2, '>', n1, 'with p-value', round(p, 3))

        # levene test vs. baseline return
        l = levene(rets[policy1].values, bls)
        p = round(l[1], 3)
        gr = np.std(rets[policy1].values) > np.std(bls)
        n2 = 'bls'
        if gr:
            print('Levene test for difference of variances (volatility) in',
                  n1, 'and', n2, 'finds p-value of', round(p, 3), 'with', n1,
                  'showing more volatility')
        else:
            print('Levene test for difference of variances (volatility) in',
                  n1, 'and', n2, 'finds p-value of', round(p, 3), 'with', n2,
                  'showing more volatility')
        print('\n')

        for policy2 in policies:
            if policy1 != policy2:  # and hash(policy1) <= hash(policy2) - not necessary
                # no longer need to filter to dates with five-year treasury
                # yields available
                p1 = rets[policy1].values
                p2 = rets[policy2].values
                checkhist(p1, p2)
                n2 = policy2.__name__

                # independent samples t-test
                t = ttest_ind(p1, p2, equal_var=True)
                gr = t[0] > 0
                p = round(t[1], 3) / 2  # make one-sided
                if gr:
                    print('T-test for difference of mean returns in', n1,
                          'and', n2, 'finds', n1, '>', n2, 'with p-value',
                          round(p, 3))
                else:
                    print('T-test for difference of mean returns in', n2,
                          'and', n1, 'finds', n2, '>', n1, 'with p-value',
                          round(p, 3))

                # levene test
                l = levene(p1, p2)
                p = round(l[1], 5)
                gr = np.std(p1) > np.std(p2)
                if gr:
                    print('Levene test for difference of variances '
                          '(volatility) in', n1, 'and', n2,
                          'finds p-value of', round(p, 3), 'with', n1,
                          'showing more volatility')
                else:
                    print('Levene test for difference of variances '
                          '(volatility) in', n1, 'and', n2,
                          'finds p-value of', round(p, 3), 'with', n2,
                          'showing more volatility')
                print('\n')
        print('\n')

    # TODO: add any additional desired visualizations
    plt.show()
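# --- Added sketch: the one-sided decision used in the loops above (halve the
# two-sided p-value and read the sign of t), reduced to two invented return
# series.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(1)
rets_a = rng.normal(0.0006, 0.01, 500)  # hypothetical daily returns, strategy A
rets_b = rng.normal(0.0001, 0.01, 500)  # hypothetical daily returns, strategy B
t, p_two_sided = ttest_ind(rets_a, rets_b, equal_var=True)
p_one_sided = p_two_sided / 2  # valid one-sided p when t has the hypothesized sign
print('A > B' if t > 0 else 'B > A', 'one-sided p =', round(p_one_sided, 3))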
sdf = pd.DataFrame()
best_algo = "BL"
for e in error_metrics:
    X = df.loc[df['algorithm'] == best_algo, e].to_numpy()
    sdf = sdf.append(
        {
            'algorithm': best_algo,
            'Error': np.mean(X),
            'metric': e
        },
        ignore_index=True)
    for algo in df['algorithm'].unique():
        if algo == best_algo:
            continue
        Y = df.loc[df['algorithm'] == algo, e].to_numpy()
        p_value = stats.ttest_ind(X, Y).pvalue
        # print(p_value)
        if p_value <= 0.05:
            sdf = sdf.append(
                {
                    'algorithm': algo,
                    'Error': np.mean(Y),
                    'metric': e
                },
                ignore_index=True)
        else:
            sdf = sdf.append(
                {
                    'algorithm': algo,
                    'Error': np.mean(X),
                    'metric': e
def main(args):
    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, runOrder=args.order, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. In this case there can be as many groups as
    # possible. The order variable is ignored and t-tests are performed
    # pairwise for each pair of groups.
    if args.pairing == "unpaired":
        logger.info("Unpaired t-test will be performed for all groups pairwise.")

        # Getting the unique groups and all pairwise permutations so that we
        # can feed them to pairwise unpaired t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. This depends on whether the user
        # has provided the ordering variable or not. That variable is useless
        # for the unpaired test; it just adds an extra column to the data frame.
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2
        # Saving the treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # It has nothing to do with the unpaired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize the group. Only feature
            # columns remain. We also transpose here so it is easier to operate
            # with. We drop either 1 or 2 columns depending on whether the
            # order variable was fed.
            if args.order == False:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(
                    args.group, 1).transpose()
            else:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(
                    [args.group, args.order], 1).transpose()

            # Pulling the indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # This part just produces summary statistics for the output table.
        # It has nothing to do with the unpaired t-test itself.
        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize the group. Only feature
            # columns remain. We also transpose here so it is easier to operate
            # with. We drop either 1 or 2 columns depending on whether the
            # order variable was fed.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(
                    args.group, 1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop(
                    [args.group, args.order], 1).transpose()

            # Pulling the indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding the current mean_value column to the data frame and
            # assigning the name.
            means_value_column_name_current = (
                'mean_treatment_' + group_values_series_unique[i])
            summary_df[means_value_column_name_current] = means_value

        # Running pairwise unpaired (two-sample) t-tests for all pairs of group
        # levels that are saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):
            # Extracting the pieces of the data frame that belong to the groups
            # saved in the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[data_frame[
                args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[data_frame[
                args.group].isin([groups_subset[1]])]

            # Dropping columns that characterize the group. Only feature
            # columns remain. We also transpose here so it is easier to operate
            # with. We drop either 1 or 2 columns depending on whether the
            # order variable was fed.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(
                    args.group, 1).transpose()
                data_frame_second_group = data_frame_second_group.drop(
                    args.group, 1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop(
                    [args.group, args.order], 1).transpose()
                data_frame_second_group = data_frame_second_group.drop(
                    [args.group, args.order], 1).transpose()

            # Pulling the indexes list from the first group (both lists are the same).
            indexes_list = data_frame_first_group.index.tolist()

            # Creating p_value, t_value, neg_log10_p_value, flag_value and
            # difference_value lists filled with zeros.
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features
            difference_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]
                ttest_ind_args = [series_first, series_second]
                p_value[j] = ttest_ind(*ttest_ind_args)[1]
                t_value[j] = ttest_ind(*ttest_ind_args)[0]
                # Possible alternative for two groups:
                # p_value[j] = ttest_ind(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Creating column names for the data frame.
            p_value_column_name_current = (
                'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' +
                groups_subset[1])
            t_value_column_name_current = (
                't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1])
            neg_log10_p_value_column_name_current = (
                'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1])
            difference_value_column_name_current = (
                'diff_of_' + groups_subset[0] + '_' + groups_subset[1])
            flag_value_column_name_current_0p01 = (
                'flag_significant_0p01_on_' + groups_subset[0] + '_' +
                groups_subset[1])
            flag_value_column_name_current_0p05 = (
                'flag_significant_0p05_on_' + groups_subset[0] + '_' +
                groups_subset[1])
            flag_value_column_name_current_0p10 = (
                'flag_significant_0p10_on_' + groups_subset[0] + '_' +
                groups_subset[1])

            # Adding the current p_value and flag_value columns to the data
            # frame and assigning the names. If the flag data frame has not
            # been created yet (i.e. i == 0), we create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point the data frame exists, so columns are simply added
            # to the existing data frame.
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO
    # groups. Each sample in one group should have exactly one matching pair in
    # the other group. The matching is controlled by the args.order variable.
    if args.pairing == "paired":
        logger.info("Paired test will be performed for two groups pairwise "
                    "based on pairing variable: {0}.".format(args.order))

        # Getting the number of unique groups. If it is bigger than 2 we return
        # a warning and exit.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(
                u"The number of unique groups is {0} and not 2 as expected. "
                u"The paired t-test cannot be performed."
                .format(number_of_unique_groups))
            exit()

        # This code is executed only if number_of_unique_groups is exactly 2,
        # i.e. the group check passed. Creating the pairwise combination of our
        # two groups that we will use later.
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. This will depend on whether the
        # user has provided the ordering variable or not.
        # Checking that the required pairing variable has been provided.
        if args.order == False:
            logger.info("The required t-test pairing variable has not been "
                        "provided: the paired t-test cannot be performed.")
            exit()

        # This code is executed only if args.order has been provided and the
        # check passed. Defining the number of features: the dimension of the
        # data frame minus the 2 columns that hold args.group and args.order.
        number_of_features = data_frame.shape[1] - 2

        # At this point it is confirmed that there are only 2 groups and that
        # the pairing variable args.order has been provided. Now we need to
        # check that the pairing is correct, i.e. that each pairID corresponds
        # to exactly two samples from different groups.

        # Getting the unique pairIDs; those with more or fewer than two samples
        # will be deleted below.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of samples in the final frame.
        number_of_samples = data_frame.shape[0]

        # Performing the cleaning of the original data. We remove samples that
        # are not paired or do not belong to the two groups.
        # If the dataset has 1 or 3 or more matches for a pairID, those samples
        # are removed with a warning. If a pairID corresponds to exactly two
        # samples (which is correct) but the groupIDs are NOT different, those
        # values are also removed.
        for i in range(0, number_of_unique_pairid):
            # Extracting the pieces of the data frame that belong to the ith
            # unique pairID.
            data_frame_current_pairid = data_frame.loc[data_frame[
                args.order].isin([pairid_values_series_unique[i]])]

            # We transpose here so it is easier to operate with.
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(
                data_frame_current_pairid.columns.values)

            if data_frame_current_pairid.shape[1] != 2:
                logger.warning(
                    u"Number of samples for the pairID: {0} is equal to {1} "
                    u"and NOT equal to 2. Sample(s) {2} will be removed from "
                    u"further analysis."
                    .format(pairid_values_series_unique[i],
                            data_frame_current_pairid.shape[1],
                            sample_names_current_pairid))

                # Getting the indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(
                    sample_names_current_pairid)
                # Deleting the indexes; the for loop continues to the next iteration.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete],
                                inplace=True)

            # This branch is executed if the number is correct, i.e.
            # data_frame_current_pairid.shape[1] == 2. Here we check whether
            # the groupIDs for the given pair are indeed different.
            elif data_frame_current_pairid.transpose()[args.group][0] == \
                    data_frame_current_pairid.transpose()[args.group][1]:
                logger.warning(
                    u"Samples in pairID {0} have groupIDs: {1} and {2}. They "
                    u"should be different! Sample(s) {3} will be removed from "
                    u"further analysis."
                    .format(pairid_values_series_unique[i],
                            data_frame_current_pairid.transpose()[args.group][1],
                            data_frame_current_pairid.transpose()[args.group][0],
                            sample_names_current_pairid))

                # Getting the indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(
                    sample_names_current_pairid)
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete],
                                inplace=True)

        # Checking if the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(
                u"Number of paired samples in the final dataset is exactly 0! "
                u"Please check the design file for accuracy! Exiting the program.")
            exit()

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # It has nothing to do with the paired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize the group. Only feature
            # columns remain. We also transpose here so it is easier to operate with.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(
                [args.group, args.order], 1).transpose()

            # Pulling the indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # This part just produces summary statistics for the output table.
        # It has nothing to do with the paired t-test itself.
        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize the group. Only feature columns remain.
            data_frame_current_group = data_frame_current_group.drop(
                [args.group, args.order], 1).transpose()

            # Pulling the indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding the current mean_value column to the data frame and
            # assigning the name.
            means_value_column_name_current = (
                'mean_treatment_' + group_values_series_unique[i])
            summary_df[means_value_column_name_current] = means_value

        # Performing the paired t-test for the two groups and saving the
        # results. Creating p_value and flag_value lists of length
        # number_of_features; these will be used for the two groups in the
        # paired t-test.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features

        # Performing the paired t-test for each feature.
        for j in range(0, number_of_features):
            # Extracting the pieces of the data frame that belong to each group.
            data_frame_first_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[0]])]
            data_frame_second_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[1]])]

            # Sorting the data frames by the args.order index.
            # This ensures the datasets are aligned by pair when fed to the t-test.
            data_frame_first_group = data_frame_first_group.sort(args.order)
            data_frame_second_group = data_frame_second_group.sort(args.order)

            # Dropping the group and order columns; only feature columns remain.
            data_frame_first_group = data_frame_first_group.drop(
                [args.group, args.order], 1).transpose()
            data_frame_second_group = data_frame_second_group.drop(
                [args.group, args.order], 1).transpose()

            # Pulling the list of indexes. It is the same for both groups.
            indexes_list = data_frame_first_group.index.tolist()

            # Pulling the samples out.
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Running the paired t-test for the two given samples.
            paired_ttest_args = [series_first, series_second]
            p_value[j] = ttest_rel(*paired_ttest_args)[1]
            t_value[j] = ttest_rel(*paired_ttest_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # The loop over features is finished by now; converting the results
        # into the data frame. Creating column names for the data frame.
        p_value_column_name_current = (
            'prob_greater_than_t_for_diff_' + group_values_series_unique[0] +
            '_' + group_values_series_unique[1])
        t_value_column_name_current = (
            't_value_for_diff_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1])
        neg_log10_p_value_column_name_current = (
            'neg_log10_p_value_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1])
        difference_value_column_name_current = (
            'diff_of_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1])
        flag_value_column_name_current_0p01 = (
            'flag_value_diff_signif_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1] + '_0p01')
        flag_value_column_name_current_0p05 = (
            'flag_value_diff_signif_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1] + '_0p05')
        flag_value_column_name_current_0p10 = (
            'flag_value_diff_signif_' + group_values_series_unique[0] + '_' +
            group_values_series_unique[1] + '_0p10')

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up the figure
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing the cutoff line
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format the axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}"
                .format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))

            # Add the figure to the PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending the script
    logger.info(u"Finishing t-test run.")
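# --- Added sketch: the core of the paired branch above reduced to a single
# feature; the values are invented and assumed already aligned by pairID.
import numpy as np
from scipy.stats import ttest_rel

series_first = np.array([10.1, 9.8, 11.2, 10.5, 9.9])
series_second = np.array([10.9, 10.2, 11.8, 10.4, 10.6])
t, p = ttest_rel(series_first, series_second)
print(t, p, -np.log10(p))  # t_value, p_value, neg_log10_p_value as in the tool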
sex = list(merged_df.gender.values)
site = list(merged_df.site.values)
age = list(merged_df.age.values)

train_index, test_index = None, None

while flag_selection:
    splits = StratifiedShuffleSplit(n_splits=1, test_size=args.test_size)
    for train_index, test_index in splits.split(np.zeros(len(site)), site):
        age_test = [float(age[idx]) for idx in test_index]
        age_train = [float(age[idx]) for idx in train_index]

        sex_test = [sex_dict[sex[idx]] for idx in test_index]
        sex_train = [sex_dict[sex[idx]] for idx in train_index]

        t_age, p_age = ttest_ind(age_test, age_train)
        T_sex = chi2(sex_test, sex_train)

        print(p_age, T_sex)
        # accept the split only if age and sex are balanced between train and test
        if p_age > args.p_val_threshold and T_sex < args.t_val_threshold:
            flag_selection = False
            test_df = merged_df.iloc[test_index]
            train_df = merged_df.iloc[train_index]
            train_df.to_csv(train_path, sep='\t', index=False)
            test_df.to_csv(test_path, sep='\t', index=False)
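# --- Added note (sketch): one explicit way to run the sex-balance check is a
# contingency-table chi-square; this is an alternative to the chi2() call
# above, not the original implementation, and the encoded labels are invented.
import numpy as np
from scipy.stats import chi2_contingency

sex_train = [0, 1, 0, 0, 1, 1, 0, 1, 0, 1]  # hypothetical encoded labels
sex_test = [0, 1, 1, 0, 1]
table = np.array([[sex_train.count(0), sex_train.count(1)],
                  [sex_test.count(0), sex_test.count(1)]])
chi2_stat, p_sex, _, _ = chi2_contingency(table)
print(chi2_stat, p_sex)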
print("%20s %20s %20s %20s %20s %20s %20s" % ( begin_date, round(init_svi, 4), round(init_bs, 4), round(portfolio_net_svi / init_svi, 4), round(portfolio_net_bs / init_bs, 4), round(tradedamt_svi / holdamt_svi, 4), round(tradedamt_bs / holdamt_bs, 4))) print('=' * 200) print("%20s %20s %20s %20s %20s %20s %20s %20s" % ( "eval date", "spot", "delta", 'price_svi', 'price_bs', 'portfolio_svi', 'portfolio_bs', 'transaction')) print('svi_pnl', sum(svi_pnl) / len(svi_pnl)) print('bs_pnl', sum(bs_pnl) / len(bs_pnl)) results = {} results.update({'date': dates}) results.update({'pnl svi': svi_pnl}) results.update({'pnl bs': bs_pnl}) results.update({'option init svi': option_init_svi}) results.update({'option init bs': option_init_bs}) results.update({'transaction svi': transaction_svi}) results.update({'transaction bs': transaction_bs}) results.update({'holdings svi': holdings_svi}) results.update({'holdings bs': holdings_bs}) df = pd.DataFrame(data=results) # print(df) df.to_csv(os.path.abspath('..') + '/results4/dh_MA_'+barrier_type+'_r=' +str(rebalancerate) + '_b=' + str(barrier_pct) + 'f2.csv') t,p = stats.ttest_ind(svi_pnl,bs_pnl) t1,p1 = stats.wilcoxon(svi_pnl,bs_pnl) print(barrier_type, ' ',barrier_pct) print('t : ',t,p) print('wilcoxom : ',t1,p1)
def generate_tstats_classes(df, dest_dir, params):
    """Computes a t-test for each class.

    The t-test is computed by comparing class-level metrics for a set of
    sparse model checkpoints to non-sparse model checkpoints.

    Args:
        df: input dataframe with class level metrics.
        dest_dir: pathway to output directory.
        params: dataset specific params.
    """
    human_label_lookup = class_level_metrics.HumanLabelLookup()
    label_dict = human_label_lookup.create_library()
    class_names = list(label_dict.values())
    df = df.drop(columns='Unnamed: 0')
    df.reset_index(inplace=True, drop=True)
    df['id'] = df.index
    df_ = pd.wide_to_long(df,
                          stubnames=['precision', 'recall'],
                          i='id',
                          j='class',
                          sep='/',
                          suffix=r'\w+').reset_index()
    data = pd.DataFrame([])
    num_classes = params['num_classes']
    mean_accuracy_dict = params['accuracy']
    long_df_all = df_

    for i in range(num_classes):
        # adding the label id ensures unique naming of classes
        c = class_names[i] + '_' + str(i)
        for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
            variant_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == p)
                & (long_df_all['class'] == c))]['recall'].mean()
            baseline_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0)
                & (long_df_all['class'] == c))]['recall'].mean()

            # normalize recall by model accuracy
            baseline_set = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0)
                & (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[0.0]
            variant_set = long_df_all[(
                (long_df_all['fraction_pruned'] == p)
                & (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[p]

            # Welch's t-test between baseline and pruned-model recall
            t_stat = ttest_ind(baseline_set, variant_set, equal_var=False)
            data = data.append(pd.DataFrame(
                {
                    'class': c,
                    'pruning_fraction': p,
                    'baseline_mean_recall': baseline_mean_recall,
                    'variant_mean_recall': variant_mean_recall,
                    'pvalue_recall_norm': t_stat[1],
                    'statistic_recall_norm': t_stat[0],
                },
                index=[0]),
                ignore_index=True)

    time_ = str(time.time())
    output_file = 'recall_t_statistic'
    file_name = '_' + time_ + '_' + output_file + '.csv'
    file_path = os.path.join(dest_dir, file_name)
    with tf.gfile.Open(file_path, 'w') as f:
        data.to_csv(f)
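# --- Added sketch: the per-class comparison above in miniature; the recall
# values and mean accuracies are invented, and the normalization mirrors the
# loop body.
import numpy as np
from scipy.stats import ttest_ind

baseline_set = np.array([0.82, 0.85, 0.80, 0.83]) - 0.81  # recall minus baseline accuracy
variant_set = np.array([0.74, 0.78, 0.72, 0.75]) - 0.76   # recall minus pruned-model accuracy
t_stat = ttest_ind(baseline_set, variant_set, equal_var=False)
print(t_stat[0], t_stat[1])  # statistic_recall_norm, pvalue_recall_norm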
    def two_cal(self, x, norm_res, homo_res, skews, paired=False):
        '''Calculate and return two-sample comparison test results.

        Parameters:
        ----------
        x : list of numpy.ndarray
            Variables to test on
        norm_res : dict
            Normality test results
        homo_res : dict
            Homogeneity test results
        skews : list
            Skewness values of all x variables
        paired : bool
            False for two independent variables, True otherwise

        Returns:
        -------
        res : dict
            'Statistic': statistic value calculated by the test
            'Pvalue': p-value calculated by the test
            'Test': name of the test used
            'Result': True if failed to reject the null hypothesis, False otherwise

        Notes:
        -----
        None'''
        res = {}
        if sum(norm_res.values()) == len(x) and all(abs(np.array(skews)) < .5):
            # All variables normal and roughly symmetric
            if paired:
                # Paired samples
                res['Statistic'] = ss.ttest_rel(x[0], x[1])[0]
                res['Pvalue'] = ss.ttest_rel(x[0], x[1])[1]
                res['Test'] = 'T-test with paired samples'
            else:
                # Independent samples
                if homo_res:
                    # Variances equal
                    res['Statistic'] = ss.ttest_ind(x[0], x[1])[0]
                    res['Pvalue'] = ss.ttest_ind(x[0], x[1])[1]
                    res['Test'] = 'T-test with independent samples'
                else:
                    # Variances unequal
                    res['Statistic'] = ss.ttest_ind(x[0], x[1], equal_var=False)[0]
                    res['Pvalue'] = ss.ttest_ind(x[0], x[1], equal_var=False)[1]
                    res['Test'] = 'Welch\'s T-test'
        else:
            # If not all normal, use nonparametric tests.
            if paired:
                # Paired samples
                res['Statistic'] = ss.wilcoxon(x[0].reshape(-1), x[1].reshape(-1))[0]
                res['Pvalue'] = ss.wilcoxon(x[0].reshape(-1), x[1].reshape(-1))[1]
                res['Test'] = 'Wilcoxon signed-rank Test with Paired Samples'
            else:
                # Independent samples
                if all(len(arr) >= 20 for arr in x[:2]):
                    # Both sample sizes >= 20
                    res['Statistic'] = ss.mannwhitneyu(x[0], x[1])[0]
                    res['Pvalue'] = ss.mannwhitneyu(x[0], x[1])[1]
                    res['Test'] = 'Mann-Whitney U Test with Independent Samples'
                else:
                    # Sample size < 20
                    res['Statistic'] = ss.ranksums(x[0], x[1])[0]
                    res['Pvalue'] = ss.ranksums(x[0], x[1])[1]
                    res['Test'] = 'Wilcoxon rank-sum Test with Independent Samples'

        # Get the result based on a fixed significance level
        res['Result'] = res['Pvalue'] >= .05
        return res
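# --- Added usage sketch (hypothetical): `tester` stands for an instance of
# the surrounding class; norm_res/homo_res/skews mimic what the earlier
# normality and homogeneity steps would produce, and the arrays are invented.
import numpy as np

rng = np.random.default_rng(2)
x = [rng.normal(0.0, 1.0, 30), rng.normal(0.5, 1.0, 30)]
norm_res = {'x0': True, 'x1': True}  # both samples passed normality
res = tester.two_cal(x, norm_res, homo_res={'levene': True}, skews=[0.1, -0.2])
print(res['Test'], res['Pvalue'], res['Result'])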
mean_acc = cat2.loc[cat2[('platform', '')] == 'All',
                    ('{}{}'.format(part, var), 'mean')].values[0]
std_rej = cat1.loc[cat1[('platform', '')] == 'All',
                   ('{}{}'.format(part, var), 'std')].values[0]
std_acc = cat2.loc[cat2[('platform', '')] == 'All',
                   ('{}{}'.format(part, var), 'std')].values[0]
count_rej = cat1.loc[cat1[('platform', '')] == 'All',
                     ('{}{}'.format(part, var), 'count')].values[0]
count_acc = cat2.loc[cat2[('platform', '')] == 'All',
                     ('{}{}'.format(part, var), 'count')].values[0]

grp1 = data.loc[data['accepted'] == 0, '{}{}'.format(part, var)].dropna()
grp2 = data.loc[data['accepted'] == 1, '{}{}'.format(part, var)].dropna()
# Welch's t-test between rejected and accepted groups
ttest = stats.ttest_ind(grp1, grp2, equal_var=False)

# pooled standard deviation and Cohen's d effect size
pooled_std = np.sqrt(
    ((count_rej - 1) * std_rej**2 + (count_acc - 1) * std_acc**2) /
    (count_rej + count_acc))
cohens_d = np.abs((mean_rej - mean_acc) / pooled_std)

results.loc[len(results)] = [
    part, var, ttest[1], mean_acc - mean_rej, mean_rej, std_rej, count_rej,
    mean_acc, std_acc, count_acc, cohens_d, has_edu
]
bar_i += 1

fig, ax = plt.subplots(figsize=(width, height))
nb_bars = len(ind)
acc = np.arange(0, nb_bars, 2)
rej = np.arange(1, nb_bars, 2)
early = df[df['assignment1_submission'] <= '2015-12-31']
late = df[df['assignment1_submission'] > '2015-12-31']
print(early)

# In[ ]:

early.mean()

# In[ ]:

late.mean()

# In[11]:

from scipy import stats
get_ipython().magic('pinfo stats.ttest_ind')

# In[14]:

stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])

# In[ ]:

stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])

# In[ ]:

stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])
if "_" in id: id = id.split("_")[1] id = id[4:] return id correlationDict = {} for i in correlationfh: intA,intB,corr = i.split() correlationDict[intA.lower()+"_"+intB.lower()]=float(corr) inter_corr = [] count = 0 invalid = 0 for i in interologyfh: intA,intB = i.split() count += 1 try: corr = correlationDict[intB.lower() + "_" + intA.lower()] inter_corr.append(corr) except KeyError: try: corr = correlationDict[intA.lower() + "_" + intB.lower()] inter_corr.append(corr) except KeyError: print(intA.lower(),intB.lower()) invalid+=1 print(len(correlationDict.items())) random_corr = random.sample(list(correlationDict.values()),1000) print("total correlation from interologs= {}\ntotalcorrelation from random pairs= {}".format(sum(inter_corr)/count,sum(random_corr)/len(random_corr))) print("p value between two correlations: {}".format(ttest_ind(inter_corr,random_corr))) print("number of invalid keys: {} out of {} = {}".format(invalid,count,invalid/count))
def calculate_ttest(hyperpartisan_valid_predictions, joint_valid_predictions):
    _, p_value = ttest_ind(hyperpartisan_valid_predictions,
                           joint_valid_predictions)
    return p_value
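# --- Added usage sketch: invented per-example validation scores fed to the
# helper above; the module is assumed to import ttest_ind, as the function does.
hyperpartisan_scores = [0.91, 0.88, 0.93, 0.85, 0.90]
joint_scores = [0.84, 0.86, 0.80, 0.82, 0.79]
print(calculate_ttest(hyperpartisan_scores, joint_scores))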
feature_rep_train, feature_rep_test = datasets[0], datasets[1]
fingerprint_rep_train, fingerprint_rep_test = datasets[2], datasets[3]
# image_rep_train, image_rep_test = datasets[4], datasets[5]

######################################################################################
# LogisticRegression with MolecularDescriptors
X = feature_rep_train.iloc[:, 2:].values
Y = feature_rep_train.iloc[:, 1].values

model_scores, null_scores = assess_model(X, Y, [datasets[0], datasets[1]])
print("Null Model: ", np.mean(null_scores), np.std(null_scores))
print("Logistic Model with Molecular Descriptors performance: ",
      np.mean(model_scores), np.std(model_scores))
t_stat, p_value = stats.ttest_ind(model_scores, null_scores)
print(p_value)

X_train, Y_train = X[:750, :], Y[:750]
X_test, Y_test = X[750:, :], Y[750:]

standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

plot_learning_curve(LogisticRegression(),
                    "LogisticRegression_MolecularDescriptors",
                    X_train, Y_train, cv=10)

model = LogisticRegression()
def main():
    reddit_counts = sys.argv[1]
    station_fh = gzip.open(reddit_counts, 'rt', encoding='utf-8')
    stations = pd.read_json(station_fh, lines=True)

    stations = stations[stations.subreddit == "canada"]
    stations = stations[(stations['date'].dt.year == 2012) |
                        (stations['date'].dt.year == 2013)].reset_index()
    del stations["index"]

    # note: dt.dayofweek has Monday=0 ... Sunday=6, so values 5 and 6 are
    # Saturday/Sunday; as written, `weekday` captures those rows and
    # `weekend` the rest
    stations['weekday'] = stations['date'].dt.dayofweek
    weekday = stations[(stations['weekday'] == 5) |
                       (stations['weekday'] == 6)].reset_index()
    weekend = stations[(stations['weekday'] != 5) &
                       (stations['weekday'] != 6)].reset_index()
    del weekday["index"]
    del weekend["index"]

    initial_ttest_p = ttest_ind(weekday['comment_count'],
                                weekend['comment_count'])
    initial_weekday_normality_p = stats.normaltest(weekday['comment_count'])
    initial_weekend_normality_p = stats.normaltest(weekend['comment_count'])
    initial_levene_p = stats.levene(weekday['comment_count'],
                                    weekend['comment_count'])

    # Fix 1: square-root transform
    weekday_sqrt = np.sqrt(weekday["comment_count"])
    weekend_sqrt = np.sqrt(weekend["comment_count"])
    # print(weekday_sqrt)
    transformed_weekday_normality_p = stats.normaltest(weekday_sqrt)
    transformed_weekend_normality_p = stats.normaltest(weekend_sqrt)
    transformed_levene_p = stats.levene(weekday_sqrt, weekend_sqrt)

    # Fix 2: aggregate by ISO week
    # Logic copied from Prof Greg Baker
    def week(dt):
        isocal = dt.isocalendar()
        return '%i-%i' % (isocal[0], isocal[1])

    weekday_number = weekday.date.apply(week)
    weekend_number = weekend.date.apply(week)
    weekday["number"] = weekday_number
    weekend["number"] = weekend_number

    grouped_weekday = weekday.groupby(['number'])
    weekly_weekday = grouped_weekday.aggregate('sum')
    grouped_weekend = weekend.groupby(['number'])
    weekly_weekend = grouped_weekend.aggregate('sum')

    weekly_weekday_normality_p = stats.normaltest(
        weekly_weekday['comment_count'])
    weekly_weekend_normality_p = stats.normaltest(
        weekly_weekend['comment_count'])
    weekly_levene_p = stats.levene(weekly_weekday['comment_count'],
                                   weekly_weekend['comment_count'])
    weekly_ttest_p = ttest_ind(weekly_weekday['comment_count'],
                               weekly_weekend['comment_count'])

    # Fix 3: Mann-Whitney U test
    utest_p = mannwhitneyu(weekday['comment_count'], weekend['comment_count'])

    print(
        OUTPUT_TEMPLATE.format(
            initial_ttest_p=initial_ttest_p.pvalue,
            initial_weekday_normality_p=initial_weekday_normality_p.pvalue,
            initial_weekend_normality_p=initial_weekend_normality_p.pvalue,
            initial_levene_p=initial_levene_p.pvalue,
            transformed_weekday_normality_p=transformed_weekday_normality_p.pvalue,
            transformed_weekend_normality_p=transformed_weekend_normality_p.pvalue,
            transformed_levene_p=transformed_levene_p.pvalue,
            weekly_weekday_normality_p=weekly_weekday_normality_p.pvalue,
            weekly_weekend_normality_p=weekly_weekend_normality_p.pvalue,
            weekly_levene_p=weekly_levene_p.pvalue,
            weekly_ttest_p=weekly_ttest_p.pvalue,
            utest_p=utest_p.pvalue,
        ))
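# --- Added note (sketch): mannwhitneyu's default `alternative` changed across
# SciPy versions, so pinning it makes the u-test above explicit; the toy
# counts are invented.
from scipy.stats import mannwhitneyu

wk = [10, 12, 9, 14, 11, 13]
we = [7, 8, 6, 9, 7, 8]
print(mannwhitneyu(wk, we, alternative='two-sided'))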
import argparse

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats


def main():
    parser = argparse.ArgumentParser(description='read result from csv')
    parser.add_argument('file_name_0', metavar='file_name_0', type=str, help='input file name')
    parser.add_argument('file_name_1', metavar='file_name_1', type=str, help='input file name')
    parser.add_argument('file_name_2', metavar='file_name_2', type=str, help='input file name')
    args = parser.parse_args()

    # One row per trial; columns: 0 = travel distance [m], 1 = travel time [s],
    # 2 = number of collisions, 3 = minimum distance [m]
    result0 = np.loadtxt(args.file_name_0, delimiter=',')  # proposed method ("P. M.")
    result1 = np.loadtxt(args.file_name_1, delimiter=',')
    result2 = np.loadtxt(args.file_name_2, delimiter=',')

    plt.rcParams['font.family'] = 'Times New Roman'
    method_name1 = "S. L. P."
    method_name2 = "CADRL"

    fig = plt.figure()
    ax0 = fig.add_subplot(2, 2, 1)
    ax1 = fig.add_subplot(2, 2, 2)
    ax2 = fig.add_subplot(2, 2, 3)
    ax3 = fig.add_subplot(2, 2, 4)

    # One box plot per metric; whis=[0, 100] draws whiskers over the full data
    # range (equivalent to the old whis="range", removed in matplotlib >= 3.2)
    panels = ((ax0, 0, '(a) Travel distance[m]', 50, 10),
              (ax1, 1, '(b) Travel time[s]', 50, 10),
              (ax2, 2, '(c) Number of collisions', 5, 1),
              (ax3, 3, '(d) Minimum distance[m]', 1.5, 0.5))
    for ax, col, title, ymax, step in panels:
        ax.boxplot((result0[:, col], result1[:, col], result2[:, col]),
                   whis=[0, 100], showmeans=True, meanline=True)
        ax.set_xticklabels(['Our method', method_name1, method_name2])
        ax.set_title(title, y=-0.35, fontsize=10)
        ax.set_ylim(0, ymax)
        ax.set_yticks(np.arange(0, ymax + step, step))
        ax.grid()
    # Mark the collision radius on the minimum-distance panel
    ax3.axhline(0.6, c="r")
    ax3.text(3.6, 0.6, "$R_{col}$", size=10, color="red")

    def report(title, col, unit="", collision_free=True, collision_rate=False):
        """Print summary statistics for one metric, plus Welch t-tests of the
        proposed method against each baseline. When collision_free is set, the
        second "mean" line and the second t-test use collision-free trials only
        (column 2 == 0). collision_rate adds the fraction of trials with at
        least one collision."""
        def summary(name, res):
            print(name + " mean: " + str(res[:, col].mean()) + unit)
            if collision_free:
                print(name + " mean: " + str(res[res[:, 2] == 0, col].mean()) + unit)
            print(name + " median: " + str(np.median(res[:, col])) + unit)
            print(name + " stddev: " + str(res[:, col].std()))
            if collision_rate:
                print(name + " c. rate: " + str(
                    np.where(res[:, col] > 0)[0].shape[0] / float(res[:, col].shape[0])))

        print(title)
        summary("P. M.", result0)
        for name, res in ((method_name1, result1), (method_name2, result2)):
            summary(name, res)
            print(stats.ttest_ind(result0[:, col], res[:, col], equal_var=False))
            if collision_free:
                print(stats.ttest_ind(result0[result0[:, 2] == 0, col],
                                      res[res[:, 2] == 0, col], equal_var=False))

    report("travel distance", 0, "[m]")
    report("travel time", 1, "[s]")
    report("collision count", 2, collision_free=False, collision_rate=True)
    report("min distance", 3, "[m]")

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.show()


if __name__ == '__main__':
    main()
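# Every comparison above uses Welch's unequal-variance t-test. A tiny
# standalone check of the call shape; the numbers are made up.
import numpy as np
from scipy import stats

a = np.array([10.1, 9.8, 10.3, 10.0, 10.2])
b = np.array([12.0, 11.7, 12.2, 11.9, 12.1])
res = stats.ttest_ind(a, b, equal_var=False)  # Welch's t-test
print(res.statistic, res.pvalue)  # large |t|, small p: the means differ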
# f is an open results file; the *pt/*kt/*bt lists hold timings for the three
# methods (p, k, b) on each benchmark: sd = small dense, ms = medium sparse,
# ls = large sparse.
for label, groups in (("small dense", (sdpt, sdkt, sdbt)),
                      ("medium sparse", (mspt, mskt, msbt)),
                      ("large sparse", (lspt, lskt, lsbt))):
    stat, p = f_oneway(*groups)  # one-way ANOVA across the three methods
    f.write("\nanova " + label + ": stat: " + str(stat) + " p: " + str(p))

# Pairwise follow-up: p vs. k on the large sparse benchmark
stat, p = ttest_ind(lspt, lskt)
f.write("\n ttest p and k large sparse: stat: " + str(stat) + " p: " + str(p))

f.close()
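# The same omnibus-then-pairwise pattern on self-contained synthetic timings;
# the data and the 0.05 threshold are made up for the example.
import numpy as np
from scipy.stats import f_oneway, ttest_ind

rng = np.random.default_rng(0)
p_times = rng.normal(1.0, 0.1, 30)   # solver "p"
k_times = rng.normal(1.05, 0.1, 30)  # solver "k"
b_times = rng.normal(1.5, 0.1, 30)   # solver "b"

stat, p = f_oneway(p_times, k_times, b_times)  # omnibus: any difference at all?
print("anova:", stat, p)
if p < 0.05:
    # Only follow up on the pair of interest once the omnibus test fires
    print("p vs k:", ttest_ind(p_times, k_times))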
def t_test2(x1, x2, eq_Var=True):
    # Two-sample t-test; `stats2` aliases scipy.stats and `alpha` is a
    # module-level significance threshold.
    t, p = stats2.ttest_ind(x1, x2, equal_var=eq_Var)
    # If p is small, reject the null hypothesis that the means are equal: the
    # two groups differ, so this attribute can be used to tell them apart.
    # The returned flag is True when the groups are NOT distinguishable.
    return p, p > alpha
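# A quick usage sketch with the aliases spelled out; the alpha value and the
# sample data are chosen purely for illustration.
from scipy import stats as stats2

alpha = 0.05

def t_test2(x1, x2, eq_Var=True):
    t, p = stats2.ttest_ind(x1, x2, equal_var=eq_Var)
    return p, p > alpha

p, indistinguishable = t_test2([1.0, 1.2, 0.9, 1.1], [2.0, 2.1, 1.9, 2.2])
print(p, indistinguishable)  # tiny p, False: the attribute separates the two groups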