from scipy import stats


def main():
    acc_array = {'smo': [], 'bayes': [], 'tree': []}
    output = open("output_all.txt", 'w')
    output.write("classifier\taccuracy\t\tprecision(0)\t\tprecision(4)\t\trecall(0)\t\trecall(4)\n")
    for method in ['smo', 'bayes', 'tree']:
        output.write("\n")
        for i in range(10):
            f = open(method + str(i) + ".txt", 'r')
            lst = f.readlines()
            f.close()
            # Confusion-matrix counts from the last lines of the classifier output
            c00 = float(lst[-3].strip().split()[0])
            c01 = float(lst[-3].strip().split()[1])
            c10 = float(lst[-2].strip().split()[0])
            c11 = float(lst[-2].strip().split()[1])
            accuracy = (c00 + c11) / 1100.0  # 1100 instances per run
            acc_array[method].append(accuracy)
            precision0 = c00 / (c00 + c01)
            precision4 = c11 / (c10 + c11)
            recall0 = c00 / (c00 + c10)
            recall4 = c11 / (c01 + c11)
            output.write(method + "\t" + str(accuracy) + "\t" + str(precision0) + "\t" +
                         str(precision4) + "\t" + str(recall0) + "\t" + str(recall4) + "\n")
    output.close()
    # print(acc_array)
    s1 = stats.ttest_rel(acc_array['smo'], acc_array['bayes'])
    s2 = stats.ttest_rel(acc_array['smo'], acc_array['tree'])
    s3 = stats.ttest_rel(acc_array['tree'], acc_array['bayes'])
    print(s1)
    print(s2)
    print(s3)
from scipy.stats import ttest_rel


def analyze(c, subset, db, popularity):
    query_count = c.shape[0]
    print('distinct query count: %d' % query_count)
    if popularity not in c:
        print('warning: popularity column not found')
        c[popularity] = 1
    s = c[popularity].sum()
    print('query count: %d' % s)
    # popularity-weighted means over all queries
    r1 = c[subset] * c[popularity]
    r2 = c[db] * c[popularity]
    r3 = c['ql'] * c[popularity]
    r4 = c['ml'] * c[popularity]
    r5 = c['best'] * c[popularity]
    r6 = c['rand'] * c[popularity]
    print('set \t sub \t db \t ql \t ml \t best \t rand')
    print('all \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s,
           r4.sum() / s, r5.sum() / s, r6.sum() / s))
    # bad queries only
    b = c['Label'] == 1
    s = c[popularity][b].sum()
    r1 = c[subset][b] * c[popularity][b]
    r2 = c[db][b] * c[popularity][b]
    r3 = c['ql'][b] * c[popularity][b]
    r4 = c['ml'][b] * c[popularity][b]
    r5 = c['best'][b] * c[popularity][b]
    r6 = c['rand'][b] * c[popularity][b]
    print('bad \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s,
           r4.sum() / s, r5.sum() / s, r6.sum() / s))
    # non-bad queries only
    nb = c['Label'] == 0
    s = c[popularity][nb].sum()
    r1 = c[subset][nb] * c[popularity][nb]
    r2 = c[db][nb] * c[popularity][nb]
    r3 = c['ql'][nb] * c[popularity][nb]
    r4 = c['ml'][nb] * c[popularity][nb]
    r5 = c['best'][nb] * c[popularity][nb]
    r6 = c['rand'][nb] * c[popularity][nb]
    print('n_bad \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s,
           r4.sum() / s, r5.sum() / s, r6.sum() / s))
    bad_count = b.value_counts()[True]
    print('%d distinct bad queries (%.2f %%)' % (bad_count, bad_count * 100 / query_count))
    ml_to_cache = c['ml_label'] * c[popularity]
    ql_to_cache = c['ql_label'] * c[popularity]
    best_to_cache = c['best'] * c[popularity]
    s = float(c[popularity].sum())
    print('queries sent to full db by ml: %.2f%%' % (ml_to_cache.sum() / s))
    print('queries sent to full db by ql: %.2f%%' % (ql_to_cache.sum() / s))
    print('queries sent to full db by best: %.2f%%' % (best_to_cache.sum() / s))
    print('queries with mrr > 0 on cache: %.2f%%' % (c[popularity][c[subset] > 0].sum() / c[popularity].sum()))
    print('queries with mrr > 0 on full db: %.2f%%' % (c[popularity][c[db] > 0].sum() / c[popularity].sum()))
    print('ml and rand ' + str(ttest_rel(c['ml'], c['rand'])))
    print('ql and rand ' + str(ttest_rel(c['ql'], c['rand'])))
    print('subset and rand ' + str(ttest_rel(c[subset], c['rand'])))
def plot_cd_data(pre_arr, peri_arr, post_arr):
    # Custom function to draw the p-value bars
    def label_diff(i, j, text, X, Y):
        x = (X[i] + X[j]) / 2  # center of the p-val bar
        y = max(Y[i], Y[j])
        props = {'connectionstyle': 'bar', 'arrowstyle': '-',
                 'shrinkA': 20, 'shrinkB': 20, 'lw': 2}
        ax.annotate(text, xy=(x, y + 0.1), zorder=10)
        ax.annotate('', xy=(X[i], y), xytext=(X[j], y), arrowprops=props)

    ## create a numpy array containing the mean vals for the bar chart
    means = np.array([pre_arr.mean(), peri_arr.mean(), post_arr.mean()])
    ## get the standard error values
    errs = np.array([stats.sem(pre_arr), stats.sem(peri_arr), stats.sem(post_arr)])
    ## calculate the p-values between each of the sets
    p_pre_peri = np.round(stats.ttest_rel(pre_arr, peri_arr)[1], 3)
    p_pre_post = np.round(stats.ttest_rel(pre_arr, post_arr)[1], 3)
    p_peri_post = np.round(stats.ttest_rel(peri_arr, post_arr)[1], 3)
    ## put all the arrays into one big array to plot the individual lines
    all_arr = np.zeros((3, pre_arr.size))
    all_arr[0, :] = pre_arr
    all_arr[1, :] = peri_arr
    all_arr[2, :] = post_arr
    ## formatting stuff
    idx = np.arange(3)  # the x locations for the groups
    width = 0.8
    labels = ('Pre', 'CD', 'Reinstatement')
    # Pull the formatting out here
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': 'none', 'lw': 2, 'ecolor': 'k'}
    X = idx + width / 2  # position of the center of the bars
    fig, ax = plt.subplots()
    ax.p1 = plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)
    ## plot the individual lines on their own axis
    ax2 = ax.twinx()
    ax2.plot(np.linspace(0, 3, 3), all_arr)
    ax2.set_ylabel("Percent correct")
    # Call the function
    label_diff(0, 1, 'p=' + str(p_pre_peri), X, means)
    label_diff(0, 2, 'p=' + str(p_pre_post), X, means)
    label_diff(1, 2, 'p=' + str(p_peri_post), X, means)
    ax.set_ylim(top=means.max() + 0.3)
    plt.xticks(X, labels, color='k')
    plt.title("Performance during contingency degradation")
    ax.set_ylabel("Percent correct")
    plt.show()
def plot_fr_means(arrs1, arrs2, chunk1=(0, 10), chunk2=(35, 45), n=None):
    ## grab the specified chunks (minutes -> ms samples)
    arrs1_early = arrs1[:, chunk1[0]*60*1000:chunk1[1]*60*1000]
    arrs1_late = arrs1[:, chunk2[0]*60*1000:chunk2[1]*60*1000]
    arrs2_early = arrs2[:, chunk1[0]*60*1000:chunk1[1]*60*1000]
    arrs2_late = arrs2[:, chunk2[0]*60*1000:chunk2[1]*60*1000]
    ## calculate the means across all the arrays
    means = np.array([arrs1_early.mean(), arrs2_early.mean(),
                      arrs1_late.mean(), arrs2_late.mean()]) * 1000
    ## get the across-session means
    m_arrs1_early = arrs1_early.mean(axis=1) * 1000
    m_arrs2_early = arrs2_early.mean(axis=1) * 1000
    m_arrs1_late = arrs1_late.mean(axis=1) * 1000
    m_arrs2_late = arrs2_late.mean(axis=1) * 1000
    ## get an array of SEM measurements for the error bars
    errs = np.array([stats.sem(m_arrs1_early, axis=None),
                     stats.sem(m_arrs2_early, axis=None),
                     stats.sem(m_arrs1_late, axis=None),
                     stats.sem(m_arrs2_late, axis=None)])
    ## calculate the t-tests
    p_e1s = stats.ttest_rel(m_arrs1_early, m_arrs1_late)
    p_e2s = stats.ttest_rel(m_arrs2_early, m_arrs2_late)
    p_e12_early = stats.ttest_rel(m_arrs1_early, m_arrs2_early)
    p_e12_late = stats.ttest_rel(m_arrs1_late, m_arrs2_late)
    ## print the t-test results
    print("p_e1s = " + str(p_e1s))
    print("p_e2s = " + str(p_e2s))
    print("p_e12_early = " + str(p_e12_early))
    print("p_e12_late = " + str(p_e12_late))
    ## plot the bar graph
    ## formatting stuff
    idx = np.arange(4)  # the x locations for the groups
    width = 0.8
    labels = ('E1_early', 'E2_early', 'E1_late', 'E2_late')
    # Pull the formatting out here
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': 'none', 'lw': 2, 'ecolor': 'k'}
    X = idx + width / 2  # position of the center of the bars
    fig, ax = plt.subplots()
    ax.p1 = plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)
    ax.set_ylim(top=means.max() + means.max() / 6.0)
    plt.xticks(X, labels, color='k')
    plt.title("Average firing rate within sessions")
    ax.set_ylabel("FR (Hz)")
    if n is not None:
        plt.text(0.2, means.max() + means.max() / 10, "n= " + str(n) + " sessions")
    plt.show()
def write_transforms_to_file(transforms, filename="dummy_transforms.txt", min_pairs=3,
                             p_level=0.05, std_min=0.0, id_assays=True, full_info=False):
    """
    Write selected transformations to file.
    min_pairs : Minimum number of pairs per transformation
    p_level   : Maximum p-value
    std_min   : Minimum standard deviation of differences within pairs
    id_assays : separately output statistics using pairs from identical assays only
    """
    print("Writing significant transformations to file")
    if min_pairs < 2:
        print("At least 2 pairs per transformation are necessary for significance tests.")
        print("min_pairs set to 2")
        min_pairs = 2
    header = "Transformation\tAssay_specific\tp-value\tAverage_Activity_Difference\tSigma_Differences\tnpairs"
    if full_info:
        header = header + "\tLigand_IDs\tlog(Activities[nM])\tAssay_Identity"
    header = header + "\n"
    f = open(filename, "w")
    f.write(header)
    for transf, pairs in transforms.items():
        if len(pairs["ligand_ids"]) < min_pairs:
            continue
        diffs = pairs["deltas"]
        npairs_all = len(diffs)
        # paired t-test of the deltas against zero (i.e. a one-sample test on the differences)
        p_all = stats.ttest_rel(diffs, [0.0 for i in diffs])[1]
        av_all = sum(diffs) / npairs_all
        std_all = stats.tstd(diffs)
        if npairs_all >= min_pairs and p_all <= p_level and std_all >= std_min:
            f.write(transf + "\t" + "mixed_assays" + "\t" + "{:4.2}".format(p_all) + "\t" +
                    "{:4.3}".format(av_all) + "\t" + "{:4.2}".format(std_all) + "\t" + str(npairs_all))
            if full_info:
                for i in range(npairs_all):
                    f.write("\t" + pairs["ligand_ids"][i][0] + ":" + pairs["ligand_ids"][i][1])
                for i in range(npairs_all):
                    f.write("\t" + "{:4.3}".format(pairs["activities1"][i]) + ":" +
                            "{:4.3}".format(pairs["activities2"][i]))
                for i in range(npairs_all):
                    f.write("\t" + str(pairs["assay_identity"][i]))
            f.write("\n")
        if id_assays == False:
            continue
        diffs_id = list(set([pairs["deltas"][i] for i in range(npairs_all)
                             if pairs["assay_identity"][i]]))
        npairs_id = len(diffs_id)
        if npairs_id < min_pairs:
            continue
        p_id = stats.ttest_rel(diffs_id, [0.0 for i in diffs_id])[1]
        av_id = sum(diffs_id) / npairs_id
        std_id = stats.tstd(diffs_id)
        if npairs_id >= min_pairs and p_id <= p_level and std_id >= std_min:
            f.write(transf + "\t" + "ident_assays" + "\t" + "{:4.2}".format(p_id) + "\t" +
                    "{:4.3}".format(av_id) + "\t" + "{:4.2}".format(std_id) + "\t" + str(npairs_id))
            if full_info:
                for i in range(npairs_all):
                    if pairs["assay_identity"][i] == True:
                        f.write("\t" + pairs["ligand_ids"][i][0] + ":" + pairs["ligand_ids"][i][1])
                for i in range(npairs_all):
                    if pairs["assay_identity"][i] == True:
                        f.write("\t" + "{:4.3}".format(pairs["activities1"][i]) + ":" +
                                "{:4.2}".format(pairs["activities2"][i]))
                for i in range(npairs_all):
                    if pairs["assay_identity"][i] == True:
                        f.write("\t" + str(pairs["assay_identity"][i]))
            f.write("\n")
    f.close()
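# Hedged usage sketch for write_transforms_to_file (not from the original
# source): `example_transforms` is a made-up dict in the shape the function
# iterates over; the transformation string and all values are illustrative only.
example_transforms = {
    "[*:1]F>>[*:1]Cl": {
        "ligand_ids": [("L1", "L2"), ("L3", "L4"), ("L5", "L6")],
        "deltas": [0.30, 0.25, 0.41],
        "activities1": [6.10, 5.80, 7.00],
        "activities2": [6.40, 6.05, 7.41],
        "assay_identity": [True, True, False],
    }
}
write_transforms_to_file(example_transforms, filename="transforms_sig.txt",
                         min_pairs=3, p_level=0.05, full_info=True)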
def decoder_perf_stats(data_dir='/auto/tdrive/mschachter/data'):
    df = pd.read_csv(os.path.join(data_dir, 'aggregate', 'decoder_perfs_for_glm.csv'))
    decomps = ['full_psds', 'spike_rate', 'spike_rate+spike_sync']
    aprops = ['maxAmp', 'meanspect', 'q2', 'q3', 'skewspect', 'q1', 'entropytime',
              'entropyspect', 'skewtime', 'sal', 'maxfund', 'cvfund', 'minfund',
              'stdspect', 'fund', 'kurtosisspect', 'kurtosistime', 'voice2percent', 'fund2']
    r2_vals_by_aprop = dict()
    for aprop in aprops:
        r2_vals = dict()
        for decomp in decomps:
            r2_vals[decomp] = list()
        i = (df.aprop == aprop) & (df.r2 > 0) & ~np.isnan(df.r2)
        g = df[i].groupby(['bird', 'block', 'segment', 'hemi'])
        for (bird, block, segment, hemi), gdf in g:
            if len(gdf) != len(decomps):
                print("Missing data for aprop=%s, (%s,%s,%s,%s), len(gdf)=%d" %
                      (aprop, bird, block, segment, hemi, len(gdf)))
                continue
            for decomp in decomps:
                ii = gdf.decomp == decomp
                assert ii.sum() == 1
                r2_vals[decomp].append(gdf[ii].r2.values[0])
        r2_vals_by_aprop[aprop] = r2_vals

    for aprop in aprops:
        r2_vals = r2_vals_by_aprop[aprop]
        lfp_r2 = np.array(r2_vals['full_psds'])
        spike_r2 = np.array(r2_vals['spike_rate'])
        sync_r2 = np.array(r2_vals['spike_rate+spike_sync'])
        lfp_vs_spike_t, lfp_vs_spike_p = ttest_rel(lfp_r2, spike_r2)
        lfp_vs_sync_t, lfp_vs_sync_p = ttest_rel(lfp_r2, sync_r2)
        spike_vs_sync_t, spike_vs_sync_p = ttest_rel(spike_r2, sync_r2)
        print('----------- %s ------------' % aprop)
        print('N=%d' % len(lfp_r2))
        print('lfp_r2 = %0.2f +/- %0.2f' % (lfp_r2.mean(), lfp_r2.std(ddof=1)))
        print('spike_r2 = %0.2f +/- %0.2f' % (spike_r2.mean(), spike_r2.std(ddof=1)))
        print('sync_r2 = %0.2f +/- %0.2f' % (sync_r2.mean(), sync_r2.std(ddof=1)))
        print('LFP vs Spike: t=%0.6f, p=%0.6f' % (lfp_vs_spike_t, lfp_vs_spike_p))
        print('LFP vs Spike+Sync: t=%0.6f, p=%0.6f' % (lfp_vs_sync_t, lfp_vs_sync_p))
        print('Spike vs Spike+Sync: t=%0.6f, p=%0.6f' % (spike_vs_sync_t, spike_vs_sync_p))
def directional(M, window=None, circ=False, extrapolate=True):
    """From a symmetric matrix M of size n, return a vector d whose component
    d[i] is a t-test of the two samples of size `window` lying on either side
    of the i-th pixel on the diagonal. Edge elements may be extrapolated based
    on the reduced vector size, except in the case of circular genomes. If
    they aren't, d will be of size n - 2*window instead of n.
    """
    # Sanity checks
    if not type(M) is np.ndarray:
        M = np.array(M)
    if M.shape[0] != M.shape[1]:
        raise ValueError("Matrix is not square.")
    try:
        n = min(M.shape)
    except AttributeError:
        n = M.size
    # Default window argument
    if window is None:
        window = max(n // 100, 5)
    if window >= n:
        raise ValueError("Please choose a smaller window size.")
    try:
        from scipy.stats import ttest_rel
    except ImportError as e:
        print("I couldn't import scipy's stats module, which is needed to compute the directionality index.")
        print(str(e))
        raise
    # Edge elements: wrap around for circular genomes, extrapolate otherwise
    if circ:
        d = [ttest_rel(np.array(list(M[i, i - window:]) + list(M[i, :i])),
                       M[i, i:i + window])[0]
             for i in range(window)]
    elif extrapolate:
        d = [ttest_rel(M[i, 0:i], M[i, i:2 * i])[0] for i in range(window)]
    else:
        d = []
    # Central elements: full windows on both sides
    d += [ttest_rel(M[i, i - window:i], M[i, i:i + window])[0]
          for i in range(window, n - window)]
    if circ:
        d += [ttest_rel(M[i, i - window:i], M[i, i:i + window])[0]
              for i in range(window)]
    elif extrapolate:
        d += [ttest_rel(M[i, i - window:i],
                        np.array(list(M[i, i:]) + list(M[i, :window - (n - i)])))[0]
              for i in range(n - window, n)]
    return d
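# Minimal usage sketch for directional() under assumed inputs: a small random
# symmetric matrix stands in for a real contact map, and numpy is assumed to be
# imported as np at module level. With extrapolate=True the returned vector has
# one entry per bin (the first edge entries may be nan).
import numpy as np

M = np.random.rand(50, 50)
M = (M + M.T) / 2          # symmetrize, as directional() expects
d = directional(M, window=5)
print(len(d))              # 50, i.e. one directionality value per bin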
def output_stats(self):
    '''
    Compute and store all statistics in a csv file.
    Order: first the adaptive measure, then Resnik, Lin, Jiang, and simGIC
    (first all of them wrt the biological process sub-ontology, then all of
    them wrt molecular function, finally all of them wrt cellular component).
    '''
    stats = np.zeros((13, 8))
    pvals = np.zeros(8)
    c = 0
    all_learners = [self.mytr] + sum([[self.resniktr[root], self.lintr[root],
                                       self.jiangtr[root], self.simgictr[root]]
                                      for root in ['BIO', 'MOL', 'CEL']], [])
    # indices of the best baseline learner per metric
    # (index 0 is the adaptive measure itself, so baselines start at 1)
    best_prec = 1
    best_rec = 1
    best_f1 = 1
    best_area = 1
    for h in all_learners:
        stats[c, 0] = np.mean(h.precisions)
        stats[c, 1] = np.std(h.precisions)
        stats[c, 2] = np.mean(h.recalls)
        stats[c, 3] = np.std(h.recalls)
        stats[c, 4] = np.mean(h.f1s)
        stats[c, 5] = np.std(h.f1s)
        stats[c, 6] = np.mean(h.areas)
        stats[c, 7] = np.std(h.areas)
        if c > 0:
            if stats[c, 0] > stats[best_prec, 0]:
                best_prec = c
            if stats[c, 2] > stats[best_rec, 2]:
                best_rec = c
            if stats[c, 4] > stats[best_f1, 4]:
                best_f1 = c
            if stats[c, 6] > stats[best_area, 6]:
                best_area = c
        c += 1
    np.savetxt(statfiles[self.species], stats, delimiter='\t',
               header='precision \t std \t recall \t std \t F1 \t std \t ROC area \t std')
    # each ttest_rel call returns (t-statistic, p-value): adaptive measure vs best baseline
    pvals[0], pvals[1] = ttest_rel(self.mytr.precisions, all_learners[best_prec].precisions)
    pvals[2], pvals[3] = ttest_rel(self.mytr.recalls, all_learners[best_rec].recalls)
    pvals[4], pvals[5] = ttest_rel(self.mytr.f1s, all_learners[best_f1].f1s)
    pvals[6], pvals[7] = ttest_rel(self.mytr.areas, all_learners[best_area].areas)
    np.savetxt(pvalfiles[self.species], np.expand_dims(pvals, 0), delimiter='\t',
               header='precision \t p-value \t recall \t p-value \t F1 \t p-value \t ROC area \t p-value')
    return stats
def run_statistics(voxel_name):
    # Load dataframes
    controls_a = pd.read_csv(os.path.join(statistics_dir, '%s_controls_a.csv' % voxel_name), index_col=0)
    controls_b = pd.read_csv(os.path.join(statistics_dir, '%s_controls_b.csv' % voxel_name), index_col=0)
    # Run a paired t-test on QUALITY PARAMETERS
    ttpaired_fwhm = stats.ttest_rel(controls_a['Linewidth'], controls_b['Linewidth'])
    ttpaired_snr = stats.ttest_rel(controls_a['SNR'], controls_b['SNR'])
    print("FWHM: T-statistic is %.3f and the p-value is %.3f." % ttpaired_fwhm)
    print("SNR: T-statistic is %.3f and the p-value is %.3f." % ttpaired_snr)
    return controls_a, controls_b
import json

from scipy import stats


def main():
    file1 = 'data/output1.json'
    file2 = 'data/output2.json'
    file05 = 'data/output05.json'
    control_fees = []
    exp2_fees = []
    exp05_fees = []
    with open(file1) as f:
        contents = f.read()
        data = json.loads(contents)
        blocks = data['blocks']
        txs = [block['transactions'] for block in blocks]
        control_fees = [float(tx[0]['fee']) for tx in txs]
    with open(file2) as f:
        contents = f.read()
        data = json.loads(contents)
        blocks = data['blocks']
        txs = [block['transactions'] for block in blocks]
        exp2_fees = [float(tx[0]['fee']) for tx in txs]
    with open(file05) as f:
        contents = f.read()
        data = json.loads(contents)
        blocks = data['blocks']
        txs = [block['transactions'] for block in blocks]
        exp05_fees = [float(tx[0]['fee']) for tx in txs]
    t, p = stats.ttest_rel(control_fees, exp2_fees)
    print('--------------------------------------------')
    print('Comparing the control group to the 2x Group |')
    print('T Stat: {}'.format(t))
    print('P value: {}'.format(p))
    print('--------------------------------------------')
    t, p = stats.ttest_rel(control_fees, exp05_fees)
    print('--------------------------------------------')
    print('Comparing the control group to the 0.5x Group |')
    print('T Stat: {}'.format(t))
    print('P value: {}'.format(p))
    print('--------------------------------------------')
    control_mean = sum(control_fees) / len(control_fees)
    exp2_mean = sum(exp2_fees) / len(exp2_fees)
    print('Control mean: {}'.format(control_mean))
    print('Exp2 Mean: {}'.format(exp2_mean))
def stats_test(self, agg, test='ttest'):
    d = agg.shape[0]
    if test == 'ttest':
        # 2-tail t-test over all condition pairs
        n_pairs = agg.shape[1] * (agg.shape[1] - 1) // 2
        ttest = (np.zeros((n_pairs, agg.shape[2])),
                 np.zeros((n_pairs, agg.shape[2])))
        ii = 0
        for c1 in range(agg.shape[1]):
            for c2 in range(c1 + 1, agg.shape[1]):
                thisTtest = stats.ttest_rel(agg[:, c1, :], agg[:, c2, :], axis=0)
                ttest[0][ii, :] = thisTtest[0]
                ttest[1][ii, :] = thisTtest[1]
                ii += 1
        ttestPrint(title='**** 2-tail T-Test of related samples ****',
                   values=ttest, plotOpt=plotOpt, type=2)
    elif test == 'ttest_1samp':
        # One-sample t-test against chance level m
        m = .5
        oneSample = stats.ttest_1samp(agg, m, axis=0)
        ttestPrint(title='**** One-sample t-test: difference from %.2f ****' % m,
                   values=oneSample, plotOpt=plotOpt, type=1)
    elif test == 'binomial':
        # Binomial test
        binom = np.apply_along_axis(stats.binom_test, 0, agg)
        print(binom)
        return binom
def plotTrials(data, fish, CSname, key, step, offset=0, pp=None):
    fig = figure(figsize=(12, 8), facecolor='w')
    ax1 = fig.add_subplot(121)  # raw trace
    ax2 = fig.add_subplot(222)  # learning curve
    ax3 = fig.add_subplot(224)  # bar plot
    preP, postP, postP2 = [], [], []
    longestUS = 0
    for n, measurement in enumerate(data[fish][CSname]):
        tr = n + 1
        CS, USs, preRange = measurement['events']
        subplot(ax1)
        mi = -step * (tr - 1)
        ma = mi + step
        drawLines(mi, ma, (preRange, [preRange + (USs[0] - CS)], preRange))
        longestUS = max([us - CS + preRange * 3 // 2 for us in USs] + [longestUS])
        # measurement[key]: vector around the CS timing +/- preRange, i.e. preRange is the center
        ax1.plot(measurement[key] - step * (tr - 1) + offset)
        title(CSname + ': ' + key)
        # cf. preRange = 3600 frames
        pre = measurement[key][:preRange].mean() + offset  # 2 min window
        post = measurement[key][preRange:preRange + (USs[0] - CS)].mean() + offset  # 23 s window
        post2 = measurement[key][preRange + (USs[0] - CS):preRange * 3 // 2 + (USs[0] - CS)].mean() + offset  # 1 min window after US
        preP.append(pre)
        postP.append(post)
        postP2.append(post2)
        ax3.plot([1, 2, 3], [pre, post, post2], 'o-')
    ax1.set_xlim([0, longestUS])
    ax1.axis('off')
    subplot(ax2)
    x = range(1, tr + 1)
    y = np.diff((preP, postP), axis=0).ravel()
    ax2.plot(x, y, 'ko-', linewidth=2)
    ax2.plot(x, np.zeros_like(x), '-.', linewidth=1, color='gray')
    # grid()
    slope, intercept, rvalue, pval, stderr = stats.linregress(x, y)
    title('slope = zero? p-value = %f' % pval)
    ax2.set_xlabel("Trial#")
    ax2.set_xlim([0.5, tr + 0.5])
    ax2.set_ylabel('CS - pre')
    subplot(ax3)
    ax3.bar([0.6, 1.6, 2.6],
            [np.nanmean(preP), np.nanmean(postP), np.nanmean(postP2)],
            facecolor='none')
    t, pval = stats.ttest_rel(postP, preP)
    title('paired t p-value = %f' % pval)
    ax3.set_xticks([1, 2, 3])
    ax3.set_xticklabels(['pre', CSname, measurement['matchedUSname']])
    ax3.set_xlim([0.5, 3.5])
    ax3.set_ylabel('Raw mean values')
    tight_layout(pad=2, h_pad=1, w_pad=1)
    if pp:
        fig.savefig(pp, format='pdf')
    close('all')
    return np.vstack((preP, postP, postP2))
def compare_normalised(model_score_path, hemisphere, weight, lateral, atlas, y_file, list_compare):
    jl_file_name = 'r2score_{h}_{atlas}_{lateral}_connmat_{y}_weighted{w}.jl'.format(
        h=hemisphere, atlas=atlas, lateral=lateral, y=y_file, w=weight)
    print(jl_file_name)
    jl_path = op.join(model_score_path, jl_file_name)
    if op.isfile(jl_path):
        jl = joblib.load(jl_path)
        print("%s vs %s " % (list_compare[0], list_compare[1]))
        a = jl[list_compare[0]]
        b = jl[list_compare[1]]
        a = a.astype(float)
        b = b.astype(float)
        if 'AHS22' in a.index:
            a = a.drop('AHS22')
            b = b.drop('AHS22')
        t, p = stats.ttest_rel(a, b)
        result = cs.get_difference_ttest(list_compare[0], list_compare[1], t, float(p))
        a_score = cs.convert_to_dataframe(a, hemisphere, lateral, atlas, list_compare[0], y_file, weight)
        b_score = cs.convert_to_dataframe(b, hemisphere, lateral, atlas, list_compare[1], y_file, weight)
        all_score = pd.concat([a_score, b_score])
        return all_score, {'p': p, 't': t, 'result': result}
    else:
        print('%s does not exist' % jl_path)
def ttest(list1, list2):
    a1 = np.array(list1)
    a2 = np.array(list2)
    diff = a1 - a2
    t, prob = stats.ttest_rel(a1, a2)
    print(np.mean(diff), np.std(diff), t, prob)
    return np.mean(diff), np.std(diff), t, prob
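# Hedged example of the wrapper above on two made-up paired score lists
# (numpy and scipy.stats are assumed to be imported at module level):
# it prints and returns (mean difference, sd of difference, t, p).
scores_a = [0.71, 0.68, 0.74, 0.70, 0.66]
scores_b = [0.69, 0.66, 0.75, 0.67, 0.64]
mean_diff, sd_diff, t, p = ttest(scores_a, scores_b)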
def plot_modulation_depth(arr_early, arr_late, sigma):
    arr_early = ss.zscored_fr(arr_early, sigma).max(axis=0)
    arr_early = np.nan_to_num(arr_early)
    arr_late = ss.zscored_fr(arr_late, sigma).max(axis=0)
    arr_late = np.nan_to_num(arr_late)
    # ttest_rel needs equal-length samples, so randomly subsample the larger set
    if arr_early.size > arr_late.size:
        arr_early = np.random.choice(arr_early, size=arr_late.size, replace=False)
    if arr_late.size > arr_early.size:
        arr_late = np.random.choice(arr_late, size=arr_early.size, replace=False)
    early_sem = stats.sem(arr_early)
    early_mean = arr_early.mean()
    late_sem = stats.sem(arr_late)
    late_mean = arr_late.mean()
    p_val = stats.ttest_rel(arr_early, arr_late)
    print("p val is = " + str(p_val))
    # Pull the formatting out here
    width = 0.8
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': 'none', 'lw': 2, 'ecolor': 'k'}
    means = np.array([early_mean, late_mean])
    errs = np.array([early_sem, late_sem])
    idx = np.arange(2)
    X = idx + width / 2
    labels = ['E1 early', 'E1 late']
    plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    plt.errorbar(X, means, yerr=errs, **err_kwargs)
    plt.xticks(X, labels)
    plt.ylabel('z-scored modulation depth')
    plt.title('Change in modulation depth from early in session to late in session')
    plt.show()
def eeg_twosample_ttest(array1, array2):
    """
    Paired t-test comparing the values of two EEG data-sets

    Args:
        array1 : EEG data of multiple subjects (nsub x ntpts)
        array2 : EEG data of the same subjects (nsub x ntpts), from a different condition
    Returns:
        t : t-values, one for each of the timepoints
        p : p-values, also one for each of the timepoints
    Dependence:
        scipy.stats.ttest_rel
    """
    from scipy.stats import ttest_rel
    s1 = array1.shape
    p = np.zeros(s1[1])
    t = np.zeros(s1[1])
    for i in range(s1[1]):
        tval, pval = ttest_rel(array1[:, i], array2[:, i])
        p[i] = pval
        t[i] = tval
    return t, p
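# Hedged usage sketch for eeg_twosample_ttest with synthetic data: 12 subjects
# by 200 time-points per condition (made-up sizes), so t and p each come back
# with 200 entries. numpy is assumed to be imported as np at module level.
import numpy as np

cond1 = np.random.randn(12, 200)
cond2 = cond1 + 0.1 * np.random.randn(12, 200)   # same subjects, perturbed condition
t, p = eeg_twosample_ttest(cond1, cond2)
print(t.shape, p.shape)   # (200,) (200,)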
def compute_paired_ttest(best_test_score):
    """Paired t-tests of per-organism method performance for all method pairs."""
    df_col_name = ['ind_vs_union', 'ind_vs_mtl', 'ind_vs_mtmkl',
                   'union_vs_mtl', 'union_vs_mtmkl', 'mtl_vs_mtmkl']
    pairs_test = [('individual', 'union'), ('individual', 'mtl'),
                  ('individual', 'mtmkl'), ('union', 'mtl'),
                  ('union', 'mtmkl'), ('mtl', 'mtmkl')]
    org_names = list(best_test_score.keys())
    ttest_p_val = numpy.zeros((len(org_names), len(pairs_test)))
    #ttest_p_val = numpy.zeros((len(pairs_test), len(org_names)))
    for org_idx, org_code in enumerate(org_names):
        meth_perf = best_test_score[org_code]
        for pair_idx, rel_pair in enumerate(pairs_test):
            t_stats, p_val = stats.ttest_rel(meth_perf[rel_pair[0]], meth_perf[rel_pair[1]])
            ttest_p_val[org_idx, pair_idx] = p_val
            #ttest_p_val[pair_idx, org_idx] = p_val
    df_pval = pandas.DataFrame(ttest_p_val, columns=df_col_name, index=org_names)
    #df_pval = pandas.DataFrame(ttest_p_val, columns=org_names, index=pairs_test)
    return df_pval
def TTestPaired(data1, data2):
    # Propagate masks so both samples mask the same entries
    for i in range(len(data1)):
        if data1[i] is ma.masked:
            data2[i] = ma.masked
        elif data2[i] is ma.masked:
            data1[i] = ma.masked
    c1 = Count(data1)
    c2 = Count(data2)
    if c1 != c2:
        df = 0
        t = 1.0
        prob = -1.0
        d = 0.0
    else:
        cov = 0.0
        df = c1 - 1
        cov = Sum((data1 - Mean(data1)) * (data2 - Mean(data2))) / df
        sd = math.sqrt((SampVar(data1) + SampVar(data2) - 2.0 * cov) / float(c1))
        diff = data1 - data2
        try:
            t, prob = stats.ttest_rel(data1, data2)
            d = Mean(diff) / SampStdDev(diff)  # Cohen's d for paired samples
        except ZeroDivisionError:
            t = 0.0
            prob = 1.0
            d = 0.0
    result = {}
    result['t'] = t
    result['df'] = df
    result['prob'] = prob
    result['d'] = d
    result['quote'] = "<b>Quote: </b> <i>t</i> (%d) = %.3f, <i>p</i> = %1.4f, d = %.3f<br />"
    result['quotetxt'] = "Quote: t (%d) = %.3f, p = %1.4f, d = %.3f\n"
    return result
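# Hedged usage sketch for TTestPaired: the helpers it relies on (Count, Mean,
# Sum, SampVar, SampStdDev) are assumed to come from the surrounding module,
# and the masked arrays below are made up for illustration.
import numpy.ma as ma

pre = ma.array([5.0, 6.1, 5.8, 7.0, 6.4])
post = ma.array([5.5, 6.4, 6.2, 7.3, 6.6])
res = TTestPaired(pre, post)
print(res['quotetxt'] % (res['df'], res['t'], res['prob'], res['d']))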
def compare_omission(mt_para_corpus, si_para_corpus, lang):
    tag_weights, tok_weights = get_omission_weights(mt_para_corpus, si_para_corpus, lang)
    # Only keep sentence pairs that are well aligned in both corpora
    mask = []
    for mt_sent_pair, si_sent_pair in zip(mt_para_corpus.sent_pairs, si_para_corpus.sent_pairs):
        if mt_sent_pair.good_alignment and si_sent_pair.good_alignment:
            mask.append(True)
        else:
            mask.append(False)
    mt_omit, mt_omit_detail, mt_omit_tok = count_omission(mask, mt_para_corpus, tag_weights, tok_weights, lang)
    si_omit, si_omit_detail, si_omit_tok = count_omission(mask, si_para_corpus, tag_weights, tok_weights, lang)
    top_k = 10
    print('MT tag omissions:')
    print('\n'.join(['%s\t%f' % (x[0], x[1]) for x in mt_omit if tag_weights[x[0]] > 0]))
    print('MT tok omissions:')
    print('\n'.join(['%s\t%f' % (x[0], x[1]) for x in mt_omit_tok[:top_k] if tok_weights[x[0]] > 0]))
    print('SI tag omissions:')
    print('\n'.join(['%s\t%f' % (x[0], x[1]) for x in si_omit if tag_weights[x[0]] > 0]))
    print('SI tok omissions:')
    print('\n'.join(['%s\t%f' % (x[0], x[1]) for x in si_omit_tok[:top_k] if tok_weights[x[0]] > 0]))
    print('Sentence omission stats:')
    for tag in tag_weights:
        if tag_weights[tag] > 0:
            mt_mean = sum(mt_omit_detail[tag])
            si_mean = sum(si_omit_detail[tag])
            t, prob = stats.ttest_rel(mt_omit_detail[tag], si_omit_detail[tag])
            if prob < 0.05:
                print('%s\t%f\t%f\t%f\t%f' % (tag, mt_mean, si_mean, t, prob))
def compare_poincare_baker():
    """
    Poincare's bread measurements: a year of loaves drawn from a fitted normal
    distribution with mean 950 g and standard deviation 50 g, versus a baker
    who hands over the heaviest of four such loaves each day.
    """
    poincare_sample = [random.normalvariate(950, 50) for i in range(366)]
    poincare_sample_cdf = MakeCdfFromList(poincare_sample, 'poincare')
    baker_sample = []
    for i in range(366):
        baker_sample.append(max(random.normalvariate(950, 50) for i in range(4)))
    baker_cdf = MakeCdfFromList(baker_sample, 'baker')
    print(poincare_sample_cdf.Mean(), baker_cdf.Mean())
    myplot.Clf()
    myplot.Cdfs([poincare_sample_cdf, baker_cdf])
    pyplot.xlim(600, 1120)
    pyplot.legend(loc=0)
    myplot.SaveFormat('../resources/plots/poincare_vs_baker', 'png')
    # t-test
    t_test = stats.ttest_rel(poincare_sample, baker_sample)
    print("t-test statistic is %s with a p-value %s" % t_test)
def mushconditions2compare(roi, conditiondict, data, dataindex, subjindex, roiindex,
                           condindex, subjlist):
    subjlist = [s for s in subjlist if s in set([d[subjindex] for d in data])]
    mushes = []
    means = []
    sems = []
    condlabels = []
    f, ax = plt.subplots(figsize=[3, 6])
    for mushedcondn, mushedcond in enumerate(conditiondict.keys()):
        condlabels.append(mushedcond)
        conds = conditiondict[mushedcond]['conds']
        color = conditiondict[mushedcond]['color']
        subjmushes = []
        for subj in subjlist:
            subjbetas = np.array([d[dataindex] for d in data
                                  if d[condindex] in conds and d[subjindex] == subj])
            subjbetas = subjbetas.astype('float64')
            subjmushes.append(np.mean(subjbetas))
        mushes.append(subjmushes)
        condstd = np.std(subjmushes, ddof=1)
        condsem = condstd / np.sqrt(len(subjmushes))
        sems.append(condsem)
        plotmeans = [0, 0]
        plotsems = [0, 0]
        plotmeans[mushedcondn] = np.mean(subjmushes)
        plotsems[mushedcondn] = condsem
        ax.bar(range(len(plotmeans)), plotmeans, yerr=plotsems, color=color,
               error_kw={'ecolor': color})
    ax.set_title(data[0][roiindex])
    ax.set_xlim([0, len(plotmeans)])
    ax.set_xticks(np.arange(len(plotmeans)) + .5)
    ax.set_xticklabels(condlabels, rotation=90)
    array1 = mushes[0]
    array2 = mushes[1]
    df = len(array1) - 1
    t, p = sst.ttest_rel(array1, array2)
    print(roi + ' :' + condlabels[0] + '-' + condlabels[1] +
          ': t(%.0f)=%.3f, p=%.3f.' % (df, t, p))
def accept(self):
    self.no_exeption = False
    self.con = float(self.con_edit.text())
    first_sample, counts = self.dataset.GetNumericValues(self.currentGroup)
    second_sample, counts2 = self.dataset.GetNumericValues(self.currentVar)
    if first_sample == second_sample:
        self.no_exeption = True
        QtGui.QMessageBox.warning(self, u'Warning',
                                  u'Please select different samples for the test!',
                                  QtGui.QMessageBox.Cancel,
                                  QtGui.QMessageBox.NoButton,
                                  QtGui.QMessageBox.NoButton)
    else:
        self.t_score, self.pvalue = stats.ttest_rel(first_sample, second_sample)
        if len(first_sample) < len(second_sample):
            self.df = len(first_sample) - 1
        else:
            self.df = len(second_sample) - 1
        mean1 = sum(first_sample) / len(first_sample)
        mean2 = sum(second_sample) / len(second_sample)
        self.means = [mean1, mean2]
        # one-sided alternatives halve the two-sided p-value
        if self.radio_noteq.isChecked():
            pass
        elif self.radio_greater.isChecked():
            self.pvalue /= 2
        elif self.radio_less.isChecked():
            self.pvalue /= 2
        self.P_obs = stats.t.ppf(1 - self.con, self.df)
def directional(A, nw):
    n1 = A.shape[0]
    print("Size of the matrix entered for the directional index:")
    print(n1)
    signal1 = np.zeros((n1, 1))
    for i in range(0, n1):
        vect_left = []
        vect_right = []
        # window of nw log-contacts to the left of i (wrapping around the genome)
        for k in range(i - 1, i - nw - 1, -1):
            kp = k
            if k < 0:
                kp = n1 + k
            if A[i, kp] > 0:
                vect_left.append(math.log(A[i, kp]))
            else:
                vect_left.append(0)
        # window of nw log-contacts to the right of i (wrapping around the genome)
        for k in range(i + 1, i + nw + 1):
            kp = k
            if k >= n1:
                kp = k - n1
            if A[i, kp] > 0:
                vect_right.append(math.log(A[i, kp]))
            else:
                vect_right.append(0)
        if sum(vect_left) != 0 and sum(vect_right) != 0:
            signal1[i] = stats.ttest_rel(vect_right, vect_left)[0]
        else:
            signal1[i] = 0
    return signal1
import numpy as np
from scipy.stats import ttest_rel


def ttest(filename1, filename2):
    qids1, values1 = load_evaluation_file(filename1)
    qids2, values2 = load_evaluation_file(filename2)
    if qids1.shape[0] != qids2.shape[0]:
        raise ValueError("number of queries in files do not match (%d != %d)"
                         % (qids1.shape[0], qids2.shape[0]))
    # Sort both files by query id so the per-query values are paired
    qids1_sort_idxs = np.argsort(qids1)
    qids2_sort_idxs = np.argsort(qids2)
    qids1 = qids1[qids1_sort_idxs]
    qids2 = qids2[qids2_sort_idxs]
    if np.any(qids1 != qids2):
        raise ValueError("files do not contain the same queries")
    values1 = values1[qids1_sort_idxs]
    values2 = values2[qids2_sort_idxs]
    mean1 = np.mean(values1)
    mean2 = np.mean(values2)
    t_statistic, p_value = ttest_rel(values1, values2)
    return values1.shape[0], mean1, mean2, t_statistic, p_value
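# Hedged usage sketch: the file names below are hypothetical per-query
# evaluation outputs in whatever format load_evaluation_file expects.
n_queries, mean_a, mean_b, t, p = ttest("run_A.eval", "run_B.eval")
print("n=%d, %.4f vs %.4f, t=%.3f, p=%.4f" % (n_queries, mean_a, mean_b, t, p))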
def significance(self, fun, other, test="wilcoxon"):
    """Computes statistical significance of the difference between two sets
    of scores; test can be paired wilcoxon, mannwhitney for independent
    samples, or paired ttest.
    """
    scores1 = self.map_doc_scores(fun)
    scores2 = other.map_doc_scores(fun)
    if isinstance(scores1[0], float) or isinstance(scores1[0], int):
        pass
    else:
        # TODO: this is suspicious
        scores1 = [x for x, _ in scores1]
        scores2 = [x for x, _ in scores2]
    # differences = [(x, y) for (x, y) in zip(scores1, scores2) if x != y]
    # print(difference, file=sys.stderr)
    # print(d2, file=sys.stderr)
    # print([x for (i, x) in enumerate(d1) if x != d2[i]], file=sys.stderr)
    assert len(scores1) == len(scores2)
    results = {}
    if test == "wilcoxon" or test == "all":
        results["wilcoxon"] = wilcoxon(scores1, scores2)[1]
    if test == "ttest" or test == "all":
        results["paired ttest"] = ttest_rel(scores1, scores2)[1]
    if test == "mannwhitney" or test == "all":
        results["mannwhitney"] = mannwhitneyu(scores1, scores2)[1]
    return results
def compute(growth_npz, debug):
    from dataloader import DataLoader
    loader = DataLoader(growth_npz)
    mat = loader.matrix
    # we capture 1 extra scan of information, so ignore it
    tvec = mat[:-1, 0]
    residual_vec = mat[:-1, 5]
    dt = (tvec[2] - tvec[0]) / 2
    US_exp_window = 10 * 60 // dt
    n = int((residual_vec.size - US_exp_window) // 2)
    residual_vec_pre_US = residual_vec[:n]
    residual_vec_post_US = residual_vec[-n:]
    u0 = np.mean(residual_vec_pre_US)
    sd0 = np.std(residual_vec_pre_US)
    u1 = np.mean(residual_vec_post_US)
    sd1 = np.std(residual_vec_post_US)
    print(growth_npz)
    print('\tMean residual up to', (tvec[n - 1] - tvec[0]) / 60, 'min')
    print(bcolors.OKGREEN + '\t\t%.2f' % (u0), '\tSD %.2f' % (sd0), bcolors.ENDC)
    print('\tMean residual from', (tvec[-n] - tvec[0]) / 60, 'min')
    print(bcolors.OKGREEN + '\t\t%.2f' % (u1), '\tSD %.2f' % (sd1), bcolors.ENDC)
    print('\tu_1 - u_0')
    # http://stattrek.com/sampling/difference-in-means.aspx?tutorial=ap
    # http://onlinestatbook.com/2/sampling_distributions/samplingdist_diff_means.html
    # Note that the result is the standard error of the mean, which is the
    # standard deviation of the difference in means
    sd_diff = (sd0**2 / residual_vec_pre_US.size + sd1**2 / residual_vec_post_US.size)**0.5
    print(bcolors.WARNING + '\t\t%.2f' % (u1 - u0), '\tSEM=%.2f' % (sd_diff), bcolors.ENDC)
    import scipy.stats as stats
    print('\tDependent t-test')
    t, p = stats.ttest_rel(residual_vec_pre_US, residual_vec_post_US)
    print('\t\t' + bcolors.FAIL, 't=%.2f, p=%.4f' % (t, p), bcolors.ENDC)
    if debug:
        import matplotlib.pyplot as plt
        import matplotlib_setup
        from utils import keypress
        # successive plot calls draw on the same axes
        plt.plot(residual_vec_pre_US, label='pre')
        plt.plot(residual_vec_post_US, label='post')
        plt.legend()
        plt.gcf().canvas.mpl_connect('key_press_event', keypress)
        plt.show()
        plt.close()
def reaction_times_second_step(sessions, fig_no=1):
    'Reaction times for second step pokes as a function of common / rare transition.'
    sec_step_IDs = ut.get_IDs(sessions[0].IDs, ['right_active', 'left_active'])
    median_RTs_common = np.zeros(len(sessions))
    median_RTs_rare = np.zeros(len(sessions))
    for i, session in enumerate(sessions):
        event_times = ut.get_event_times(session.time_stamps, session.event_codes, session.IDs)
        left_active_times = event_times['left_active']
        right_active_times = event_times['right_active']
        left_reaction_times = _latencies(left_active_times, event_times['left_poke'])
        right_reaction_times = _latencies(right_active_times, event_times['right_poke'])
        ordered_reaction_times = np.hstack((left_reaction_times, right_reaction_times))\
            [np.argsort(np.hstack((left_active_times, right_active_times)))]
        transitions = session.blocks['trial_trans_state'] == session.CTSO['transitions']  # common vs rare.
        median_RTs_common[i] = np.median(ordered_reaction_times[transitions])
        median_RTs_rare[i] = np.median(ordered_reaction_times[~transitions])
    mean_RT_common = 1000 * np.mean(median_RTs_common)
    mean_RT_rare = 1000 * np.mean(median_RTs_rare)
    # SEM = sqrt(var / n)
    SEM_RT_common = 1000 * np.sqrt(np.var(median_RTs_common) / len(sessions))
    SEM_RT_rare = 1000 * np.sqrt(np.var(median_RTs_rare) / len(sessions))
    p.figure(fig_no)
    p.bar([1, 2], [mean_RT_common, mean_RT_rare], yerr=[SEM_RT_common, SEM_RT_rare])
    p.xlim(0.8, 3)
    p.ylim(mean_RT_common * 0.8, mean_RT_rare * 1.1)
    p.xticks([1.4, 2.4], ['Common', 'Rare'])
    p.title('Second step reaction times')
    p.ylabel('Reaction time (ms)')
    print('Paired t-test P value: {}'.format(ttest_rel(median_RTs_common, median_RTs_rare)[1]))
def test_paired_ttest_with_diff_sums(data):
    model, X_test = data
    pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    nb_pairs = len(pairs)
    nb_features, nb_classes, nb_cases = 1717, 4, 20
    batch_size = 5
    process_X_data_func_args = {'nb_features': nb_features}
    dlc_gen = deeplift_contribs_generator(
        model, X_test,
        process_X_data_func=process_X_data,
        nb_features=nb_features, nb_classes=nb_classes,
        batch_size=batch_size,
        process_X_data_func_args=process_X_data_func_args)
    sums_D, sums_D2, sums_contribs, pairs = diff_sums_from_generator(
        dlc_gen, nb_features=nb_features, nb_classes=nb_classes)
    unadjusted_t_values, p_values = paired_ttest_with_diff_sums(
        sums_D, sums_D2, pairs=pairs, nb_cases=nb_cases)
    assert unadjusted_t_values.shape == (nb_pairs, nb_features)
    assert p_values.shape == (nb_pairs, nb_features)

    # force only 1 batch with abnormally high batch_size parameter
    alt_dlc_gen = deeplift_contribs_generator(
        model, X_test,
        process_X_data_func=process_X_data,
        nb_features=nb_features, nb_classes=nb_classes,
        batch_size=109971161161043253 % 8085,
        process_X_data_func_args=process_X_data_func_args)

    # non-streaming paired t-test implementation... fails with larger
    # datasets due to large matrix sizes (e.g., memory overflow), but
    # works as an alternative implementation for a tiny unit testing dataset
    alt_t_values, alt_p_values = [], []
    for idx, contribs in enumerate(alt_dlc_gen):
        assert not idx  # check only 1 batch (idx == 0)
        for i, j in pairs:
            curr_t_values = np.zeros((nb_features, ))
            curr_p_values = np.zeros((nb_features, ))
            for f in range(nb_features):
                t, p = ttest_rel(contribs[i][:, f], contribs[j][:, f])
                curr_t_values[f] = t
                curr_p_values[f] = p
            alt_t_values.append(curr_t_values)
            alt_p_values.append(curr_p_values)

    for r in range(len(pairs)):
        t = unadjusted_t_values[r]
        alt_t = alt_t_values[r]
        p = p_values[r]  # already bonferroni adjusted
        alt_p = bonferroni(alt_p_values[r], nb_pairs * nb_features)
        assert t.shape == alt_t.shape
        assert p.shape == alt_p.shape
        assert np.all(del_nans(np.abs(alt_t - t)) < epsilon)
        assert np.all(del_nans(np.abs(alt_p - p)) < epsilon)
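# A self-contained sketch of the sums-based paired t-statistic that the
# streaming path above presumably relies on (variable names here are
# illustrative, not the library's API). With per-case differences D = x - y,
# only sum(D), sum(D**2) and n are needed:
#     var_D = (sum_D2 - sum_D**2 / n) / (n - 1)
#     t     = (sum_D / n) / sqrt(var_D / n)
import numpy as np
from scipy.stats import ttest_rel

x = np.random.randn(20)
y = x + 0.3 + 0.1 * np.random.randn(20)
D = x - y
n = len(D)
sum_D, sum_D2 = D.sum(), (D ** 2).sum()
var_D = (sum_D2 - sum_D ** 2 / n) / (n - 1)
t_stream = (sum_D / n) / np.sqrt(var_D / n)
t_ref = ttest_rel(x, y)[0]
assert np.isclose(t_stream, t_ref)  # matches scipy's paired t-statistic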
def PlotXference_IND():
    global EyeData
    global Events
    plt.style.use("ggplot")
    fig = plt.figure()
    fig.suptitle("Individual trials X coordinates")
    inf = fig.add_subplot(121)
    nof = fig.add_subplot(122)
    inference = []
    noference = []
    inf_cross = []
    nof_cross = []
    for idx in range(0, len(Events)):
        inf_slices, _inf_cross = FilterSlices(EyeData[idx], Events[idx], "Inference", "typeB", 0)
        inference.append(inf_slices)
        inf_cross.append(_inf_cross)
        nof_slices, _nof_cross = FilterSlices(EyeData[idx], Events[idx], "Noference", "typeA", 0)
        noference.append(nof_slices)
        nof_cross.append(_nof_cross)
        for trial in range(0, len(inference[idx])):
            inf.plot(inference[idx][trial, :])
        for trial in range(0, len(noference[idx])):
            nof.plot(noference[idx][trial, :])
    nof.set_xlabel("No inference trials")
    inf.set_xlabel("Inference trials")
    inf.set_ylabel("Changes in gaze X position")
    inf.axhline(y=1200, color="black", linestyle="--")
    nof.axhline(y=1200, color="black", linestyle="--")
    ticks = inf.get_xticks() * 16
    inf.set_xticklabels(ticks.astype(int))
    nof.set_xticklabels(ticks.astype(int))
    t, p = stats.ttest_rel(inf_cross, nof_cross)
    print(t)
    print(p)
    fig2 = plt.figure()
    box = plt.subplot(111)
    fig2.suptitle("Average number of center crossings per subject", fontweight="bold")
    bp1 = box.boxplot([inf_cross, nof_cross], patch_artist=True)
    bp1["boxes"][0].set(color="b", linewidth=0, alpha=0.5)
    bp1["boxes"][1].set(color="m", linewidth=0, alpha=0.5)
    box.set_xticklabels(["Inference trials", "no inference trials"])
    box.set_ylabel("Average amount of crossings per subject", fontweight="bold")
    box.set_ylim(-0.01, 2.01)
    return (inf_cross, nof_cross)
from numpy import array
from scipy import stats


def ttest():
    SVM = array([
        [61.3636, 60.6, 62.23, 64.9, 57.8],
        [62.1818, 61.96, 62.4, 63.1, 61.3],
        [60.3636, 59.6, 61.3, 64.4, 56.4],
        [58.6364, 58.6, 58.6, 58.7, 58.5],
        [64, 63.8, 64.2, 64.5, 63.5],
        [61.2727, 60.2, 62.6, 66.5, 56],
        [62.1818, 60.8, 64, 68.5, 55.8],
        [62, 62, 62, 62.2, 61.8],
        [62.5455, 65.9, 63.2, 65.1, 60],
        [62, 61.1, 63.1, 65.6, 58.4],
    ])
    Bayes = array([
        [58.3636, 60.5, 57, 48.4, 68.4],
        [57.9, 60.9, 56.2, 42.5, 71.6],
        [57, 59.1, 55.7, 45.5, 68.5],
        [55.5, 57.4, 54.4, 42.9, 68.2],
        [58.1, 59.7, 56.9, 49.8, 66.4],
        [57.72, 59.4, 56.5, 48.7, 66.7],
        [60.4, 63.3, 58.5, 49.3, 71.5],
        [58.9, 61.1, 57.4, 49.1, 68.7],
        [59.2, 61.8, 57.5, 48, 70.4],
        [57.5, 57.5, 56.1, 44.7, 70.4],
    ])
    DeTree = array([
        [60.3, 61.5, 59.3, 54.9, 65.6],
        [59.4, 59.5, 59.2, 58.5, 60.2],
        [57.9, 58.2, 57.7, 56.4, 59.5],
        [55.8, 55.9, 55.7, 54.7, 56.9],
        [60.2, 60.9, 59.5, 58.4, 63.6],
        [58.2, 58.1, 58.3, 58.9, 57.5],
        [58.5, 58.8, 58.1, 56.4, 60.5],
        [58.7, 59.1, 58.4, 56.7, 60.7],
        [58.2, 58.2, 57.9, 56.4, 60],
        [59.2, 59.3, 59, 57.8, 60],
    ])
    for i in range(0, 10):
        """print "======SVM and Bayes======="
        r = stats.ttest_rel(SVM[i], Bayes[i])
        print r[1]
        #print "======SVM and Decision Tree======="
        r = stats.ttest_rel(SVM[i], DeTree[i])
        print r[1]
        """
        # print "======Bayes and Decision Tree======="
        r = stats.ttest_rel(Bayes[i], DeTree[i])
        print(r[1])
def calculatePairedTTest(jsonCollect, model, embed_type):
    random.seed(116)
    initialRand = random.getstate()
    embed = None
    transformer = None
    coefficients = []
    rPattern = r'https:\/\/stackoverflow\.com\/questions\/\d+'
    if sample:
        jsonCollect = jsonCollect[:100]
    urlMapping = {}
    urlList = []
    linkedDists = []
    foreignDists = []
    differences = []
    for jsonObject in jsonCollect:
        qUrl = jsonObject['q_url']
        urlMapping[qUrl] = jsonObject
        urlList.append(qUrl)
    number_posts_with_stackOverflow_links = 0
    num_stackOverflow_links = []
    for idx, jsonObject in enumerate(jsonCollect):
        if idx % 1000 == 0:
            print(f'calculatePairedTTest: finished {idx} out of {len(jsonCollect)}')
        qUrl = jsonObject['q_url']
        all_content = jsonObject['q_text']
        answerCollection = jsonObject['answers']
        for answer in answerCollection:
            answerText = answer['a_text']
            all_content += ' ' + answerText
        urls = re.findall(
            r'(https://)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
            all_content)
        filtered_urls = []
        for url_parts in urls:
            url = ''.join(url_parts)
            if 'stackoverflow.com/questions' in url:
                filtered_urls.append(url)
        # q_urls = [url for url in urls if 'https://stackoverflow.com/questions/' in url]
        # urlContent = jsonObject['stackoverflow_urls']
        urlContent = list(filtered_urls)
        if len(filtered_urls) > 0:
            number_posts_with_stackOverflow_links += 1
            num_stackOverflow_links.append(len(filtered_urls))
        for potentialUrl in urlContent:
            urlMatch = re.search(rPattern, potentialUrl)
            if urlMatch == None:
                continue
            actualUrl = urlMatch.group(0)
            if actualUrl not in urlMapping or qUrl == actualUrl:
                continue
            post2Object = urlMapping[actualUrl]
            post1Url = qUrl
            post2Url = actualUrl
            post1EmbeddingArray = embed_sentences(jsonObject['q_text'], model, embed_type)
            post2EmbeddingArray = embed_sentences(post2Object["q_text"], model, embed_type)
            linkedDist = np.linalg.norm(post1EmbeddingArray - post2EmbeddingArray)**2
            if linkedDist <= .001:
                continue
            # pick two random unrelated posts as the unlinked baseline
            post3Url = random.choice(urlList)
            post4Url = random.choice(urlList)
            while post3Url == post1Url or post3Url == post2Url:
                post3Url = random.choice(urlList)
            while post4Url == post2Url or post4Url == post1Url:
                post4Url = random.choice(urlList)
            post3EmbeddingArray = embed_sentences(urlMapping[post3Url]["q_text"], model, embed_type)
            post4EmbeddingArray = embed_sentences(urlMapping[post4Url]["q_text"], model, embed_type)
            post1And3Dist = np.linalg.norm(post1EmbeddingArray - post3EmbeddingArray)**2
            post2And4Dist = np.linalg.norm(post2EmbeddingArray - post4EmbeddingArray)**2
            foreignDistAverage = (post1And3Dist + post2And4Dist) / 2
            linkedDists.append(linkedDist)
            foreignDists.append(foreignDistAverage)
            difference = foreignDistAverage - linkedDist
            differences.append(difference)
    results = stat.ttest_rel(foreignDists, linkedDists)
    random.setstate(initialRand)
    print('Result of T statistic calculation is:', results)
    print('Number of forum posts with stackoverflow links = ',
          number_posts_with_stackOverflow_links)
    print('Average number of links per post: ', statistics.mean(num_stackOverflow_links))
def stat_test(box_data1, box_data2, test, **stats_params):
    test_short_name = ''
    pval = None
    formatted_output = None
    if test == 'Levene':
        stat, pval = stats.levene(box_data1, box_data2, **stats_params)
        test_short_name = 'levene'
        formatted_output = ("Levene test of variance, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 'Mann-Whitney':
        u_stat, pval = stats.mannwhitneyu(box_data1, box_data2,
                                          alternative='two-sided', **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test two-sided "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 'Mann-Whitney-gt':
        u_stat, pval = stats.mannwhitneyu(box_data1, box_data2,
                                          alternative='greater', **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test greater "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 'Mann-Whitney-ls':
        u_stat, pval = stats.mannwhitneyu(box_data1, box_data2,
                                          alternative='less', **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test smaller "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 't-test_ind':
        stat, pval = stats.ttest_ind(a=box_data1, b=box_data2, **stats_params)
        test_short_name = 't-test_ind'
        formatted_output = ("t-test independent samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 't-test_welch':
        stat, pval = stats.ttest_ind(a=box_data1, b=box_data2, equal_var=False, **stats_params)
        test_short_name = 't-test_welch'
        formatted_output = ("Welch's t-test independent samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 't-test_paired':
        stat, pval = stats.ttest_rel(a=box_data1, b=box_data2, **stats_params)
        test_short_name = 't-test_rel'
        formatted_output = ("t-test paired samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 'Wilcoxon':
        if "zero_method" in stats_params.keys():
            zero_method = stats_params["zero_method"]
            del stats_params["zero_method"]
        else:
            zero_method = "pratt" if len(box_data1) <= 20 else "wilcox"
        print("Using zero_method ", zero_method)
        stat, pval = stats.wilcoxon(box_data1, box_data2,
                                    zero_method=zero_method, **stats_params)
        # stat, pval = wilcoxon_exact.wilcoxon_exact(box_data1, box_data2)
        test_short_name = 'Wilcoxon'
        formatted_output = ("Wilcoxon test (paired samples), "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == "wilcoxon-exact":
        stat, pval = wilcoxon_exact(box_data1, box_data2, alternative="less")
        test_short_name = 'Wilcoxon'
        formatted_output = ("Wilcoxon exact test (paired samples), "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    return pval, formatted_output, test_short_name
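# Hedged usage sketch for stat_test with synthetic paired samples; only the
# 't-test_paired' branch is exercised here.
import numpy as np
from scipy import stats

a = np.random.randn(30)
b = a + 0.2 + 0.1 * np.random.randn(30)
pval, text, short_name = stat_test(a, b, 't-test_paired')
print(text)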
# plot the data
setFonts(20)
plt.plot(x, 'o', ms=10, label='pre')
plt.plot(xs, 'r*', ms=12, label='post')
plt.bar(index, dx, width=0.5, align='center',
        color=0.75 * np.ones(3), label='pre-post')

# Format the plot
plt.legend(loc='upper left')
plt.axhline(0, ls='--')
plt.xlim(-0.3, 5.3)
plt.ylim(-0.2, 6.2)
plt.xlabel('Subject Nr')
plt.ylabel('Value')
plt.tight_layout()

# P-values for paired and unpaired T-tests
_, p_paired = stats.ttest_rel(x, xs)
_, p_ind = stats.ttest_ind(x, xs)
print(f'A paired comparison yields p={p_paired:.4f},' +
      f' while an unpaired T-test gives us p={p_ind:.3f}')

# Show and save figure
outFile = 'pairedTtest.png'
showData(outFile)
def class34(filename, i):
    ''' This function performs experiment 3.4

    Parameters
        filename : string, the name of the npz file from Task 2
        i : int, the index of the supposed best classifier (from task 3.1)
    '''
    # Set timer
    start = timeit.default_timer()
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    feats = np.load(filename)
    feats = feats[feats.files[0]]  # (40000, 174)
    X = feats[..., :-1]  # first 173 elements for all 40,000 inputs -> input
    y = feats[..., -1]   # last column of feats -> label
    output = np.zeros((5, 5))
    # Count time
    stop = timeit.default_timer()
    print('Starting the folding')
    print(stop - start)
    f = 0  # counter for fold
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for classifier in range(5):
            print('Now working on classifier ' + str(classifier))
            if classifier == 0:
                clf = SVC(kernel='linear', max_iter=10000)
            if classifier == 1:
                clf = SVC(kernel='rbf', max_iter=10000, gamma=2)  # default is rbf
            if classifier == 2:
                clf = RandomForestClassifier(max_depth=5, n_estimators=10)
            if classifier == 3:
                clf = MLPClassifier(alpha=0.05)
            if classifier == 4:
                clf = AdaBoostClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            c = confusion_matrix(y_test, y_pred)
            output[f][classifier] = accuracy(c)  # adding to the output array
        stop = timeit.default_timer()
        print('Done with ' + str(f + 1) + ' fold')
        print(stop - start)
        f += 1
    iBest = i - 1  # i is 1-indexed; output columns are 0-indexed
    p_values = []
    for column in range(output.shape[1]):
        if column != iBest:
            S = stats.ttest_rel(output[:, column], output[:, iBest])
            # print(output[:, column])
            # print(output[:, iBest])
            # print(S)
            p_values.append(S[1])
    with open('./a1_3.4.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        for line in output:
            # Write the results for the 32K data
            writer.writerow(line)
        writer.writerow(p_values)
s2 = df2['JobSatisfaction'].sample(n=50, random_state=1)

# a qq plot of sample 1
fig = sm.qqplot(df1['JobSatisfaction'], fit=True, line='45')
plt.show()

# a qq plot of sample 2
fig = sm.qqplot(df2['JobSatisfaction'], fit=True, line='45')
plt.show()

print(s1.mean())
print(s2.mean())

# let us perform our t-test
tstat, pval = stats.ttest_rel(s1, s2)
print('t statistic is: ', tstat)
print('p value is: ', pval)

# let us use a conditional to interpret our results
if pval < 0.05:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")

"""Rejecting the null hypothesis means that we have enough statistical evidence
to state that there is a statistically significant difference in satisfaction
level between employees who were retained and those who left.

# Conclusion

We set out to identify determinant factors that lead to staff attrition in a company.
    # (continues a loop over subjects)
    # print(stc_d.data[1, :])
    # print('\n')
    dd_per_sub[subject_idx, :] = numpy.mean(stc_d.data, axis=1)
    stc_w = mne.read_source_estimate(TFCE_data_path.format(subject, 'dW', time_lbl, lbl_name))
    # print('dW')
    # print(stc_w.data[26, :])
    # print(stc_w.data[1, :])
    # print('\n')
    dw_per_sub[subject_idx, :] = numpy.mean(stc_w.data, axis=1)
    stc_d = []
    stc_w = []

t_stat, p_val = stats.ttest_rel(dw_per_sub, dd_per_sub, axis=0)
print('p_val')
print(p_val[26])
print(p_val[1])
print('\n')

# p_val inversion for STC Viewer
p_val = 1 - p_val
print('1-p_val')
print(p_val[26])
print(p_val[1])
print('\n')

# binarize p_val
# p_val[p_val < 0.95] = 0  # significance threshold
# p_val[p_val > 0.95] = 1
sns.set(style="dark", font_scale=2) sns.despine() ax.set_ylim([55, 95]) for cycle in range(4): stat, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle]) print(p) if p < 0.05: p_rounded = np.round(p, decimals=5) if p_rounded > 0: label_diff(current_cycle=cycle, p_value=p_rounded, sign_to_use="=") else: label_diff(current_cycle=cycle, p_value=0.0001, sign_to_use="<") print("Normality : ", shapiro(accuracies_3DC[cycle]-accuracies_myo[cycle])) plt.show() for cycle in range(4): print("Cycle: ", cycle+1) _, normality_p_value = shapiro(accuracies_myo[cycle] - accuracies_3DC[cycle]) print("Normality p-value: ", normality_p_value) if normality_p_value < 0.1: print("p-value t-test: N.A.") _, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle]) print("p-value Wilcoxon : ", p) else: _, p = ttest_rel(accuracies_3DC[cycle], accuracies_myo[cycle]) print("p-value t-test: ", p) _, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle]) print("p-value Wilcoxon : ", p)
    'Fog': lr_recall[6],
    'Lix': lr_recall[7],
    'WSF1': lr_recall[8],
    'WSF2': lr_recall[9],
    'WSF3': lr_recall[10],
    'WSF4': lr_recall[11],
    'FEAT': feat_recall,
    'BASE': random_recall
}, index=[n for n in range(0, 10)])

recall_performance.mean()

# ================================================================================================================
# compare f1 to random baseline
stats.ttest_rel(f1_performance.BASE, f1_performance.FEAT)  # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.CLI)   # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.Fog)   # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.WSF3)  # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.WSF4)  # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.Lix)   # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.WSF2)  # significant for a = .01
stats.ttest_rel(f1_performance.BASE, f1_performance.WSF1)  # significant for a = .01
stats.ttest_rel(f1_performance.BASE,
def stats_stps(corrs1, corrs2, fisherz=True, permutation=True, iter=5000):
    """
    Conduct the statistical analysis for results of EEG-like data (for STPS)

    Parameters
    ----------
    corrs1 : array
        The correlation coefficients under condition 1.
        The shape of corrs1 must be [n_subs, n_chls, n_ts]. n_subs, n_chls and
        n_ts represent the number of subjects, channels and time-points.
    corrs2 : array
        The correlation coefficients under condition 2, with the same shape.
    fisherz : bool. Default is True.
        Conduct the Fisher-Z transform.
    permutation : bool. Default is True.
        Use the permutation test or not.
    iter : int. Default is 5000.
        The number of iterations.

    Returns
    -------
    stats : array
        The statistical results. The shape of stats is [n_chls, n_ts, 2]:
        a t-value and a p-value per channel and time-point.

    Notes
    -----
    n_subs must be >= 6.
    """
    if len(np.shape(corrs1)) != 3 or len(np.shape(corrs2)) != 3 or \
            np.shape(corrs1)[1] != np.shape(corrs2)[1] or \
            np.shape(corrs1)[2] != np.shape(corrs2)[2]:
        return "Invalid input!"

    # get the number of subjects, channels & time-points
    subs, chls, ts = np.shape(corrs1)

    # subs >= 6
    if subs < 6:
        return print("the number of subjects is too small!")

    # initialize the results
    stats = np.zeros([chls, ts, 2], dtype=float)

    # get the r-maps
    rs1 = corrs1
    rs2 = corrs2
    if fisherz is True:
        # Fisher r-to-z transform, applied once so that both the t-test and
        # the permutation test run on the transformed values
        rs1 = 0.5 * np.log((1 + rs1) / (1 - rs1))
        rs2 = 0.5 * np.log((1 + rs2) / (1 - rs2))

    # calculate the statistical results
    for i in range(chls):
        for j in range(ts):
            # paired t-test: stores (t-value, p-value)
            stats[i, j] = ttest_rel(rs1[:, i, j], rs2[:, i, j])
            if permutation is True:
                stats[i, j, 1] = permutation_test(rs1[:, i, j], rs2[:, i, j], iter=iter)

    return stats
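# Hedged usage sketch for stats_stps on synthetic correlations: 8 subjects,
# 4 channels, 10 time-points (made-up sizes); permutation is switched off so
# only ttest_rel is exercised. numpy is assumed to be imported as np.
import numpy as np

corrs1 = np.random.rand(8, 4, 10) * 0.8
corrs2 = np.random.rand(8, 4, 10) * 0.8
out = stats_stps(corrs1, corrs2, fisherz=True, permutation=False)
print(out.shape)   # (4, 10, 2): a t-value and a p-value per channel/time-point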
def find_pvalue_2smaple_paired(a, b):
    """a and b are paired arrays; returns scipy's (statistic, two-sided p-value) result"""
    pval_2sided = stats.ttest_rel(a, b)
    return pval_2sided
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel, ttest_ind

ddnn_raw = np.loadtxt("./output_ddnn_old.txt", delimiter=',')[:, :10]
sing_raw = np.loadtxt("./output_sing_old.txt", delimiter=',')
gmm_raw = np.loadtxt("./output_sing_gmm.txt", delimiter=',')

ddnn = np.mean(ddnn_raw, axis=0)
sing = np.mean(sing_raw, axis=0)
gmm = np.mean(gmm_raw, axis=0)

ys = range(0, 1000, 100)
print(ddnn.shape)
print(len(ys))

plot_ddnn, = plt.plot(ys, ddnn, label='Decentralized')
plot_sing, = plt.plot(ys, sing, label='Centralized')
plot_gmm, = plt.plot(ys, gmm, label='GMM')

last_DNN = ddnn_raw[:, 9]
last_GMM = gmm_raw[:, 9]
last_SING = sing_raw[:, 9]
print(gmm.shape)
print(ddnn.shape)
print(sing.shape)

t, r = ttest_rel(last_DNN, last_GMM)
print(sing[9])
print(gmm[9])

plt.legend([plot_ddnn, plot_sing, plot_gmm], loc=4)
plt.xlabel("Number of epochs")
plt.ylabel("Accuracy score")
print(r)
# plt.show()
                     # (tail of a plotting call inside a double loop over conditions)
                     data=d, x_estimator=np.mean, x_ci="ci", ci=95,
                     n_boot=5000, line_kws={'lw': 5}, color="darkgrey")

        # save only the slope
        arr[i, j] = slope

# close all figures
plt.close()

#%%
"""
Compute the t-test
"""
import scipy.stats as stats

m1, sd1 = np.round(np.mean(arr[:, 0]), 1), np.round(np.std(arr[:, 0]), 1)
m2, sd2 = np.round(np.mean(arr[:, 1]), 1), np.round(np.std(arr[:, 1]), 1)
print(m1, sd1)
print(m2, sd2)

t_stat, p_val = stats.ttest_rel(arr[:, 0], arr[:, 1])
print("T-statistic: {}\np-value: {}".format(np.round(t_stat, 3), np.round(p_val, 3)))
sentiment_singularity_csv = '~/git/afit_mlperf_training/sentiment_analysis/results/1xP100_sentiment_analysis_singularity.csv'
sentiment_native_csv = '~/git/afit_mlperf_training/sentiment_analysis/results/1xP100_sentiment_analysis_native.csv'

native_df = pandas.read_csv(sentiment_native_csv, sep='\n', names=['Native Runtime (Seconds)'])
singularity_df = pandas.read_csv(sentiment_singularity_csv, sep='\n', names=['Singularity Runtime (Seconds)'])

# drop one row from each run before pairing the frames
native_df = native_df.drop([35])
singularity_df = singularity_df.drop([18])
df = native_df.join(singularity_df).dropna()
print(df.describe())

t_val_rel = stats.ttest_rel(df.loc[:, 'Native Runtime (Seconds)'],
                            df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(df.loc[:, 'Native Runtime (Seconds)'],
                            df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()
df.plot(kind='hist', y='Native Runtime (Seconds)', color='red', ax=ax)
df.plot(kind='hist', y='Singularity Runtime (Seconds)', color='blue', ax=ax)
plt.savefig('P100_Histogram.png')
plt.show()
def plot_mean_podf(po, sz=200, typ='cp', typek='', pos=(0, 4, 4, 8)):
    global freq, df
    ax = py.subplot(gs[pos[2]:pos[3], pos[0]:pos[1]])
    set_axis(ax, -0.05, 1.1, letter=letters[po])
    # df = pd.read_excel(saveDir+'g_and_h_mjh.xlsx')
    results = pd.read_excel(saveDir + 'results.xlsx')
    df = pd.read_excel(saveDir + 'g_and_h.xlsx')
    df2 = pd.read_excel(saveDir + 'Injection times.xlsx')
    if 'xyl' in typek:
        color = 'purple'
    else:
        color = 'red'
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plot_len = 55
    min_minus = 20
    min_plus = 35
    okno = 60
    time_gh = np.linspace(-20, min_plus, plot_len)
    hfo = np.zeros((len(lista_rats), plot_len))
    gamma = np.zeros((len(lista_rats), plot_len))
    for i in range(len(lista_rats)):
        row = df2.loc[df2['RAT'] == int(lista_rats[i])]
        start = int(row[typek].values[0] / okno)
        # print(start)
        hfo[i] = df[lista_rats[i] + 'HFO_' + typ + typek].values[start - min_minus:start + min_plus]
        gamma[i] = df[lista_rats[i] + 'gamma_' + typ + typek].values[start - min_minus:start + min_plus]
        # py.plot(hfo[i], color='indianred')
        # py.plot(gamma[i], color='blue')
    sem = len(lista_rats)**(1 / 2)
    m_hfo = hfo.mean(axis=0)
    s_hfo = hfo.std(axis=0) / sem
    m_gamma = gamma.mean(axis=0)
    s_gamma = gamma.std(axis=0) / sem
    py.plot(time_gh, m_gamma, color='blue')
    py.fill_between(time_gh, m_gamma - s_gamma, m_gamma + s_gamma, alpha=0.3, color='blue')
    py.plot(time_gh, m_hfo, color=color)
    py.fill_between(time_gh, m_hfo - s_hfo, m_hfo + s_hfo, alpha=0.3, color=color)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    py.ylabel('Power of dom. freq.($mV^2$)', fontsize=fsize)
    py.yscale('log')
    py.xlabel('Time (min)', fontsize=fsize)
    ax = py.subplot(gs[pos[2]:pos[3], pos[1] + 1:pos[1] + 5])
    set_axis(ax, -0.05, 1.1, letter=letters[po + 1])
    bef_gamma, rly_gamma, lat_gamma = [], [], []
    bef_hfo, rly_hfo, lat_hfo = [], [], []
    for i in range(len(lista_rats)):
        bef_gamma.append(gamma[i, bs - 5:bs].mean())
        rly_gamma.append(gamma[i, early_s:early_f].mean())
        lat_gamma.append(gamma[i, late_s:late_f].mean())
        bef_hfo.append(hfo[i, bs - 5:bs].mean())
        rly_hfo.append(hfo[i, early_s:early_f].mean())
        lat_hfo.append(hfo[i, late_s:late_f].mean())
        # if typek == 'xyl':
        #     py.plot([hfo[i, bs-3:bs].mean(), hfo[i, early_s:early_f].mean(),
        #              hfo[i, late_s:late_f].mean()], marker='o', color='indianred')
        # else:
        py.plot([hfo[i, bs - 5:bs].mean(),
                 hfo[i, early_s:early_f].mean(),
                 hfo[i, late_s:late_f].mean()],
                marker='o', color=color)
        py.plot([gamma[i, bs - 5:bs].mean(),
                 gamma[i, early_s:early_f].mean(),
                 gamma[i, late_s:late_f].mean()],
                marker='o', color='blue')
        # py.text(-.1, hfo[i, bs-3:bs].mean(), lista_rats[i])
    results[typek + 'bef_gamma'] = bef_gamma
    results[typek + 'rly_gamma'] = rly_gamma
    results[typek + 'lat_gamma'] = lat_gamma
    results[typek + 'bef_hfo'] = bef_hfo
    results[typek + 'rly_hfo'] = rly_hfo
    results[typek + 'lat_hfo'] = lat_hfo
    results['rats'] = lista_rats
    results.to_excel(saveDir + 'results.xlsx', sheet_name='sheet1', index=False)
    shift = np.asarray(rly_hfo).mean() / 10
    max_ind = np.max(np.array([rly_hfo, lat_hfo]))
    print('shap', st.shapiro(bef_gamma)[1])
    print('shap', st.shapiro(rly_gamma)[1])
    print('shap', st.shapiro(lat_gamma)[1])
    pvalue = st.ttest_rel(bef_gamma, rly_gamma)[1]
    print('gamma pval', pvalue)
    py.text(.9, max_ind + shift, pval(pvalue), color='blue')
    pvalue = st.ttest_rel(bef_gamma, lat_gamma)[1]
    py.text(1.9, max_ind + shift, pval(pvalue), color='blue')
    shift = np.asarray(rly_hfo).mean() * 2
    print('shap', st.shapiro(bef_hfo)[1])
    print('shap', st.shapiro(rly_hfo)[1])
print('shap', st.shapiro(lat_hfo)[1]) pvalue = st.ttest_rel(bef_hfo, rly_hfo)[1] print('hfo pval', pvalue) py.text(.9, max_ind + shift, pval(pvalue), color=color) pvalue = st.ttest_rel(bef_hfo, lat_hfo)[1] py.text(1.9, max_ind + shift, pval(pvalue), color=color) py.ylabel('Power of dom. freq.($mV^2$)', fontsize=fsize) py.yscale('log') py.xticks([0, 1, 2], ['base', 'early Ket', 'late Ket'], fontsize=fsize) if typek == 'xyl': py.xticks([0, 1, 2], ['base', 'early KX', 'late KX'], fontsize=fsize) else: ket = mpatches.Patch(color='red', label='HFO after Ket.') kx = mpatches.Patch(color='purple', label='HFO after KX') gam = mpatches.Patch(color='blue', label='Gamma 30-65 Hz') ax.legend(handles=[ket, kx, gam], loc='center', bbox_to_anchor=(1.7, 0.5), ncol=1, frameon=True, fontsize=20) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) py.xlim(-.2, 2.2)
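# Hedged sketch (not part of the original function): the code above prints
# Shapiro-Wilk p-values but always applies the paired t-test. A common
# refinement is to fall back to the Wilcoxon signed-rank test whenever
# normality of the paired differences is rejected.
import scipy.stats as st

def paired_test(before, after, alpha=0.05):
    """Paired t-test if the differences look normal, otherwise Wilcoxon."""
    diffs = [a - b for a, b in zip(after, before)]
    if st.shapiro(diffs)[1] < alpha:
        return 'wilcoxon', st.wilcoxon(before, after)[1]
    return 'ttest_rel', st.ttest_rel(before, after)[1]

print(paired_test([1.0, 2.0, 3.0, 4.0, 5.0], [1.2, 2.1, 3.4, 4.2, 5.3]))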
def main():
    if len(sys.argv) < 4:
        print("Usage: <file_A> <file_B> <alpha> -- you did not give enough arguments\n")
        sys.exit(1)
    filename_A = sys.argv[1]
    filename_B = sys.argv[2]
    alpha = sys.argv[3]

    with open(filename_A) as f:
        data_A = f.read().splitlines()
    with open(filename_B) as f:
        data_B = f.read().splitlines()
    data_A = list(map(float, data_A))
    data_B = list(map(float, data_B))

    print("\nPossible statistical tests: Shapiro-Wilk, Anderson-Darling, Kolmogorov-Smirnov, "
          "t-test, Wilcoxon, McNemar, Permutation, Bootstrap")
    name = input("\nEnter name of statistical test: ")

    ### Normality Check
    if name in ("Shapiro-Wilk", "Anderson-Darling", "Kolmogorov-Smirnov"):
        output = normality_check(data_A, data_B, name, alpha)
        if float(output) > float(alpha):
            answer = input("\nNormality was not rejected; would you like to perform a t-test "
                           "for checking significance of difference between results? (Y/N) ")
            if answer == 'Y':
                # two-sided t-test
                t_results = stats.ttest_rel(data_A, data_B)
                # correct for one-sided test
                pval = t_results[1] / 2
                if float(pval) <= float(alpha):
                    print("\nTest result is significant with p-value: {}".format(pval))
                else:
                    print("\nTest result is not significant with p-value: {}".format(pval))
                return
            else:
                answer2 = input("\nWould you like to perform a different test (permutation or "
                                "bootstrap)? If so enter name of test, otherwise type 'N' ")
                if answer2 == 'N':
                    print("\nbye-bye")
                    return
                else:
                    name = answer2
        else:
            answer = input("\nNormality was rejected; would you like to perform a non-parametric "
                           "test for checking significance of difference between results? (Y/N) ")
            if answer == 'Y':
                answer2 = input("\nWhich test (Permutation or Bootstrap)? ")
                name = answer2
            else:
                print("\nbye-bye")
                return

    ### Statistical tests
    # Paired Student's t-test: calculate the t-test on TWO RELATED samples of scores,
    # a and b; for a one-sided test we halve the p-value.
    if name == "t-test":
        t_results = stats.ttest_rel(data_A, data_B)
        # correct for one-sided test
        pval = float(t_results[1]) / 2
        if float(pval) <= float(alpha):
            print("\nTest result is significant with p-value: {}".format(pval))
        else:
            print("\nTest result is not significant with p-value: {}".format(pval))
        return

    # Wilcoxon: calculate the Wilcoxon signed-rank test.
    if name == "Wilcoxon":
        wilcoxon_results = stats.wilcoxon(data_A, data_B)
        if float(wilcoxon_results[1]) <= float(alpha):
            print("\nTest result is significant with p-value: {}".format(wilcoxon_results[1]))
        else:
            print("\nTest result is not significant with p-value: {}".format(wilcoxon_results[1]))
        return

    if name == "McNemar":
        print("\nThis test requires the results to be binary: A[1, 0, 0, 1, ...], "
              "B[1, 0, 1, 1, ...] for success or failure on the i-th example.")
        f_obs = calculateContingency(data_A, data_B, len(data_A))
        mcnemar_results = mcNemar(f_obs)
        if float(mcnemar_results) <= float(alpha):
            print("\nTest result is significant with p-value: {}".format(mcnemar_results))
        else:
            print("\nTest result is not significant with p-value: {}".format(mcnemar_results))
        return

    if name == "Permutation":
        R = max(10000, int(len(data_A) * (1 / float(alpha))))
        pval = rand_permutation(data_A, data_B, len(data_A), R)
        if float(pval) <= float(alpha):
            print("\nTest result is significant with p-value: {}".format(pval))
        else:
            print("\nTest result is not significant with p-value: {}".format(pval))
        return

    if name == "Bootstrap":
        R = max(10000, int(len(data_A) * (1 / float(alpha))))
        pval = Bootstrap(data_A, data_B, len(data_A), R)
        if float(pval) <= float(alpha):
            print("\nTest result is significant with p-value: {}".format(pval))
        else:
            print("\nTest result is not significant with p-value: {}".format(pval))
        return
    else:
        print("\nInvalid name of statistical test")
        sys.exit(1)
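# Hypothetical usage sketch for the interactive script above; the script
# and file names are assumptions, not from the source. Each input file
# holds one score per line.
import subprocess

with open('scores_A.txt', 'w') as f:
    f.write('\n'.join(['0.81', '0.78', '0.83', '0.80', '0.79']))
with open('scores_B.txt', 'w') as f:
    f.write('\n'.join(['0.76', '0.77', '0.79', '0.75', '0.78']))

# usage: python significance_test.py <file_A> <file_B> <alpha>
subprocess.run(['python', 'significance_test.py',
                'scores_A.txt', 'scores_B.txt', '0.05'])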
for j in range(1, len(algo1[0])):
    algo1_temp.append(float(algo1[i][j]))
    algo2_temp.append(float(algo2[i][j]))
#print(algo1_temp)
#print(algo2_temp)

# Average the 100 scores into 10 per-group means (10 groups of 10).
temp1 = [0 for i in range(10)]
temp2 = [0 for i in range(10)]
for j in range(10):
    for k in range(10):
        temp1[j] += algo1_temp[10 * j + k]
        temp2[j] += algo2_temp[10 * j + k]
    temp1[j] /= 10
    temp2[j] /= 10
#print(temp1)
#print(temp2)

t_test = stats.ttest_rel(temp1, temp2)
#print(t_test)
t_test_value = t_test[0]
if t_test[1] > 0.05:
    t_test_value = 0
index = int(algo1[i][0])

# If the t statistic is positive, algo1 is better; if negative, algo2 is
# better; if zero (not significant), it is a draw.
if t_test_value == 0:
    label[index] = 0
elif t_test_value > 0:
    label[index] = 1
elif t_test_value < 0:
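# Self-contained sketch of the fold-averaging scheme above (all numbers
# synthetic): 10x10 repeated-CV scores are averaged per repetition, so the
# paired t-test compares 10 paired means instead of 100 correlated scores.
# The -1 label for "algo2 better" is an assumption; the source snippet is
# cut off at that branch.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
algo1_scores = rng.normal(0.80, 0.02, size=(10, 10))  # 10 repetitions x 10 folds
algo2_scores = rng.normal(0.78, 0.02, size=(10, 10))
rep_means1 = algo1_scores.mean(axis=1)
rep_means2 = algo2_scores.mean(axis=1)
t_stat, p_val = stats.ttest_rel(rep_means1, rep_means2)
label = 0 if p_val > 0.05 else (1 if t_stat > 0 else -1)
print(t_stat, p_val, label)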
def get_different_from_best(results_df, raw_results_df, metric='aupr', id_name='gene'): """Identify best-performing data types for each gene. As an alternative to just identifying the data type with the best average performance, we want to also identify data types that are "statistically equivalent" to the best performer. For each gene, we do the following: 1) get all data types that significantly outperform the permuted baseline ("well-performing" data types) 2) do pairwise t-tests comparing the best performing data types with other well-performing data types 3) apply an FDR correction for the total number of t-tests In each case where the null hypothesis is accepted, we say both data types are statistically equivalent. If the null is rejected, the relevant data type does not provide statistically equivalent performance to the best performing data type. """ from scipy.stats import ttest_rel comparison_pvals = [] for identifier in results_df[id_name].unique(): # compare best with other data types that are significant from # baseline, using pairwise t-tests # null hypothesis = each pair of results distributions is the same # get best data type best_data_ix = (results_df[results_df[id_name] == identifier].loc[:, 'delta_mean'].idxmax()) best_data_type = results_df.iloc[best_data_ix, :].training_data # get other significant data types other_data_types = ( results_df[(results_df[id_name] == identifier) & (results_df.training_data != best_data_type) & (results_df.reject_null)])['training_data'].values best_data_dist = ( raw_results_df[(raw_results_df.identifier == identifier) & (raw_results_df.training_data == best_data_type) & (raw_results_df.signal == 'signal') & (raw_results_df.data_type == 'test')]).sort_values( by=['seed', 'fold'])[metric].values if len(other_data_types) == 0: continue for other_data_type in other_data_types: # do pairwise t-tests other_data_dist = (raw_results_df[ (raw_results_df.identifier == identifier) & (raw_results_df.training_data == other_data_type) & (raw_results_df.signal == 'signal') & (raw_results_df.data_type == 'test')]).sort_values( by=['seed', 'fold'])[metric].values p_value = ttest_rel(best_data_dist, other_data_dist)[1] best_id = '{}, {}'.format(identifier, best_data_type) other_id = '{}, {}'.format(identifier, other_data_type) comparison_pvals.append( [identifier, best_data_type, other_data_type, p_value]) comparison_df = pd.DataFrame( comparison_pvals, columns=[id_name, 'best_data_type', 'other_data_type', 'p_value']) # apply multiple testing correction and identify significant similarities from statsmodels.stats.multitest import multipletests corr = multipletests(comparison_df['p_value'], alpha=0.05, method='fdr_bh') comparison_df = comparison_df.assign(corr_pval=corr[1], accept_null=~corr[0]) # add column to results_df for statistically equal to best equal_to_best = [] for _, vals in results_df.iterrows(): if not vals['reject_null']: equal_to_best.append(False) else: comp_gene_df = comparison_df[comparison_df[id_name] == vals[id_name]] if vals['training_data'] in comp_gene_df.best_data_type.values: equal_to_best.append(True) elif vals['training_data'] in comp_gene_df.other_data_type.values: # reject null = means are significantly different # accept null = means are statistically the same # so accept null = alternate data type is statistically the # same as the best data type equal_to_best.append( comp_gene_df[comp_gene_df.other_data_type == vals['training_data']].accept_null.values[0]) else: # this happens when the data type is the only significant one 
equal_to_best.append(True) results_df = results_df.assign(equal_to_best=equal_to_best) return results_df
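# Toy sketch of the core comparison logic above (synthetic scores, not real
# results): pairwise ttest_rel between the best data type and each other
# well-performing one, followed by a Benjamini-Hochberg correction.
import numpy as np
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import multipletests

rng = np.random.default_rng(1)
best = rng.normal(0.85, 0.03, size=20)  # 20 seed/fold scores for the best data type
others = {'expression': best - rng.normal(0.001, 0.01, size=20),
          'methylation': best - rng.normal(0.050, 0.01, size=20)}
pvals = [ttest_rel(best, dist)[1] for dist in others.values()]
reject, corr_pvals, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')
for name, rej, cp in zip(others, reject, corr_pvals):
    print('%s: corrected p = %.4f, equal to best: %s' % (name, cp, not rej))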
import pandas as pd from scipy import stats measurements = pd.read_csv('data/hawaii_measurements.csv') measurements['month'] = pd.to_datetime(measurements.date).dt.month june = measurements[measurements['month'] == 6] december = measurements[measurements['month'] == 12] june_grp = june.groupby('station') december_grp = december.groupby('station') june_avg = june_grp.tobs.mean() december_avg = december_grp.tobs.mean() print(stats.ttest_rel(june_avg, december_avg)) print( '''I did a paired t-test because these are temperature observations from the same stations across two different months. The results return a small p-value (p < 0.05) which indicates a statistically significant difference between June and December temperatures across all years.''' )
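# Hedged caveat sketch: ttest_rel pairs observations by position, so the two
# station-mean Series are safer when explicitly aligned on station first (a
# station reporting in only one month would otherwise shift the pairing).
paired = pd.concat([june_avg.rename('june'), december_avg.rename('december')],
                   axis=1).dropna()
print(stats.ttest_rel(paired['june'], paired['december']))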
def main(): (current_work_dir_path, asset_dir_path, program_dir_path, conda_program_dir_path) = utils.get_dir_paths() num_of_threads = multiprocessing.cpu_count() gammas = [2.**i for i in range(-7, 11)] mafft_xinsi_params = [] consalifold_params = [] posterior_consalifold_params = [] rna_seq_dir_path = asset_dir_path + "/compiled_rna_fams_test" mafft_xinsi_dir_path = asset_dir_path + "/mafft_xinsi" mafft_xinsi_plus_consalifold_dir_path = asset_dir_path + "/mafft_xinsi_plus_consalifold" posterior_mafft_xinsi_plus_consalifold_dir_path = asset_dir_path + "/posterior_mafft_xinsi_plus_consalifold" if not os.path.isdir(mafft_xinsi_plus_consalifold_dir_path): os.mkdir(mafft_xinsi_plus_consalifold_dir_path) if not os.path.isdir(posterior_mafft_xinsi_plus_consalifold_dir_path): os.mkdir(posterior_mafft_xinsi_plus_consalifold_dir_path) sub_thread_num = 4 for rna_seq_file in os.listdir(rna_seq_dir_path): if not rna_seq_file.endswith(".fa"): continue rna_seq_file_path = os.path.join(rna_seq_dir_path, rna_seq_file) (rna_family_name, extension) = os.path.splitext(rna_seq_file) mafft_xinsi_output_file_path = os.path.join(mafft_xinsi_dir_path, rna_family_name + ".aln") mafft_xinsi_plus_consalifold_output_dir_path = os.path.join( mafft_xinsi_plus_consalifold_dir_path, rna_family_name) posterior_mafft_xinsi_plus_consalifold_output_dir_path = os.path.join( posterior_mafft_xinsi_plus_consalifold_dir_path, rna_family_name) if not os.path.isdir(mafft_xinsi_plus_consalifold_output_dir_path): os.mkdir(mafft_xinsi_plus_consalifold_output_dir_path) if not os.path.isdir( posterior_mafft_xinsi_plus_consalifold_output_dir_path): os.mkdir(posterior_mafft_xinsi_plus_consalifold_output_dir_path) consalifold_params.insert( 0, (sub_thread_num, mafft_xinsi_output_file_path, mafft_xinsi_plus_consalifold_output_dir_path, False)) posterior_consalifold_params.insert( 0, (sub_thread_num, mafft_xinsi_output_file_path, posterior_mafft_xinsi_plus_consalifold_output_dir_path, True)) # ConsAliFold's execution. pool = multiprocessing.Pool(int(num_of_threads / sub_thread_num)) consalifold_results = pool.map(bench_consalifold, consalifold_params) consalifold_output_file_path = asset_dir_path + "/consalifold_running_times_turner.dat" write_consalifold_results(consalifold_results, consalifold_output_file_path) data_turner = read_consalifold_results(consalifold_output_file_path) posterior_consalifold_results = pool.map(bench_consalifold, posterior_consalifold_params) posterior_consalifold_output_file_path = asset_dir_path + "/consalifold_running_times_posterior.dat" write_consalifold_results(posterior_consalifold_results, posterior_consalifold_output_file_path) data_posterior = read_consalifold_results( posterior_consalifold_output_file_path) data = { "Running time (s)": data_turner + data_posterior, "Pair-matching probability inference method": ["ConsProb"] * len(data_turner) + ["LocARNA-P + our PCT"] * len(data_posterior) } data_frame = pandas.DataFrame(data=data) ax = seaborn.boxplot(x="Pair-matching probability inference method", y="Running time (s)", data=data_frame, sym="") fig = ax.get_figure() fig.tight_layout() image_dir_path = asset_dir_path + "/images" if not os.path.exists(image_dir_path): os.mkdir(image_dir_path) fig.savefig(image_dir_path + "/consalifold_model_comparison_running_time.eps", bbox_inches="tight") fig.clf() print("Running time significance test: ", stats.ttest_rel(data_turner, data_posterior))
) or experiment_name.startswith('original'):
    print('this element is the baseline and we do not evaluate it against itself')
elif 'spsa' in experiment_name or 'zoo' in experiment_name:
    print('SPSA and ZOO are not under evaluation')
else:
    # Take the baseline and compare it with the model
    baseline = ttest_map[baseline]
    actual_experiment = ttest_map[experiment_name]
    base = []
    test = []
    for user_id in actual_experiment.keys():
        base.append(baseline[user_id])
        test.append(actual_experiment[user_id])
    p = stats.ttest_rel(base, test).pvalue
    star = '*' if p <= 0.05 else '#'
    line = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
        dataset_name, an_metric, analyzed_k, experiment_name, p, star)
    f.write(line)
    print(line)
# A researcher noted the number of chocolate chips consumed by 10 rats, with and without electrical stimulation.
# The data set s1 represents consumption with stimulation, and s2 without stimulation.
s1 = [12, 7, 3, 11, 8, 5, 14, 7, 9, 10]
s2 = [8, 7, 4, 14, 6, 7, 12, 5, 5, 8]

# Compute the paired t-statistic for the above samples, and display the t-score and p-value on separate lines.
# Hint: use the ttest_rel function available in scipy.
import numpy as np
from scipy import stats

a = np.array(s1)
b = np.array(s2)

## Checking with the internal scipy function
t2, p2 = stats.ttest_rel(a, b)
print("t = " + str(t2))
print("p = " + str(p2))
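# Manual cross-check of the same statistic: for a paired test, t is the mean
# of the within-pair differences divided by its standard error.
d = a - b
t_manual = d.mean() / (d.std(ddof=1) / np.sqrt(len(d)))
print("t (manual) = " + str(t_manual))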
from scipy import stats SIGNIFICANCE_LEVEL = 0.05 H0 = 'As the tree depth increases, the mean accuracies for both DT and BT remain the same i.e. their performance does not change with respect to each other.' H1 = 'As the tree depth increases, the mean accuracies of DT != mean accuracies of BT i.e. their performance differs with respect to each other.' print('H0: {}'.format(H0)) print('H1: {}'.format(H1)) # Accuracies from the 10-Fold Cross Validation for each depth and each model dt_accs_depth_3 = [0.72, 0.77, 0.74, 0.69, 0.79, 0.75, 0.75, 0.76, 0.73, 0.7] dt_accs_depth_5 = [0.75, 0.75, 0.72, 0.7, 0.78, 0.75, 0.72, 0.76, 0.75, 0.7] dt_accs_depth_7 = [0.75, 0.73, 0.71, 0.68, 0.74, 0.75, 0.72, 0.78, 0.77, 0.7] dt_accs_depth_9 = [0.76, 0.73, 0.71, 0.67, 0.73, 0.72, 0.71, 0.8, 0.74, 0.72] dt_accs = [dt_accs_depth_3, dt_accs_depth_5, dt_accs_depth_7, dt_accs_depth_9] bt_accs_depth_3 = [0.73, 0.77, 0.73, 0.7, 0.79, 0.75, 0.75, 0.76, 0.73, 0.7] bt_accs_depth_5 = [0.74, 0.78, 0.72, 0.7, 0.78, 0.75, 0.75, 0.77, 0.73, 0.7] bt_accs_depth_7 = [0.76, 0.78, 0.74, 0.72, 0.79, 0.75, 0.74, 0.77, 0.76, 0.7] bt_accs_depth_9 = [0.75, 0.77, 0.75, 0.72, 0.76, 0.75, 0.75, 0.8, 0.76, 0.71] bt_accs = [bt_accs_depth_3, bt_accs_depth_5, bt_accs_depth_7, bt_accs_depth_9] depths = [3, 5, 7, 9] for i in range(len(depths)): depth = depths[i] t_val, p_val = stats.ttest_rel(dt_accs[i], bt_accs[i]) print( 'Depth: {} H0 for DT and BT: t-statistics = {}, p-value = {} Reject with significance level of {}? {}' .format(depth, t_val, p_val, SIGNIFICANCE_LEVEL, (p_val < SIGNIFICANCE_LEVEL)))
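# Hedged follow-up: four paired tests are run at the same significance level,
# so a multiple-comparison adjustment is worth reporting next to the raw
# p-values; a Bonferroni version is a one-liner.
p_vals = [stats.ttest_rel(dt_accs[i], bt_accs[i])[1] for i in range(len(depths))]
bonferroni = [min(1.0, p * len(p_vals)) for p in p_vals]
for depth, p_raw, p_adj in zip(depths, p_vals, bonferroni):
    print('Depth {}: raw p = {:.4f}, Bonferroni-adjusted p = {:.4f}'.format(
        depth, p_raw, p_adj))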
import numpy as np
from scipy.stats import ttest_rel

def stats_stpsfmri(corrs1, corrs2, fisherz=True, permutation=False, iter=5000):
    """
    Conduct the statistical analysis for results of fMRI data (STPS searchlight)

    Parameters
    ----------
    corrs1 : array
        The correlation coefficients under condition 1.
        The shape of corrs1 must be [n_subs, n_x, n_y, n_z]. n_subs, n_x, n_y and n_z
        represent the number of subjects and the number of calculation units for
        searchlight along the x, y and z axes.
    corrs2 : array
        The correlation coefficients under condition 2, with the same shape as corrs1.
    fisherz : bool. Default is True.
        Conduct the Fisher r-to-z transform.
    permutation : bool. Default is False.
        Use the permutation test or not.
    iter : int. Default is 5000.
        The number of iterations.

    Returns
    -------
    stats : array
        The statistical results. The shape of stats is [n_x, n_y, n_z, 2]: a t-value
        and a p-value per calculation unit.

    Notes
    -----
    n_subs must be >= 6.
    """

    if len(np.shape(corrs1)) != 4 or len(np.shape(corrs2)) != 4 or np.shape(corrs1)[1] != np.shape(corrs2)[1] \
            or np.shape(corrs1)[2] != np.shape(corrs2)[2] or np.shape(corrs1)[3] != np.shape(corrs2)[3]:
        return "Invalid input!"

    # get the number of subjects
    subs = np.shape(corrs1)[0]

    # subs >= 6
    if subs < 6:
        return print("the number of subjects is too small!")

    # get the number of the calculation units in the x, y, z directions
    n_x, n_y, n_z = np.shape(corrs1)[1:]

    # initialize the results
    stats = np.zeros([n_x, n_y, n_z, 2], dtype=float)

    # get the r-maps and, if requested, Fisher-transform them once up front
    rs1 = corrs1
    rs2 = corrs2
    if fisherz is True:
        # Fisher r-to-z
        rs1 = 0.5 * np.log((1 + rs1) / (1 - rs1))
        rs2 = 0.5 * np.log((1 + rs2) / (1 - rs2))

    # calculate the statistical results
    for i in range(n_x):
        for j in range(n_y):
            for k in range(n_z):
                # paired t-test (on the z-values when fisherz is True)
                stats[i, j, k] = ttest_rel(rs1[:, i, j, k], rs2[:, i, j, k])
                if permutation == True:
                    stats[i, j, k, 1] = permutation_test(rs1[:, i, j, k],
                                                         rs2[:, i, j, k], iter=iter)

    return stats
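# Minimal driver sketch with synthetic correlations (the shapes are the only
# meaningful part): 8 subjects on a 4x4x4 searchlight grid, r-values kept
# strictly inside (-1, 1) so the Fisher transform is defined. permutation is
# left False so the external permutation_test helper is not needed.
rng = np.random.default_rng(2)
corrs1 = np.tanh(rng.normal(0.3, 0.1, size=(8, 4, 4, 4)))
corrs2 = np.tanh(rng.normal(0.2, 0.1, size=(8, 4, 4, 4)))
result = stats_stpsfmri(corrs1, corrs2, fisherz=True, permutation=False)
print(result.shape)  # (4, 4, 4, 2): a t-value and a p-value per unit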
while (count2 > 30):
    r = random.randint(0, count2 - 1)
    sampled_singularity_df = sampled_singularity_df.drop(
        sampled_singularity_df.index[r])
    count2 = count2 - 1

print('After Sampling:\n')
print(sampled_native_df.describe())
print(sampled_singularity_df.describe())

#df = native_df.merge(singularity_df, how='left')
#print(df.describe())

print('significance level:\t 0.05\n')
print('degrees of freedom:\t 29 (paired) / ~58 (independent)\n')
print('critical t-value:\t ~2.0\n')

t_val_rel = stats.ttest_rel(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()
sampled_native_df.plot(kind='hist', y='Native Runtime (Seconds)',
                       color='red', ax=ax)
sampled_singularity_df.plot(kind='hist', y='Singularity Runtime (Seconds)',
# Data stored in an xlsx file with contents:
"""
   group  data
0      1    34
1      1    37
2      1    28
3      1    36
4      1    30
5      2    43
6      2    45
7      2    47
8      2    49
9      2    39
"""
# Assume these data are paired samples.
# ------------------------------------------------------------------------------
import pandas as pd
from scipy.stats import ttest_rel

IS_t_test = pd.read_excel('E:\\IS_t_test.xlsx')
Group1 = IS_t_test[IS_t_test['group'] == 1]['data']
Group2 = IS_t_test[IS_t_test['group'] == 2]['data']
print(ttest_rel(Group1, Group2))
"""
(-5.6873679190073361, 0.00471961872448184)
"""
# The first element of the output is the t-value;
# the second element is the p-value.
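# Manual verification of the quoted output: the paired t-statistic is the
# mean within-pair difference over its standard error.
import numpy as np

d = Group1.to_numpy() - Group2.to_numpy()
t_manual = d.mean() / (d.std(ddof=1) / np.sqrt(len(d)))
print(t_manual)  # ~ -5.687, matching the t-value above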
df1 = pd.DataFrame({'sample': [6, 5, 5, 4, 6, 7, 6, 4, 5, 6, 4, 5, 5, 6, 4, 8, 6, 5, 6, 7]})
df2 = pd.DataFrame({'sample': [7, 5, 7, 8, 7, 8, 8, 5, 7, 6, 5, 5, 6, 6, 5, 7, 9, 7, 7, 8]})
t_result = stats.ttest_ind(df1, df2)
t, p = t_result.statistic.round(3), t_result.pvalue.round(3)
print("2-Sample t-test")
print("t statistic: {}".format(t))
print("p-value: {}".format(p))

# In[14]:

# page 67: t-test practice
df1 = pd.DataFrame({'before': [720, 589, 780, 648, 720, 589, 780, 648, 780, 648]})
df2 = pd.DataFrame({'after': [810, 670, 790, 712, 810, 670, 790, 712, 790, 712]})
t_test = stats.ttest_rel(df1, df2)
t, p = t_test.statistic.round(3), t_test.pvalue.round(3)
print("paired t-test")
print("t: {}".format(t))
print("p: {}".format(p))

# In[15]:

# page 68: paired t-test practice
df1 = pd.DataFrame({'before': [720, 589, 780, 648, 720, 589, 780, 648, 780, 648]})
df2 = pd.DataFrame({'after': [710, 580, 787, 712, 750, 600, 782, 670, 790, 680]})
t_test = stats.ttest_rel(df1, df2)
t, p = t_test.statistic.round(3), t_test.pvalue.round(3)
print("paired t-test")
data1 = df_eval1['r_avg'][:400].to_numpy()
data2 = df_eval2['r_avg'][:400].to_numpy()
data3 = df_eval3['r_avg'][:400].to_numpy()
stat, p = stats.f_oneway(data1, data2, data3)
print('Stat / P-Value R_AVG: %0.5f %0.5f' % (stat, p))

data1 = df_eval1['f1_avg'][:400].to_numpy()
data2 = df_eval2['f1_avg'][:400].to_numpy()
data3 = df_eval3['f1_avg'][:400].to_numpy()
stat, p = stats.f_oneway(data1, data2, data3)
print('Stat / P-Value F1_AVG: %0.5f %0.5f' % (stat, p))

print("--- T-TEST: sentence selection method ---")
data1 = df_eval3['jc_avg'][:400].to_numpy()
data2 = df_st_eval3['jc_avg'][:400].to_numpy()
stat, p = stats.ttest_rel(data1, data2)
print('Stat / P-Value JC_AVG: %0.5f %0.5f' % (stat, p))

data1 = df_eval3['p_avg'][:400].to_numpy()
data2 = df_st_eval3['p_avg'][:400].to_numpy()
stat, p = stats.ttest_rel(data1, data2)
print('Stat / P-Value P_AVG: %0.5f %0.5f' % (stat, p))

data1 = df_eval3['r_avg'][:400].to_numpy()
data2 = df_st_eval3['r_avg'][:400].to_numpy()
stat, p = stats.ttest_rel(data1, data2)
print('Stat / P-Value R_AVG: %0.5f %0.5f' % (stat, p))

data1 = df_eval3['f1_avg'][:400].to_numpy()
data2 = df_st_eval3['f1_avg'][:400].to_numpy()
stat, p = stats.ttest_rel(data1, data2)
print('Stat / P-Value F1_AVG: %0.5f %0.5f' % (stat, p))
def main(): (current_work_dir_path, asset_dir_path, program_dir_path, conda_program_dir_path) = utils.get_dir_paths() num_of_threads = multiprocessing.cpu_count() mafft_plus_consalifold_ppvs = [] mafft_plus_consalifold_senss = [] mafft_plus_consalifold_fprs = [] mafft_plus_consalifold_f1_scores = [] mafft_plus_consalifold_mccs = [] probcons_plus_consalifold_ppvs = [] probcons_plus_consalifold_senss = [] probcons_plus_consalifold_fprs = [] probcons_plus_consalifold_f1_scores = [] probcons_plus_consalifold_mccs = [] clustalw_plus_consalifold_ppvs = [] clustalw_plus_consalifold_senss = [] clustalw_plus_consalifold_fprs = [] clustalw_plus_consalifold_f1_scores = [] clustalw_plus_consalifold_mccs = [] mafft_xinsi_plus_consalifold_ppvs = [] mafft_xinsi_plus_consalifold_senss = [] mafft_xinsi_plus_consalifold_fprs = [] mafft_xinsi_plus_consalifold_f1_scores = [] mafft_xinsi_plus_consalifold_mccs = [] ref_sa_plus_consalifold_ppvs = [] ref_sa_plus_consalifold_senss = [] ref_sa_plus_consalifold_fprs = [] ref_sa_plus_consalifold_f1_scores = [] ref_sa_plus_consalifold_mccs = [] contra_probcons_plus_consalifold_ppvs = [] contra_probcons_plus_consalifold_senss = [] contra_probcons_plus_consalifold_fprs = [] contra_probcons_plus_consalifold_f1_scores = [] contra_probcons_plus_consalifold_mccs = [] contra_clustalw_plus_consalifold_ppvs = [] contra_clustalw_plus_consalifold_senss = [] contra_clustalw_plus_consalifold_fprs = [] contra_clustalw_plus_consalifold_f1_scores = [] contra_clustalw_plus_consalifold_mccs = [] contra_mafft_plus_consalifold_ppvs = [] contra_mafft_plus_consalifold_senss = [] contra_mafft_plus_consalifold_fprs = [] contra_mafft_plus_consalifold_f1_scores = [] contra_mafft_plus_consalifold_mccs = [] contra_mafft_xinsi_plus_consalifold_ppvs = [] contra_mafft_xinsi_plus_consalifold_senss = [] contra_mafft_xinsi_plus_consalifold_fprs = [] contra_mafft_xinsi_plus_consalifold_f1_scores = [] contra_mafft_xinsi_plus_consalifold_mccs = [] contra_ref_sa_plus_consalifold_ppvs = [] contra_ref_sa_plus_consalifold_senss = [] contra_ref_sa_plus_consalifold_fprs = [] contra_ref_sa_plus_consalifold_f1_scores = [] contra_ref_sa_plus_consalifold_mccs = [] gammas = [2.**i for i in range(min_gamma, max_gamma + 1)] rna_fam_dir_path = asset_dir_path + "/compiled_rna_fams_test" ref_sa_dir_path = asset_dir_path + "/ref_sas_test" mafft_plus_consalifold_css_dir_path = asset_dir_path + "/mafft_plus_consalifold" probcons_plus_consalifold_css_dir_path = asset_dir_path + "/probcons_plus_consalifold" clustalw_plus_consalifold_css_dir_path = asset_dir_path + "/clustalw_plus_consalifold" mafft_xinsi_plus_consalifold_css_dir_path = asset_dir_path + "/mafft_xinsi_plus_consalifold" ref_sa_plus_consalifold_css_dir_path = asset_dir_path + "/ref_sa_plus_consalifold" contra_probcons_plus_consalifold_css_dir_path = asset_dir_path + "/contra_probcons_plus_consalifold" contra_clustalw_plus_consalifold_css_dir_path = asset_dir_path + "/contra_clustalw_plus_consalifold" contra_mafft_plus_consalifold_css_dir_path = asset_dir_path + "/contra_mafft_plus_consalifold" contra_mafft_xinsi_plus_consalifold_css_dir_path = asset_dir_path + "/contra_mafft_xinsi_plus_consalifold" contra_ref_sa_plus_consalifold_css_dir_path = asset_dir_path + "/contra_ref_sa_plus_consalifold" pool = multiprocessing.Pool(num_of_threads) for gamma in gammas: mafft_plus_consalifold_count_params = [] clustalw_plus_consalifold_count_params = [] mafft_xinsi_plus_consalifold_count_params = [] ref_sa_plus_consalifold_count_params = [] 
probcons_plus_consalifold_count_params = [] contra_probcons_plus_consalifold_count_params = [] contra_clustalw_plus_consalifold_count_params = [] contra_mafft_plus_consalifold_count_params = [] contra_mafft_xinsi_plus_consalifold_count_params = [] contra_ref_sa_plus_consalifold_count_params = [] gamma_str = str(gamma) if gamma < 1 else str(int(gamma)) for rna_fam_file in os.listdir(rna_fam_dir_path): if not rna_fam_file.endswith(".fa"): continue rna_seq_file_path = os.path.join(rna_fam_dir_path, rna_fam_file) rna_seq_lens = [ len(rna_seq.seq) for rna_seq in SeqIO.parse(rna_seq_file_path, "fasta") ] num_of_rnas = len(rna_seq_lens) (rna_fam_name, extension) = os.path.splitext(rna_fam_file) ref_css_file_path = os.path.join(ref_sa_dir_path, rna_fam_name + ".sth") ref_css = utils.get_css(ref_css_file_path) mafft_plus_consalifold_estimated_css_dir_path = os.path.join( mafft_plus_consalifold_css_dir_path, rna_fam_name) probcons_plus_consalifold_estimated_css_dir_path = os.path.join( probcons_plus_consalifold_css_dir_path, rna_fam_name) clustalw_plus_consalifold_estimated_css_dir_path = os.path.join( clustalw_plus_consalifold_css_dir_path, rna_fam_name) mafft_xinsi_plus_consalifold_estimated_css_dir_path = os.path.join( mafft_xinsi_plus_consalifold_css_dir_path, rna_fam_name) ref_sa_plus_consalifold_estimated_css_dir_path = os.path.join( ref_sa_plus_consalifold_css_dir_path, rna_fam_name) contra_probcons_plus_consalifold_estimated_css_dir_path = os.path.join( contra_probcons_plus_consalifold_css_dir_path, rna_fam_name) contra_clustalw_plus_consalifold_estimated_css_dir_path = os.path.join( contra_clustalw_plus_consalifold_css_dir_path, rna_fam_name) contra_mafft_plus_consalifold_estimated_css_dir_path = os.path.join( contra_mafft_plus_consalifold_css_dir_path, rna_fam_name) contra_mafft_xinsi_plus_consalifold_estimated_css_dir_path = os.path.join( contra_mafft_xinsi_plus_consalifold_css_dir_path, rna_fam_name) contra_ref_sa_plus_consalifold_estimated_css_dir_path = os.path.join( contra_ref_sa_plus_consalifold_css_dir_path, rna_fam_name) mafft_plus_consalifold_estimated_css_file_path = os.path.join( mafft_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( mafft_plus_consalifold_estimated_css_file_path) mafft_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) probcons_plus_consalifold_estimated_css_file_path = os.path.join( probcons_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( probcons_plus_consalifold_estimated_css_file_path) probcons_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) clustalw_plus_consalifold_estimated_css_file_path = os.path.join( clustalw_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( clustalw_plus_consalifold_estimated_css_file_path) clustalw_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) mafft_xinsi_plus_consalifold_estimated_css_file_path = os.path.join( mafft_xinsi_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( mafft_xinsi_plus_consalifold_estimated_css_file_path) mafft_xinsi_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) ref_sa_plus_consalifold_estimated_css_file_path = os.path.join( ref_sa_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( 
ref_sa_plus_consalifold_estimated_css_file_path) ref_sa_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) contra_probcons_plus_consalifold_estimated_css_file_path = os.path.join( contra_probcons_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( contra_probcons_plus_consalifold_estimated_css_file_path) contra_probcons_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) contra_clustalw_plus_consalifold_estimated_css_file_path = os.path.join( contra_clustalw_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( contra_clustalw_plus_consalifold_estimated_css_file_path) contra_clustalw_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) contra_mafft_plus_consalifold_estimated_css_file_path = os.path.join( contra_mafft_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( contra_mafft_plus_consalifold_estimated_css_file_path) contra_mafft_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) contra_mafft_xinsi_plus_consalifold_estimated_css_file_path = os.path.join( contra_mafft_xinsi_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( contra_mafft_xinsi_plus_consalifold_estimated_css_file_path) contra_mafft_xinsi_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) contra_ref_sa_plus_consalifold_estimated_css_file_path = os.path.join( contra_ref_sa_plus_consalifold_estimated_css_dir_path, "gamma=" + gamma_str + ".sth") estimated_css = utils.get_css( contra_ref_sa_plus_consalifold_estimated_css_file_path) contra_ref_sa_plus_consalifold_count_params.insert( 0, (rna_seq_lens, estimated_css, ref_css)) results = pool.map(get_bin_counts, probcons_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) probcons_plus_consalifold_ppvs.insert(0, ppv) probcons_plus_consalifold_senss.insert(0, sens) probcons_plus_consalifold_fprs.insert(0, fpr) probcons_plus_consalifold_f1_scores.append(f1_score) probcons_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, clustalw_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) clustalw_plus_consalifold_ppvs.insert(0, ppv) clustalw_plus_consalifold_senss.insert(0, sens) clustalw_plus_consalifold_fprs.insert(0, fpr) clustalw_plus_consalifold_f1_scores.append(f1_score) clustalw_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, mafft_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) mafft_plus_consalifold_ppvs.insert(0, ppv) mafft_plus_consalifold_senss.insert(0, sens) mafft_plus_consalifold_fprs.insert(0, fpr) mafft_plus_consalifold_f1_scores.append(f1_score) mafft_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, mafft_xinsi_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) mafft_xinsi_plus_consalifold_ppvs.insert(0, ppv) mafft_xinsi_plus_consalifold_senss.insert(0, sens) mafft_xinsi_plus_consalifold_fprs.insert(0, fpr) mafft_xinsi_plus_consalifold_f1_scores.append(f1_score) mafft_xinsi_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, ref_sa_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) ref_sa_plus_consalifold_ppvs.insert(0, ppv) 
ref_sa_plus_consalifold_senss.insert(0, sens) ref_sa_plus_consalifold_fprs.insert(0, fpr) ref_sa_plus_consalifold_f1_scores.append(f1_score) ref_sa_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, contra_probcons_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) contra_probcons_plus_consalifold_ppvs.insert(0, ppv) contra_probcons_plus_consalifold_senss.insert(0, sens) contra_probcons_plus_consalifold_fprs.insert(0, fpr) contra_probcons_plus_consalifold_f1_scores.append(f1_score) contra_probcons_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, contra_clustalw_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) contra_clustalw_plus_consalifold_ppvs.insert(0, ppv) contra_clustalw_plus_consalifold_senss.insert(0, sens) contra_clustalw_plus_consalifold_fprs.insert(0, fpr) contra_clustalw_plus_consalifold_f1_scores.append(f1_score) contra_clustalw_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, contra_mafft_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) contra_mafft_plus_consalifold_ppvs.insert(0, ppv) contra_mafft_plus_consalifold_senss.insert(0, sens) contra_mafft_plus_consalifold_fprs.insert(0, fpr) contra_mafft_plus_consalifold_f1_scores.append(f1_score) contra_mafft_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, contra_mafft_xinsi_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) contra_mafft_xinsi_plus_consalifold_ppvs.insert(0, ppv) contra_mafft_xinsi_plus_consalifold_senss.insert(0, sens) contra_mafft_xinsi_plus_consalifold_fprs.insert(0, fpr) contra_mafft_xinsi_plus_consalifold_f1_scores.append(f1_score) contra_mafft_xinsi_plus_consalifold_mccs.append(mcc) results = pool.map(get_bin_counts, contra_ref_sa_plus_consalifold_count_params) ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results)) contra_ref_sa_plus_consalifold_ppvs.insert(0, ppv) contra_ref_sa_plus_consalifold_senss.insert(0, sens) contra_ref_sa_plus_consalifold_fprs.insert(0, fpr) contra_ref_sa_plus_consalifold_f1_scores.append(f1_score) contra_ref_sa_plus_consalifold_mccs.append(mcc) image_dir_path = asset_dir_path + "/images" if not os.path.exists(image_dir_path): os.mkdir(image_dir_path) consalifold_avg_mccs = [ numpy.mean(clustalw_plus_consalifold_mccs), numpy.mean(mafft_plus_consalifold_mccs), numpy.mean(probcons_plus_consalifold_mccs), numpy.mean(mafft_xinsi_plus_consalifold_mccs), numpy.mean(ref_sa_plus_consalifold_mccs) ] contra_consalifold_avg_mccs = [ numpy.mean(contra_clustalw_plus_consalifold_mccs), numpy.mean(contra_mafft_plus_consalifold_mccs), numpy.mean(contra_probcons_plus_consalifold_mccs), numpy.mean(contra_mafft_xinsi_plus_consalifold_mccs), numpy.mean(contra_ref_sa_plus_consalifold_mccs) ] avg_mccs = consalifold_avg_mccs + contra_consalifold_avg_mccs data = { "Average Matthews correlation coefficient": avg_mccs, "Alignment probability inference method": ["Turner"] * 5 + ["CONTRAfold"] * 5, "Sequence alignment source": ["ClustalW", "MAFFT", "ProbCons-RNA ", "MAFFT X-INS-i", "Reference"] * 2 } data_frame = pandas.DataFrame(data=data) ax = seaborn.barplot(x="Sequence alignment source", y="Average Matthews correlation coefficient", hue="Alignment probability inference method", data=data_frame) ax.legend_.remove() fig = ax.get_figure() fig.tight_layout() fig.savefig(image_dir_path + "/consalifold_model_comparison_mcc_2.eps", 
bbox_inches="tight") fig.clf() consalifold_avg_f1_scores = [ numpy.mean(clustalw_plus_consalifold_f1_scores), numpy.mean(mafft_plus_consalifold_f1_scores), numpy.mean(probcons_plus_consalifold_f1_scores), numpy.mean(mafft_xinsi_plus_consalifold_f1_scores), numpy.mean(ref_sa_plus_consalifold_f1_scores) ] contra_consalifold_avg_f1_scores = [ numpy.mean(contra_clustalw_plus_consalifold_f1_scores), numpy.mean(contra_mafft_plus_consalifold_f1_scores), numpy.mean(contra_probcons_plus_consalifold_f1_scores), numpy.mean(contra_mafft_xinsi_plus_consalifold_f1_scores), numpy.mean(contra_ref_sa_plus_consalifold_f1_scores) ] avg_f1_scores = consalifold_avg_f1_scores + contra_consalifold_avg_f1_scores data = { "Average F1 score": avg_f1_scores, "Alignment probability inference method": ["Turner"] * 5 + ["CONTRAfold"] * 5, "Sequence alignment source": ["ClustalW", "MAFFT", "ProbCons-RNA ", "MAFFT X-INS-i", "Reference"] * 2 } data_frame = pandas.DataFrame(data=data) ax = seaborn.barplot(x="Sequence alignment source", y="Average F1 score", hue="Alignment probability inference method", data=data_frame) pyplot.ylim(0, 0.75) ax.legend(loc="upper left") fig = ax.get_figure() fig.tight_layout() fig.savefig(image_dir_path + "/consalifold_model_comparison_f1_score_2.eps", bbox_inches="tight") fig.clf() consalifold_mccs = clustalw_plus_consalifold_mccs + mafft_plus_consalifold_mccs + probcons_plus_consalifold_mccs + mafft_xinsi_plus_consalifold_mccs + ref_sa_plus_consalifold_mccs contra_consalifold_mccs = contra_clustalw_plus_consalifold_mccs + contra_mafft_plus_consalifold_mccs + contra_probcons_plus_consalifold_mccs + contra_mafft_xinsi_plus_consalifold_mccs + contra_ref_sa_plus_consalifold_mccs consalifold_f1_scores = clustalw_plus_consalifold_f1_scores + mafft_plus_consalifold_f1_scores + probcons_plus_consalifold_f1_scores + mafft_xinsi_plus_consalifold_f1_scores + ref_sa_plus_consalifold_f1_scores contra_consalifold_f1_scores = contra_clustalw_plus_consalifold_f1_scores + contra_mafft_plus_consalifold_f1_scores + contra_probcons_plus_consalifold_f1_scores + contra_mafft_xinsi_plus_consalifold_f1_scores + contra_ref_sa_plus_consalifold_f1_scores print("MCC-based paired t-test:", stats.ttest_rel(consalifold_mccs, contra_consalifold_mccs)) print("F1 score-based paired t-test:", stats.ttest_rel(consalifold_f1_scores, contra_consalifold_f1_scores))
def compare_images(label_nii_filename, image1_nii_filename, image2_nii_filename,
                   requested_labels, min_volume, verbose_flag=False):

    # Load arrays
    label_nii = label_stats.read_nifti_file(label_nii_filename, 'Label file does not exist')
    image1_nii = label_stats.read_nifti_file(image1_nii_filename, 'Image file does not exist')
    image2_nii = label_stats.read_nifti_file(image2_nii_filename, 'Image file does not exist')

    # System checks to verify that the array sizes and dimensions are compatible
    image1_array = image1_nii.get_data()
    image2_array = image2_nii.get_data()
    label_array = label_nii.get_data()

    label_stats.image_shape_check(image1_array)
    label_stats.image_shape_check(image2_array)

    if not image1_array.shape == image2_array.shape:
        sys.exit('Image arrays must have the same shape')
    if not len(label_array.shape) == 3:
        sys.exit('Only supports 3D label arrays')
    if not image1_array.shape[0:len(label_array.shape)] == label_array.shape:
        sys.exit('Image array and label array do not have the same voxel dimensions')

    # Find a set of acceptable labels
    labels = label_stats.get_labels(requested_labels, label_array)

    # Permute or expand the arrays so the desired stats run along the first dimension
    image1_array, nVolumes = label_stats.permute_image_array(image1_array)
    image2_array, nVolumes = label_stats.permute_image_array(image2_array)

    # Gather stats
    df_stats = pd.DataFrame(columns=(
        'label_number', 'time_index', 'label_volume',
        'boundary_image1_mean', 'boundary_image1_std', 'boundary_image1_min', 'boundary_image1_max',
        'boundary_image2_mean', 'boundary_image2_std', 'boundary_image2_min', 'boundary_image2_max',
        'image1_mean', 'image1_std', 'image1_min', 'image1_max',
        'image2_mean', 'image2_std', 'image2_min', 'image2_max',
        'scale', 'p_rel_scaled'))

    for ii, ii_label in enumerate(labels):
        mask = label_array == ii_label
        # One-voxel shell around the label: dilate, then remove the label itself
        boundary_mask = binary_dilation(mask, structure=np.ones((3, 3, 3))) & ~mask
        label_volume = np.sum(mask[:])

        # Only perform the paired t-test for volumes of a minimum size
        if label_volume >= min_volume:
            for jj in range(0, nVolumes):
                # Calculate signal intensity of boundary pixels
                boundary_image1_mean, boundary_image1_std, boundary_image1_min, boundary_image1_max = \
                    label_stats.individual_image_stats(image1_array[jj][boundary_mask])
                boundary_image2_mean, boundary_image2_std, boundary_image2_min, boundary_image2_max = \
                    label_stats.individual_image_stats(image2_array[jj][boundary_mask])

                # Scale image 2 to match image 1 over the boundary pixels
                scale = boundary_image1_mean / boundary_image2_mean

                image1_mean, image1_std, image1_min, image1_max = \
                    label_stats.individual_image_stats(image1_array[jj][mask])
                image2_mean, image2_std, image2_min, image2_max = \
                    label_stats.individual_image_stats(scale * image2_array[jj][mask])

                # Calculate the paired t-test over the region of interest across the two images
                t_rel, p_rel_scaled = stats.ttest_rel(image1_array[jj][mask],
                                                      scale * image2_array[jj][mask])

                # Save stats
                image_array_stats = [
                    ii_label, jj, label_volume,
                    boundary_image1_mean, boundary_image1_std, boundary_image1_min, boundary_image1_max,
                    boundary_image2_mean, boundary_image2_std, boundary_image2_min, boundary_image2_max,
                    image1_mean, image1_std, image1_min, image1_max,
                    image2_mean, image2_std, image2_min, image2_max,
                    scale, p_rel_scaled
                ]

                if verbose_flag:
                    print(image_array_stats)

                df_stats.loc[len(df_stats)] = image_array_stats

    return df_stats
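# Small demonstration of the boundary-mask construction used above: dilating
# the label mask and removing the mask itself leaves the one-voxel shell
# around the label.
import numpy as np
from scipy.ndimage import binary_dilation

toy = np.zeros((5, 5, 5), dtype=bool)
toy[2, 2, 2] = True
shell = binary_dilation(toy, structure=np.ones((3, 3, 3))) & ~toy
print(toy.sum(), shell.sum())  # 1 voxel inside, 26 voxels in the shell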
healthy_without_Zeros.append(a[2:])
cancer_without_Zeros.append(b[2:])

healthy_without_Zeros = [list(map(float, sublist)) for sublist in healthy_without_Zeros]
cancer_without_Zeros = [list(map(float, sublist)) for sublist in cancer_without_Zeros]
# print(cancer_without_Zeros)
# print(len(cancer_without_Zeros))
# print(Canser_names)

# Get the p-values when the samples are paired.
p_val_pair = []
for x, y in zip(healthy_without_Zeros, cancer_without_Zeros):
    p_val_pair.append(stats.ttest_rel(x, y).pvalue)

# Get the p-values when the samples are independent.
p_val_ind = []
for x, y in zip(healthy_without_Zeros, cancer_without_Zeros):
    p_val_ind.append(stats.ttest_ind(x, y).pvalue)

# Apply the FDR (Benjamini-Hochberg) multiple-test correction on the paired samples.
# multipletests returns a tuple: (reject: True for hypotheses rejected at the given
# alpha, corrected p-values, corrected alpha for the Sidak method, corrected alpha
# for the Bonferroni method).
corrected_p_valpair_rej = multipletests(p_val_pair, alpha=0.05, method='fdr_bh')[0]
corrected_p_val_pair = multipletests(p_val_pair, alpha=0.05, method='fdr_bh')[1]