from scipy import stats
from scipy.stats import ttest_rel

def main():
  acc_array = {'smo': [], 'bayes': [], 'tree': []}
  output = open("output_all.txt", 'w')
  output.write("classifier\taccuracy\t\tprecision(0)\t\tprecision(4)\t\trecall(0)\t\trecall(4)\n")
  for method in ['smo', 'bayes', 'tree']:
    output.write("\n")
    for i in range(10):
      with open(method + str(i) + ".txt", 'r') as f:
        lst = f.readlines()

      # the 2x2 confusion matrix sits on the last lines of each result file
      c00, c01 = (float(x) for x in lst[-3].strip().split()[:2])
      c10, c11 = (float(x) for x in lst[-2].strip().split()[:2])
      accuracy = (c00 + c11) / 1100.0  # denominator: total number of test instances
      acc_array[method].append(accuracy)

      precision0 = c00 / (c00 + c01)
      precision4 = c11 / (c10 + c11)
      recall0 = c00 / (c00 + c10)
      recall4 = c11 / (c01 + c11)
      output.write(method+"\t"+str(accuracy)+"\t"+str(precision0)+"\t"+str(precision4)+"\t"+str(recall0)+"\t"+str(recall4)+"\n")
  output.close()
  #print(acc_array)
  # paired t-tests on the ten per-run accuracies of each classifier pair
  s1 = stats.ttest_rel(acc_array['smo'], acc_array['bayes'])
  s2 = stats.ttest_rel(acc_array['smo'], acc_array['tree'])
  s3 = stats.ttest_rel(acc_array['tree'], acc_array['bayes'])
  print(s1)
  print(s2)
  print(s3)
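# Illustration (added, not part of the original script): stats.ttest_rel returns
# a (statistic, pvalue) pair for two paired samples of equal length. The
# accuracy values below are made-up numbers.
_t_demo, _p_demo = stats.ttest_rel([0.72, 0.75, 0.71, 0.74, 0.73],
                                   [0.74, 0.78, 0.72, 0.77, 0.75])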
def analyze(c, subset, db, popularity):
    query_count = c.shape[0]
    print('distinct query count: %d' % query_count)
    if popularity not in c:
        print('warning: popularity column not found')
        c[popularity] = 1
    s = c[popularity].sum()
    print('query count: %d' % s)
    r1 = c[subset] * c[popularity]
    r2 = c[db] * c[popularity]
    r3 = c['ql'] * c[popularity]
    r4 = c['ml'] * c[popularity]
    r5 = c['best'] * c[popularity]
    r6 = c['rand'] * c[popularity]
    print('set \t sub \t db \t ql \t ml \t best \t rand')
    print('all \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s, r4.sum() / s, r5.sum() /
           s, r6.sum() / s))
    b = c['Label'] == 1
    s = c[popularity][b].sum()
    r1 = c[subset][b] * c[popularity][b]
    r2 = c[db][b] * c[popularity][b]
    r3 = c['ql'][b] * c[popularity][b]
    r4 = c['ml'][b] * c[popularity][b]
    r5 = c['best'][b] * c[popularity][b]
    r6 = c['rand'][b] * c[popularity][b]
    print('bad \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s, r4.sum() / s, r5.sum() /
           s, r6.sum() / s))
    nb = c['Label'] == 0
    s = c[popularity][nb].sum()
    r1 = c[subset][nb] * c[popularity][nb]
    r2 = c[db][nb] * c[popularity][nb]
    r3 = c['ql'][nb] * c[popularity][nb]
    r4 = c['ml'][nb] * c[popularity][nb]
    r5 = c['best'][nb] * c[popularity][nb]
    r6 = c['rand'][nb] * c[popularity][nb]
    print('n_bad \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f \t %.2f' %
          (r1.sum() / s, r2.sum() / s, r3.sum() / s, r4.sum() / s, r5.sum() /
           s, r6.sum() / s))
    bad_count = b.value_counts()[True]
    print('%d distinct bad queries (%.2f %%)' %
          (bad_count, bad_count * 100 / query_count))
    ml_to_cache = c['ml_label'] * c[popularity]
    ql_to_cache = c['ql_label'] * c[popularity]
    best_to_cache = c['best'] * c[popularity]
    s = float(c[popularity].sum())
    print('queries sent to full db by ml: %.2f%%' % (ml_to_cache.sum() / s))
    print('queries sent to full db by ql: %.2f%%' % (ql_to_cache.sum() / s))
    print('queries sent to full db by best: %.2f%%' % (best_to_cache.sum() / s))
    print('queries with mrr > 0 on cache: %.2f%%' %
          (c[popularity][c[subset] > 0].sum() / c[popularity].sum()))
    print('queries with mrr > 0 on full db: %.2f%%' %
          (c[popularity][c[db] > 0].sum() / c[popularity].sum()))
    print('ml and rand ' + str(ttest_rel(c['ml'], c['rand'])))
    print('ql and rand ' + str(ttest_rel(c['ql'], c['rand'])))
    print('subset and rand ' + str(ttest_rel(c[subset], c['rand'])))
Example #3
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

def plot_cd_data(pre_arr, peri_arr, post_arr):

	# Custom function to draw the p-value bars
	def label_diff(i,j,text,X,Y):
		x = (X[i]+X[j])/2 ##center of the p-val bar
		y = max(Y[i], Y[j])
		
		props = {'connectionstyle':'bar','arrowstyle':'-',\
					 'shrinkA':20,'shrinkB':20,'lw':2}
		ax.annotate(text, xy=(x,y+0.1), zorder=10)
		ax.annotate('', xy=(X[i],y), xytext=(X[j],y), arrowprops=props)

	##create a numpy array containing the mean vals for the bar chart
	means = np.array([pre_arr.mean(), peri_arr.mean(), post_arr.mean()])
	##get the standard error values
	errs = np.array([stats.sem(pre_arr), stats.sem(peri_arr), stats.sem(post_arr)])
	##calculate the p-values between each of the sets
	p_pre_peri = np.round(stats.ttest_rel(pre_arr, peri_arr)[1], 3)
	p_pre_post = np.round(stats.ttest_rel(pre_arr, post_arr)[1], 3)
	p_peri_post = np.round(stats.ttest_rel(peri_arr, post_arr)[1], 3)
	##put all the arrays into one big array to plot the
	##individual lines
	all_arr = np.zeros((3,pre_arr.size))
	all_arr[0,:] = pre_arr
	all_arr[1,:] = peri_arr
	all_arr[2,:] = post_arr

	##formatting stuff
	idx  = np.arange(3)    # the x locations for the groups
	width= 0.8
	labels = ('Pre', 'CD', 'Reinstatement')

	# Pull the formatting out here
	bar_kwargs = {'width':width,'color':'g','linewidth':2,'zorder':5}
	err_kwargs = {'zorder':0,'fmt':'none','lw':2,'ecolor':'k'}  # fmt='none' draws error bars only

	X = idx+width/2 ##position of the center of the bars

	fig, ax = plt.subplots()
	ax.p1 = plt.bar(idx, means, alpha = 0.5, **bar_kwargs)
	ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)

	##plot the individual lines on their own axis
	ax2 = ax.twinx()
	ax2.lines = plt.plot(np.linspace(0,3,3), all_arr)
	ax2.set_ylabel("Percent correct")


	# Call the function
	label_diff(0,1,'p='+str(p_pre_peri),X,means)
	label_diff(0,2,'p='+str(p_pre_post),X,means)
	label_diff(1,2,'p='+str(p_peri_post),X,means)

	ax.set_ylim(ymax=means.max()+0.3)
	plt.xticks(X, labels, color='k')
	plt.title("Performance during contingency degredation")
	ax.set_ylabel("Percent correct")
	plt.show()
Example #4
def plot_fr_means(arrs1, arrs2, chunk1 = (0,10), chunk2 = (35,45), n = None):

	##grab the specified chunks
	arrs1_early = arrs1[:,chunk1[0]*60*1000:chunk1[1]*60*1000]
	arrs1_late = arrs1[:,chunk2[0]*60*1000:chunk2[1]*60*1000]
	arrs2_early = arrs2[:,chunk1[0]*60*1000:chunk1[1]*60*1000]
	arrs2_late = arrs2[:,chunk2[0]*60*1000:chunk2[1]*60*1000]
	##calculate the means across all the arrays (x1000: per-ms rate to Hz)
	means = np.array([arrs1_early.mean(),
		arrs2_early.mean(), arrs1_late.mean(),
		arrs2_late.mean()])*1000
	##get the across session means
	m_arrs1_early = arrs1_early.mean(axis = 1)*1000
	m_arrs2_early = arrs2_early.mean(axis = 1)*1000
	m_arrs1_late = arrs1_late.mean(axis = 1)*1000
	m_arrs2_late = arrs2_late.mean(axis = 1)*1000
	##get an array of SEM measurements for the error bars
	errs = np.array([stats.sem(m_arrs1_early,axis = None), 
		stats.sem(m_arrs2_early,axis = None),
		stats.sem(m_arrs1_late,axis = None), 
		stats.sem(m_arrs2_late, axis = None)])
	##calculate the t-tests
	p_e1s = stats.ttest_rel(m_arrs1_early, m_arrs1_late)
	p_e2s = stats.ttest_rel(m_arrs2_early, m_arrs2_late)
	p_e12_early = stats.ttest_rel(m_arrs1_early, m_arrs2_early)
	p_e12_late = stats.ttest_rel(m_arrs1_late, m_arrs2_late)
	##print the ttest results
	print("p_e1s = " + str(p_e1s))
	print("p_e2s = " + str(p_e2s))
	print("p_e12_early = " + str(p_e12_early))
	print("p_e12_late = " + str(p_e12_late))
	##plot the bar graph
	##formatting stuff
	idx  = np.arange(4)    # the x locations for the groups
	width= 0.8
	labels = ('E1 early', 'E2 early', 'E1 late', 'E2 late')

	# Pull the formatting out here
	bar_kwargs = {'width':width,'color':'g','linewidth':2,'zorder':5}
	err_kwargs = {'zorder':0,'fmt':'none','lw':2,'ecolor':'k'}

	X = idx+width/2 ##position of the center of the bars

	fig, ax = plt.subplots()
	ax.p1 = plt.bar(idx, means, alpha = 0.5, **bar_kwargs)
	ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)

	ax.set_ylim(ymax=means.max()+means.max()/6.0)
	plt.xticks(X, labels, color='k')
	plt.title("Average firing rate within sessions")
	ax.set_ylabel("FR (Hz)")
	if n is not None:
		plt.text(0.2, means.max()+means.max()/10, "n= "+str(n)+" sessions")

	plt.show()
Example #5
def write_transforms_to_file(transforms,filename="dummy_transforms.txt",min_pairs=3,p_level=0.05,std_min=0.0,id_assays=True,full_info=False):
    """
Write selected transformations to file.
min_pairs  : Minimum number of pairs per transformations
p_level    : Maximum p_value
std_min    : Minimum Standard deviation of differences within pairs
id_assays  : separately output statistics for using pairs from identical assays only
    """

    print "Writing significant transformations to file"
    if min_pairs < 2:
        print "At least 2 pairs per transformation are necessary for significance tests."
        print "min_pairs set to 2"
        min_pairs = 2

    header = "Transformation\tAssay_specific\tp-value\tAverage_Activity_Difference\tSigma_Differences\tnpairs"
    if full_info: header = header+"\tLigand_IDs\tlog(Activities[nM])\tAssay_Identity"
    header = header+"\n"
    f = open(filename,"w")
    f.write(header)

    for transf, pairs in transforms.items():
        if len(pairs["ligand_ids"]) < min_pairs: continue
        diffs = pairs["deltas"]
        npairs_all = len(diffs)
        p_all = stats.ttest_rel(diffs,[0.0 for i in diffs])[1]
        av_all = sum(diffs)/npairs_all
        std_all = stats.tstd(diffs)
        if npairs_all >= min_pairs and p_all <= p_level and std_all >= std_min:
            f.write(transf+"\t"+"mixed_assays"+"\t"+"{:4.2}".format(p_all)+"\t"+"{:4.3}".format(av_all)+"\t"+"{:4.2}".format(std_all)+"\t"+str(npairs_all))
            if full_info:
                for i in range(npairs_all): f.write("\t"+pairs["ligand_ids"][i][0]+":"+pairs["ligand_ids"][i][1])
                for i in range(npairs_all): f.write("\t"+"{:4.3}".format(pairs["activities1"][i])+":"+"{:4.3}".format(pairs["activities2"][i]))
                for i in range(npairs_all): f.write("\t"+str(pairs["assay_identity"][i]))
            f.write("\n")
        if not id_assays: continue
        diffs_id = list(set([pairs["deltas"][i] for i in range(npairs_all) if pairs["assay_identity"][i]]))
        npairs_id = len(diffs_id)
        if npairs_id < min_pairs: continue
        p_id = stats.ttest_rel(diffs_id,[0.0 for i in diffs_id])[1]
        av_id = sum(diffs_id)/npairs_id
        std_id = stats.tstd(diffs_id)
        if npairs_id >= min_pairs and p_id <= p_level and std_id >= std_min:
            f.write(transf+"\t"+"ident_assays"+"\t"+"{:4.2}".format(p_id)+"\t"+"{:4.3}".format(av_id)+"\t"+"{:4.2}".format(std_id)+"\t"+str(npairs_id))
            if full_info:
                for i in range(npairs_all):
                    if pairs["assay_identity"][i]: f.write("\t"+pairs["ligand_ids"][i][0]+":"+pairs["ligand_ids"][i][1])
                for i in range(npairs_all):
                    if pairs["assay_identity"][i]: f.write("\t"+"{:4.3}".format(pairs["activities1"][i])+":"+"{:4.3}".format(pairs["activities2"][i]))
                for i in range(npairs_all):
                    if pairs["assay_identity"][i]: f.write("\t"+str(pairs["assay_identity"][i]))
            f.write("\n")

    f.close()
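# Note (added): testing the paired differences against a zero vector, as above,
# is equivalent to a one-sample t-test of the differences against 0, i.e.
#
#     p_all = stats.ttest_1samp(diffs, 0.0)[1]
#
# Both compute t = mean(diffs) / (std(diffs, ddof=1) / sqrt(len(diffs))).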
Example #6
def decoder_perf_stats(data_dir='/auto/tdrive/mschachter/data'):

    df = pd.read_csv(os.path.join(data_dir, 'aggregate', 'decoder_perfs_for_glm.csv'))

    decomps = ['full_psds', 'spike_rate', 'spike_rate+spike_sync']

    aprops = ['maxAmp', 'meanspect', 'q2', 'q3', 'skewspect', 'q1', 'entropytime', 'entropyspect', 'skewtime',
              'sal', 'maxfund', 'cvfund', 'minfund', 'stdspect', 'fund', 'kurtosisspect', 'kurtosistime',
              'voice2percent', 'fund2']

    r2_vals_by_aprop = dict()
    for aprop in aprops:

        r2_vals = dict()
        for decomp in decomps:
            r2_vals[decomp] = list()

        i = (df.aprop == aprop) & (df.r2 > 0) & ~np.isnan(df.r2)
        g = df[i].groupby(['bird', 'block', 'segment', 'hemi'])
        for (bird, block, segment, hemi), gdf in g:
            if len(gdf) != len(decomps):
                print "Missing data for aprop=%s, (%s,%s,%s,%s), len(gdf)=%d" % (aprop, bird, block, segment, hemi, len(gdf))
                continue

            for decomp in decomps:
                ii = gdf.decomp == decomp
                assert ii.sum() == 1
                r2_vals[decomp].append(gdf[ii].r2.values[0])

        r2_vals_by_aprop[aprop] = r2_vals

    for aprop in aprops:

        r2_vals = r2_vals_by_aprop[aprop]

        lfp_r2 = np.array(r2_vals['full_psds'])
        spike_r2 = np.array(r2_vals['spike_rate'])
        sync_r2 = np.array(r2_vals['spike_rate+spike_sync'])

        lfp_vs_spike_t,lfp_vs_spike_p = ttest_rel(lfp_r2, spike_r2)
        lfp_vs_sync_t, lfp_vs_sync_p = ttest_rel(lfp_r2, sync_r2)
        spike_vs_sync_t, spike_vs_sync_p = ttest_rel(spike_r2, sync_r2)

        print('----------- %s ------------' % aprop)

        print('N=%d' % len(lfp_r2))
        print('lfp_r2 = %0.2f +/- %0.2f' % (lfp_r2.mean(), lfp_r2.std(ddof=1)))
        print('spike_r2 = %0.2f +/- %0.2f' % (spike_r2.mean(), spike_r2.std(ddof=1)))
        print('sync_r2 = %0.2f +/- %0.2f' % (sync_r2.mean(), sync_r2.std(ddof=1)))

        print('LFP vs Spike: t=%0.6f, p=%0.6f' % (lfp_vs_spike_t, lfp_vs_spike_p))
        print('LFP vs Spike+Sync: t=%0.6f, p=%0.6f' % (lfp_vs_sync_t, lfp_vs_sync_p))
        print('Spike vs Spike+Sync: t=%0.6f, p=%0.6f' % (spike_vs_sync_t, spike_vs_sync_p))
Example #7
def directional(M, window=None, circ=False, extrapolate=True):
    """From a symmetrical matrix M of size n, return a vector d whose each 
    component d[i] is a T-test of two samples represented by vectors of size
    window on either side of the i-th pixel on the diagonal. Edge elements may 
    be extrapolated based on the vector size reduction, except in the case of 
    circular genomes. If they aren't, d will be of size n - 2*(window-1) 
    instead of n.
    """
    # Sanity checks
    if not isinstance(M, np.ndarray):
        M = np.array(M)

    if M.shape[0] != M.shape[1]:
        raise ValueError("Matrix is not square.")

    try:
        n = min(M.shape)
    except AttributeError:
        n = M.size

    # Default window argument
    if window is None:
        window = max(n // 100, 5)

    if window >= n:
        raise ValueError("Please choose a smaller window size.")

    try:
        from scipy.stats import ttest_rel
    except ImportError as e:
        print("I couldn't import scipy's stats module which is needed to compute directionality index.")
        print(str(e))
        raise

    if circ:
        d = [ttest_rel(np.array(list(M[i, i - window:]) + list(M[i, :i])),
                       M[i, i:i + window])[0] for i in range(window)]
    elif extrapolate:
        d = [ttest_rel(M[i, 0:i], M[i, i:2 * i])[0] for i in range(window)]
    else:
        d = []

    d += [ttest_rel(M[i, i - window:i], M[i, i:i + window])[0]
          for i in range(window, n - window)]

    if circ:
        d += [ttest_rel(M[i, i - window:i], M[i, i:i + window])[0]
              for i in range(window)]
    elif extrapolate:
        d += [ttest_rel(M[i, i - window:i], np.array(list(M[i, i:]) +
                                                     list(M[i, :window - (n - i)])))[0] for i in range(n - window, n)]

    return d
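# Minimal usage sketch (added; illustrative data only): directionality index of
# a random symmetric 50x50 matrix with a window of 5 and no edge extrapolation.
import numpy as np

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    A = rng.random((50, 50))
    A = (A + A.T) / 2  # symmetrize
    di = directional(A, window=5, extrapolate=False)
    print(len(di))  # n - 2*window values, one per interior diagonal pixel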
Example #8
 def output_stats(self):
     '''
         Compute and store all statistics in a csv file. 
         
         Order: first the adaptive measure, then Resnik, Lin, Jiang, and simGIC (first all of them wrt biological process sub-ontology, 
         then all of them wrt molecular function sub-ontology, finally all of them wrt cellular component sub-ontology).
     '''
     
     stats = np.zeros((13, 8))
     pvals = np.zeros(8)
     c = 0
     
     all_learners = [self.mytr] + sum([[self.resniktr[root], self.lintr[root], self.jiangtr[root], self.simgictr[root]] for root in ['BIO', 'MOL', 'CEL']], [])
     
     best_prec = 1
     best_rec = 1
     best_f1 = 1
     best_area = 1
     
     
     
     for h in all_learners:
         stats[c,0] = np.mean(h.precisions)
         stats[c,1] = np.std(h.precisions)
         stats[c,2] = np.mean(h.recalls)
         stats[c,3] = np.std(h.recalls)
         stats[c,4] = np.mean(h.f1s)
         stats[c,5] = np.std(h.f1s)
         stats[c,6] = np.mean(h.areas)
         stats[c,7] = np.std(h.areas)
         
         if c > 0: 
             if stats[c,0] > stats[best_prec, 0]:
                 best_prec = c
             if stats[c,2] > stats[best_rec, 2]:
                 best_rec = c
             if stats[c,4] > stats[best_f1, 4]: 
                 best_f1 = c
             if stats[c, 6] > stats[best_area, 6]:
                 best_area = c
                 
         c+=1
     
     np.savetxt(statfiles[self.species], stats, delimiter='\t', header='precision \t std \t recall \t std \t F1 \t std \t ROC area \t std')
     
     pvals[0], pvals[1] = ttest_rel(self.mytr.precisions, all_learners[best_prec].precisions)
     pvals[2], pvals[3] = ttest_rel(self.mytr.recalls, all_learners[best_rec].recalls)
     pvals[4], pvals[5] = ttest_rel(self.mytr.f1s, all_learners[best_f1].f1s)
     pvals[6], pvals[7] = ttest_rel(self.mytr.areas, all_learners[best_area].areas)
     
     np.savetxt(pvalfiles[self.species], np.expand_dims(pvals, 0), delimiter='\t', header = 'precision \t p-value \t recall \t  p-value \t F1 \t p-value \t ROC area \t p-value')
     
     return stats
Example #9
def run_statistics(voxel_name):
    # Load dataframes
    controls_a = pd.read_csv(os.path.join(statistics_dir, '%s_controls_a.csv'%voxel_name ), index_col = 0)
    controls_b = pd.read_csv(os.path.join(statistics_dir, '%s_controls_b.csv'%voxel_name ), index_col = 0)

    # Run a paired t-test on QUALITY PARAMETERS
    ttpaired_fwhm = stats.ttest_rel(controls_a['Linewidth'], controls_b['Linewidth'])
    ttpaired_snr  = stats.ttest_rel(controls_a['SNR'], controls_b['SNR'])

    print("FWHM: T-statistic is %.3f and the p-value is %.3f." % ttpaired_fwhm)
    print("SNR: T-statistic is %.3f and the p-value is %.3f." % ttpaired_snr)

    return controls_a, controls_b
Example #10
def main():
    file1 = 'data/output1.json'
    file2 = 'data/output2.json'
    file05 = 'data/output05.json'

    def load_fees(path):
        # fee of the first transaction in every block
        with open(path) as f:
            data = json.loads(f.read())
        return [float(block['transactions'][0]['fee']) for block in data['blocks']]

    control_fees = load_fees(file1)
    exp2_fees = load_fees(file2)
    exp05_fees = load_fees(file05)

    t, p = stats.ttest_rel(control_fees, exp2_fees)
    print('--------------------------------------------')
    print('Comparing the control group to the 2x Group |')
    print('T Stat: {}'.format(t))
    print('P value: {}'.format(p))
    print('--------------------------------------------')

    t, p = stats.ttest_rel(control_fees, exp05_fees)
    print('--------------------------------------------')
    print('Comparing the control group to the 0.5x Group |')
    print('T Stat: {}'.format(t))
    print('P value: {}'.format(p))
    print('--------------------------------------------')

    control_mean = sum(control_fees) / len(control_fees)
    exp2_mean = sum(exp2_fees) / len(exp2_fees)

    print('Control mean: {}'.format(control_mean))
    print('Exp2 Mean: {}'.format(exp2_mean))
Example #11
    def stats_test(self, agg, test='ttest'):
        d = agg.shape[0]

        if test == 'ttest':
            # 2-tail T-Test; one row per unordered pair of conditions
            npairs = agg.shape[1]*(agg.shape[1]-1)//2
            ttest = (np.zeros((npairs, agg.shape[2])),
                     np.zeros((npairs, agg.shape[2])))
            ii = 0
            for c1 in range(agg.shape[1]):
                for c2 in range(c1+1,agg.shape[1]):
                    thisTtest = stats.ttest_rel(agg[:,c1,:], agg[:,c2,:], axis = 0)
                    ttest[0][ii,:] = thisTtest[0]
                    ttest[1][ii,:] = thisTtest[1]
                    ii += 1
            ttestPrint(title = '**** 2-tail T-Test of related samples ****',
                values = ttest, plotOpt = plotOpt,
                type = 2)

        elif test == 'ttest_1samp':
            # One-sample t-test
            m = .5
            oneSample = stats.ttest_1samp(agg, m, axis = 0)
            ttestPrint(title = '**** One-sample t-test: difference from %.2f ****' %m,
                values = oneSample, plotOpt = plotOpt, type = 1)

        elif test == 'binomial':
            # Binomial test
            binom = np.apply_along_axis(stats.binom_test,0,agg)
            print(binom)
            return binom
Example #12
def plotTrials(data, fish, CSname, key, step, offset=0, pp=None):
    fig = figure(figsize=(12,8), facecolor='w')
    
    ax1 = fig.add_subplot(121) # raw trace
    ax2 = fig.add_subplot(222) # learning curve
    ax3 = fig.add_subplot(224) # bar plot
    preP, postP, postP2 = [], [], []
    longestUS = 0
    for n, measurement in enumerate(data[fish][CSname]):
        tr = n+1
        CS, USs, preRange = measurement['events']
        subplot(ax1)
        mi = -step*(tr-1)
        ma = mi + step
        drawLines(mi, ma, (preRange, [preRange+(USs[0]-CS)], preRange))
        longestUS = max([us-CS+preRange*3//2 for us in USs]+[longestUS])
        
        # 'measurement[key]': vector around the CS timing (+/-) preRange. i.e., preRange is the center
        ax1.plot(measurement[key]-step*(tr-1)+offset)
        title(CSname+': '+key)                                                                  # cf. preRange = 3600 frames
        pre = measurement[key][:preRange].mean()+offset                                       # 2 min window
        post = measurement[key][preRange:preRange+(USs[0]-CS)].mean()+offset                  # 23 s window
        post2 = measurement[key][preRange+(USs[0]-CS):preRange*3//2+(USs[0]-CS)].mean()+offset # 1 min window after US
        preP.append(pre)
        postP.append(post)
        postP2.append(post2)

        ax3.plot([1, 2, 3], [pre, post, post2],'o-')

    ax1.set_xlim([0,longestUS])
    ax1.axis('off')

    subplot(ax2)
    x = range(1, tr+1)
    y = np.diff((preP,postP), axis=0).ravel()
    ax2.plot( x, y, 'ko-', linewidth=2 )
    ax2.plot( x, np.zeros_like(x), '-.', linewidth=1, color='gray' )
    # grid()
    slope, intercept, rvalue, pval, stderr = stats.linregress(x, y)
    title('slope = zero? p-value = %f' % pval)
    ax2.set_xlabel("Trial#")
    ax2.set_xlim([0.5,tr+0.5])
    ax2.set_ylabel('CS - pre')
    
    subplot(ax3)
    ax3.bar([0.6, 1.6, 2.6], [np.nanmean(preP), np.nanmean(postP), np.nanmean(postP2)], facecolor='none')
    t, pval = stats.ttest_rel(postP, preP)
    title('paired t p-value = %f' % pval)
    ax3.set_xticks([1,2,3])
    ax3.set_xticklabels(['pre', CSname, measurement['matchedUSname']])
    ax3.set_xlim([0.5,3.5])
    ax3.set_ylabel('Raw mean values')

    tight_layout(2, h_pad=1, w_pad=1)
    
    if pp:
        fig.savefig(pp, format='pdf')
    close('all')

    return np.vstack((preP, postP, postP2))
Example #13
def compare_normalised(model_score_path, hemisphere, weight, lateral, atlas, y_file, list_compare):

    jl_file_name = 'r2score_{h}_{atlas}_{lateral}_connmat_{y}_weighted{w}.jl'.format(h=hemisphere, atlas=atlas,
                                                                                     lateral=lateral,y=y_file,w=weight)
    print(jl_file_name)
    jl_path = op.join(model_score_path, jl_file_name)
    if op.isfile(jl_path):
        jl = joblib.load(jl_path)

        print "%s vs %s " %(list_compare[0], list_compare[1])
        a = jl[list_compare[0]]
        b = jl[list_compare[1]]
        a = a.astype(float)
        b = b.astype(float)

        if 'AHS22' in a.index:
            a = a.drop('AHS22')
            b = b.drop('AHS22')

        t, p = stats.ttest_rel(a, b)
        result_str = cs.get_difference_ttest(list_compare[0], list_compare[1], t, float(p))

        a_score = cs.convert_to_dataframe(a, hemisphere, lateral, atlas, list_compare[0], y_file, weight)
        b_score = cs.convert_to_dataframe(b, hemisphere, lateral, atlas, list_compare[1], y_file, weight)
        all_score = pd.concat([a_score, b_score])

        return all_score, {'p':p, 't':t, 'result':result_str}
    else:
        print('%s does not exist' % jl_path)
Example #14
def ttest(list1, list2):
   a1 = np.array(list1)
   a2 = np.array(list2)
   diff = a1 - a2
   t, prob = stats.ttest_rel(a1, a2)
   print(np.mean(diff), np.std(diff), t, prob)
   return np.mean(diff), np.std(diff), t, prob
Example #15
def plot_modulation_depth(arr_early, arr_late, sigma):
	arr_early = ss.zscored_fr(arr_early, sigma).max(axis = 0)
	arr_early = np.nan_to_num(arr_early)
	arr_late = ss.zscored_fr(arr_late, sigma).max(axis = 0)
	arr_late = np.nan_to_num(arr_late)
	if arr_early.size > arr_late.size:
		arr_early = np.random.choice(arr_early, size = arr_late.size, replace = False)
	if arr_late.size > arr_early.size:
		arr_late = np.random.choice(arr_late, size = arr_early.size, replace = False)
	early_sem = stats.sem(arr_early)
	early_mean = arr_early.mean()
	late_sem = stats.sem(arr_late)
	late_mean = arr_late.mean()
	p_val = stats.ttest_rel(arr_early, arr_late)
	print("p val is = " + str(p_val))
	# Pull the formatting out here
	width = 0.8	
	bar_kwargs = {'width':width,'color':'g','linewidth':2,'zorder':5}
	err_kwargs = {'zorder':0,'fmt':'none','lw':2,'ecolor':'k'}
	means = np.array([early_mean, late_mean])
	errs = np.array([early_sem, late_sem])
	idx = np.arange(2)
	X = idx+width/2	
	labels = ['E1 early', 'E1 late']
	plt.bar(idx, means, alpha = 0.5,**bar_kwargs)
	plt.errorbar(X, means, yerr = errs,**err_kwargs)
	plt.xticks(X, labels)
	plt.ylabel('z-scored modulation depth')
	plt.title('Change in modulation depth from early in session to late in session')
	plt.show()
Example #16
def eeg_twosample_ttest(array1,array2):
	"""  Paired t-test comparing the values of two EEG data-sets

	Args:
		array1 :  contains EEG data of multiple subjects (nsub x ntpts)
		array2 :  contains EEG data of the same subjects (nsub x ntpts) but from a different condition

	Returns:
		t : t-values, one for each of the timepoints
		p : p-values, also one for each of the timepoints

	Dependence:
		scipy.stats.ttest_rel

	"""
	from scipy.stats import ttest_rel
	import numpy as np  # needed for the preallocated result arrays below
	s1 = array1.shape
	p = np.zeros(s1[1])
	t = np.zeros(s1[1])
	for i in range(s1[1]):
		tval,pval = ttest_rel(array1[:,i],array2[:,i])
		p[i]=pval
		t[i]=tval
		
	return t,p
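# Usage sketch (added; made-up data): two conditions for 12 subjects over 100
# timepoints; the paired test runs independently at every timepoint.
import numpy as np

if __name__ == "__main__":
    cond_a = np.random.randn(12, 100)
    cond_b = np.random.randn(12, 100) + 0.3
    t_vals, p_vals = eeg_twosample_ttest(cond_a, cond_b)
    print(p_vals.min())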
Example #17
def compute_paired_ttest(best_test_score):
    """
    """
    
    df_col_name = ['ind_vs_union', 'ind_vs_mtl', \
        'ind_vs_mtmkl', 'union_vs_mtl', 'union_vs_mtmkl', \
        'mtl_vs_mtmkl']
    pairs_test = [('individual', 'union'), ('individual', 'mtl'), \
        ('individual', 'mtmkl'), ('union', 'mtl'), ('union', 'mtmkl'), \
        ('mtl', 'mtmkl')]
    org_names = best_test_score.keys()

    ttest_p_val = numpy.zeros((len(org_names), len(pairs_test)))
    #ttest_p_val = numpy.zeros((len(pairs_test), len(org_names)))

    for org_idx, org_code in enumerate(org_names):
        meth_perf = best_test_score[org_code]

        for pair_idx, rel_pair in enumerate(pairs_test):
            t_stats, p_val = stats.ttest_rel(meth_perf[rel_pair[0]], meth_perf[rel_pair[1]])
            
            ttest_p_val[org_idx, pair_idx] = p_val
            #ttest_p_val[pair_idx, org_idx] = p_val
        
    
    df_pval = pandas.DataFrame(ttest_p_val, columns=df_col_name, index=org_names)
    #df_pval = pandas.DataFrame(ttest_p_val, columns=org_names, index=pairs_test)
    
    return df_pval 
Example #18
def TTestPaired(data1, data2):
    for i in range(len(data1)):
        if data1[i] is ma.masked:
            data2[i] = ma.masked
        elif data2[i] is ma.masked:
            data1[i] = ma.masked
    c1 = Count(data1)
    c2 = Count(data2)
    if c1 != c2:
        df = 0
        t = 1.0
        prob = -1.0
        d = 0.0
    else:
        df = c1 - 1
        # covariance of the two paired samples
        cov = Sum((data1-Mean(data1))*(data2-Mean(data2))) / df
        sd = math.sqrt((SampVar(data1)+SampVar(data2)-2.0 * cov)/float(c1))
        diff = data1 - data2
        try:
            t, prob = stats.ttest_rel(data1, data2)
            d = Mean(diff) / SampStdDev(diff)
        except ZeroDivisionError:
            t = 0.0
            prob = 1.0
    result = {}
    result['t'] = t
    result['df'] = df
    result['prob'] = prob
    result['d'] = d
    result['quote'] = "<b>Quote: </b> <i>t</i> (%d) = %.3f, <i>p</i> = %1.4f, d = %.3f<br />"
    result['quotetxt'] = "Quote: t (%d) = %.3f, p = %1.4f, d = %.3f\n"
    return result
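# Note (added): result['d'] above is Cohen's d for paired samples,
# d = mean(data1 - data2) / sd(data1 - data2), which equals t / sqrt(n).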
Example #19
def compare_omission(mt_para_corpus, si_para_corpus, lang):
   tag_weights, tok_weights = get_omission_weights(mt_para_corpus, si_para_corpus, lang)

   mask = []
   for mt_sent_pair, si_sent_pair in zip(mt_para_corpus.sent_pairs, si_para_corpus.sent_pairs):
      if mt_sent_pair.good_alignment and si_sent_pair.good_alignment:
         mask.append(True)
      else:
         mask.append(False)
   mt_omit, mt_omit_detail, mt_omit_tok = count_omission(mask, mt_para_corpus, tag_weights, tok_weights, lang)
   si_omit, si_omit_detail, si_omit_tok = count_omission(mask, si_para_corpus, tag_weights, tok_weights, lang)

   top_k = 10
   print('MT tag omissions:')
   print(u'\n'.join(['%s\t%f' % (x[0], x[1]) for x in mt_omit if tag_weights[x[0]] > 0]))
   print(u'MT tok omissions:')
   print(u'\n'.join(['%s\t%f' % (x[0], x[1]) for x in mt_omit_tok[:top_k] if tok_weights[x[0]] > 0]))
   print('SI tag omissions:')
   print(u'\n'.join(['%s\t%f' % (x[0], x[1]) for x in si_omit if tag_weights[x[0]] > 0]))
   print('SI tok omissions:')
   print(u'\n'.join(['%s\t%f' % (x[0], x[1]) for x in si_omit_tok[:top_k] if tok_weights[x[0]] > 0]))

   print('Sentence omission stats:')
   for tag in tag_weights:
      if tag_weights[tag] > 0:
         mt_mean = sum(mt_omit_detail[tag])
         si_mean = sum(si_omit_detail[tag])
         t, prob = stats.ttest_rel(mt_omit_detail[tag], si_omit_detail[tag])
         if prob < 0.05:
            print(u'%s\t%f\t%f\t%f\t%f' % (tag, mt_mean, si_mean, t, prob))
Example #20
def compare_poincare_baker():

    """
    Poincare's 366 daily bread weighings, modeled as a normal distribution
    with mean 950 g and standard deviation 50 g; the baker's loaves are the
    heaviest of 4 draws from the same distribution each day.
    """

    poincare_sample = [ random.normalvariate(950, 50) for i in range(366) ]
    poincare_sample_cdf = MakeCdfFromList(poincare_sample, 'poincare')

    baker_sample = []
    for i in range(366):
        baker_sample.append(max(random.normalvariate(950, 50) for i in range(4)))
    baker_cdf = MakeCdfFromList(baker_sample)

    print(poincare_sample_cdf.Mean(), baker_cdf.Mean())
    myplot.Clf()
    myplot.Cdfs([poincare_sample_cdf, baker_cdf])
    pyplot.xlim(600, 1120)
    pyplot.legend(loc=0)
    myplot.SaveFormat('../resources/plots/poincare_vs_baker', 'png')

    # paired t-test between the daily samples
    t_test = stats.ttest_rel(poincare_sample, baker_sample)

    print("t-test statistic is %s with a p-value %s" % t_test)
Example #21
def mushconditions2compare(roi, conditiondict, data, dataindex, subjindex, roiindex, condindex, subjlist):
    subjlist = [s for s in subjlist if s in set([d[subjindex] for d in data])]
    mushes = []
    means = []
    sems = []
    condlabels = []
    f, ax = plt.subplots(figsize=[3, 6])
    for mushedcondn, mushedcond in enumerate(conditiondict.keys()):
        condlabels.append(mushedcond)
        conds = conditiondict[mushedcond]['conds']
        color = conditiondict[mushedcond]['color']
        subjmushes = []
        for subj in subjlist:
            subjbetas = np.array([d[dataindex] for d in data if d[condindex] in conds and d[subjindex] == subj])
            subjbetas = subjbetas.astype('float64')
            subjmushes.append(np.mean(subjbetas))
        mushes.append(subjmushes)
        condstd = np.std(subjmushes, ddof=1)
        condsem = condstd / np.sqrt(len(subjmushes))
        sems.append(condsem)
        plotmeans = [0, 0]
        plotsems = [0, 0]
        plotmeans[mushedcondn] = np.mean(subjmushes)
        plotsems[mushedcondn] = condsem
        ax.bar(range(len(plotmeans)), plotmeans, yerr=plotsems, color=color, error_kw={'ecolor': color})
    ax.set_title(data[0][roiindex])
    ax.set_xlim([0, len(plotmeans)])
    ax.set_xticks(np.arange(len(plotmeans)) + .5)
    ax.set_xticklabels(condlabels, rotation=90)
    array1 = mushes[0]
    array2 = mushes[1]
    df = len(array1) - 1
    t, p = sst.ttest_rel(array1, array2)
    result = roi + ' :' + condlabels[0] + '-' + condlabels[1] + ': t(%.0f)=%.3f, p=%.3f.' % (df, t, p)
    print(result)
Example #22
    def accept(self):
        self.no_exeption = False
        self.con = float(self.con_edit.text())
        first_sample, counts = self.dataset.GetNumericValues(self.currentGroup)
        second_sample, counts2 = self.dataset.GetNumericValues(self.currentVar)
        if first_sample == second_sample:
            self.no_exeption = True
            QtGui.QMessageBox.warning(self, u'Warning', u'Please select different samples for the test!', QtGui.QMessageBox.Cancel,
                                QtGui.QMessageBox.NoButton, QtGui.QMessageBox.NoButton)

        else:
            self.t_score, self.pvalue = stats.ttest_rel(first_sample, second_sample)
            if len(first_sample) < len(second_sample):
                self.df = len(first_sample)-1
            else:
                self.df = len(second_sample)-1

            mean1 = sum(first_sample)/len(first_sample)
            mean2 = sum(second_sample)/len(second_sample)
            self.means = [mean1, mean2]

            if self.radio_noteq.isChecked():
                pass
            elif self.radio_greater.isChecked():
                self.pvalue /= 2
            elif self.radio_less.isChecked():
                self.pvalue /= 2

            self.P_obs = stats.t.ppf(1-self.con, self.df)
Example #23
def directional(A, nw):
    n1 = A.shape[0]
    print("Size of the matrix entered for the directional index:")
    print(n1)
    signal1 = np.zeros((n1, 1))

    for i in range(0, n1):
        vect_left = []
        vect_right = []

        # window of nw bins left of the diagonal, with circular wrap-around
        for k in range(i-1, i-nw-1, -1):
            kp = k
            if k < 0:
                kp = n1 + k
            if A[i, kp] > 0:
                vect_left.append(math.log(A[i, kp]))
            else:
                vect_left.append(0)

        # window of nw bins right of the diagonal, with circular wrap-around
        for k in range(i+1, i+nw+1):
            kp = k
            if k >= n1:
                kp = k - n1
            if A[i, kp] > 0:
                vect_right.append(math.log(A[i, kp]))
            else:
                vect_right.append(0)

        if sum(vect_left) != 0 and sum(vect_right) != 0:
            signal1[i] = stats.ttest_rel(vect_right, vect_left)[0]
        else:
            signal1[i] = 0

    return signal1
Example #24
def ttest(filename1, filename2):
    qids1, values1 = load_evaluation_file(filename1)
    qids2, values2 = load_evaluation_file(filename2)

    if qids1.shape[0] != qids2.shape[0]:
        raise ValueError("number of queries in files do not match (%d != %d)" % (qids1.shape[0], qids2.shape[0]))

    qids1_sort_idxs = np.argsort(qids1)
    qids2_sort_idxs = np.argsort(qids2)

    qids1 = qids1[qids1_sort_idxs]
    qids2 = qids2[qids2_sort_idxs]

    if np.any(qids1 != qids2):
        raise ValueError("files do not contain the same queries")

    values1 = values1[qids1_sort_idxs]
    values2 = values2[qids2_sort_idxs]

    mean1 = np.mean(values1)
    mean2 = np.mean(values2)

    t_statistic, p_value = ttest_rel(values1, values2)

    return values1.shape[0], mean1, mean2, t_statistic, p_value
Example #25
    def significance(self, fun, other, test="wilcoxon"):
        """computes stats significance of difference between two sets
        of scores test can be paired wilcoxon, mannwhitney for indep
        samples, or paired ttest.
        """
        scores1 = self.map_doc_scores(fun)
        scores2 = other.map_doc_scores(fun)
        if isinstance(scores1[0], float) or isinstance(scores1[0], int):
            pass
        else:
            # TODO: this is suspicious
            scores1 = [x for x, _ in scores1]
            scores2 = [x for x, _ in scores2]

        # differences = [(x, y) for (x, y) in zip(scores1, scores2) if x != y]
        # print(difference, file=sys.stderr)
        # print(d2, file=sys.stderr)
        # print([x for (i,x) in enumerate(d1) if x!=d2[i]], file=sys.stderr)
        assert len(scores1) == len(scores2)

        results = {}
        if test == "wilcoxon" or test == "all":
            results["wilcoxon"] = wilcoxon(scores1, scores2)[1]
        if test == "ttest" or test == "all":
            results["paired ttest"] = ttest_rel(scores1, scores2)[1]
        if test == "mannwhitney" or test == "all":
            results["mannwhitney"] = mannwhitneyu(scores1, scores2)[1]
        return results
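    # Hedged usage sketch (added): assuming two report objects exposing
    # map_doc_scores, and a hypothetical scoring function `doc_f1`, one call
    # runs all three tests at once:
    #
    #     results = report_a.significance(doc_f1, report_b, test="all")
    #     # -> {"wilcoxon": ..., "paired ttest": ..., "mannwhitney": ...}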
Example #26
def compute(growth_npz, debug):
  from dataloader import DataLoader

  loader = DataLoader(growth_npz)
  mat = loader.matrix

  # we capture 1 scans extra of information, so ignore it
  tvec = mat[:-1, 0]
  residual_vec = mat[:-1, 5]

  dt = (tvec[2]-tvec[0])/2

  US_exp_window = 10 * 60 // dt

  n = (residual_vec.size - US_exp_window) // 2

  residual_vec_pre_US = residual_vec[:n]
  residual_vec_post_US = residual_vec[-n:]

  u0 = np.mean(residual_vec_pre_US)
  sd0 = np.std(residual_vec_pre_US)

  u1 = np.mean(residual_vec_post_US)
  sd1 = np.std(residual_vec_post_US)

  print(growth_npz)
  print('\tMean residual up to', (tvec[n-1] - tvec[0])/60, 'min')
  print(bcolors.OKGREEN + '\t\t%.2f' % u0, '\tSD %.2f' % sd0, bcolors.ENDC)

  print('\tMean residual from', (tvec[-n] - tvec[0])/60, 'min')
  print(bcolors.OKGREEN + '\t\t%.2f' % u1, '\tSD %.2f' % sd1, bcolors.ENDC)

  print('\tu_1 - u_0')
  # http://stattrek.com/sampling/difference-in-means.aspx?tutorial=ap
  # http://onlinestatbook.com/2/sampling_distributions/samplingdist_diff_means.html
  # Note that the result is the standard error of the mean, which is the
  # standard deviation in difference of mean
  sd_diff = (sd0**2/residual_vec_pre_US.size + sd1**2/residual_vec_post_US.size)**0.5
  print(bcolors.WARNING + '\t\t%.2f' % (u1-u0), '\tSEM=%.2f' % sd_diff, bcolors.ENDC)

  import scipy.stats as stats
  print('\tDependent t-test')
  t, p = stats.ttest_rel(residual_vec_pre_US, residual_vec_post_US)
  print('\t\t' + bcolors.FAIL, 't=%.2f, p=%.4f' % (t, p), bcolors.ENDC)

  if debug:
    import matplotlib.pyplot as plt
    import matplotlib_setup
    from utils import keypress

    plt.plot(residual_vec_pre_US, label='pre')
    plt.plot(residual_vec_post_US, label='post')  # overplotting is the default; plt.hold was removed from matplotlib
    plt.legend()

    plt.gcf().canvas.mpl_connect('key_press_event', keypress)

    plt.show()
    plt.close()
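# Aside (added): for paired samples the dependent t-test above is equivalent to
# t = mean(d) / sem(d) with d the per-scan differences:
#
#     d = residual_vec_post_US - residual_vec_pre_US
#     t = np.mean(d) / (np.std(d, ddof=1) / np.sqrt(d.size))
#
# which matches stats.ttest_rel(residual_vec_pre_US, residual_vec_post_US)[0]
# up to sign.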
Example #27
def reaction_times_second_step(sessions, fig_no = 1):
    'Reaction times for second step pokes as function of common / rare transition.'
    sec_step_IDs = ut.get_IDs(sessions[0].IDs, ['right_active', 'left_active'])
    median_RTs_common = np.zeros(len(sessions))
    median_RTs_rare   = np.zeros(len(sessions))
    for i,session in enumerate(sessions):
        event_times = ut.get_event_times(session.time_stamps, session.event_codes, session.IDs)
        left_active_times = event_times['left_active']
        right_active_times = event_times['right_active']
        left_reaction_times  = _latencies(left_active_times,  event_times['left_poke'])
        right_reaction_times = _latencies(right_active_times, event_times['right_poke'])
        ordered_reaction_times = np.hstack((left_reaction_times,right_reaction_times))\
                                 [np.argsort(np.hstack((left_active_times,right_active_times)))]
        transitions = session.blocks['trial_trans_state'] == session.CTSO['transitions']  # common vs rare.                 
        median_RTs_common[i] = np.median(ordered_reaction_times[ transitions])
        median_RTs_rare[i]    = np.median(ordered_reaction_times[~transitions])
    mean_RT_common = 1000 * np.mean(median_RTs_common)
    mean_RT_rare   = 1000 * np.mean(median_RTs_rare)
    SEM_RT_common = 1000 * np.sqrt(np.var(median_RTs_common) / len(sessions))
    SEM_RT_rare   = 1000 * np.sqrt(np.var(median_RTs_rare)   / len(sessions))
    p.figure(fig_no)
    p.bar([1,2],[mean_RT_common, mean_RT_rare], yerr = [SEM_RT_common,SEM_RT_rare])
    p.xlim(0.8,3)
    p.ylim(mean_RT_common * 0.8, mean_RT_rare * 1.1)
    p.xticks([1.4, 2.4], ['Common', 'Rare'])
    p.title('Second step reaction times')
    p.ylabel('Reaction time (ms)')
    print('Paired t-test P value: {}'.format(ttest_rel(median_RTs_common, median_RTs_rare)[1]))
Example #28
def test_paired_ttest_with_diff_sums(data):
    model, X_test = data

    pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
    nb_pairs = len(pairs)

    nb_features, nb_classes, nb_cases = 1717, 4, 20
    batch_size = 5
    process_X_data_func_args = {'nb_features': nb_features}

    dlc_gen = deeplift_contribs_generator(model, X_test, 
        process_X_data_func=process_X_data, nb_features=nb_features, 
        nb_classes=nb_classes, batch_size=batch_size,
        process_X_data_func_args=process_X_data_func_args)

    sums_D, sums_D2, sums_contribs, pairs = diff_sums_from_generator(dlc_gen, 
        nb_features=nb_features, nb_classes=nb_classes)

    unadjusted_t_values, p_values = paired_ttest_with_diff_sums(sums_D, 
        sums_D2, pairs=pairs, nb_cases=nb_cases)

    assert unadjusted_t_values.shape == (nb_pairs, nb_features)
    assert p_values.shape == (nb_pairs, nb_features)

    # force only 1 batch with abnormally high batch_size parameter
    alt_dlc_gen = deeplift_contribs_generator(model, X_test, 
        process_X_data_func=process_X_data, nb_features=nb_features, 
        nb_classes=nb_classes, batch_size=109971161161043253 % 8085,
        process_X_data_func_args=process_X_data_func_args)

    # non-streaming paired t-test implementation... fails with larger 
    # datasets due to large matrix sizes (e.g., memory overflow), but
    # works as an alternative implementation for a tiny unit testing dataset
    alt_t_values, alt_p_values = [], []
    for idx, contribs in enumerate(alt_dlc_gen):
        assert not idx # check only 1 batch (idx == 0)
        for i, j in pairs:
            curr_t_values = np.zeros((nb_features, ))
            curr_p_values = np.zeros((nb_features, ))

            for f in range(nb_features):
                t, p = ttest_rel(contribs[i][:, f], contribs[j][:, f])
                curr_t_values[f] = t
                curr_p_values[f] = p

            alt_t_values.append(curr_t_values)
            alt_p_values.append(curr_p_values)

    for r in range(len(pairs)):
        t = unadjusted_t_values[r]
        alt_t = alt_t_values[r]
        p = p_values[r] # already bonferroni adjusted
        alt_p = bonferroni(alt_p_values[r], nb_pairs * nb_features)

        assert t.shape == alt_t.shape
        assert p.shape == alt_p.shape

        assert np.all(del_nans(np.abs(alt_t - t)) < epsilon)
        assert np.all(del_nans(np.abs(alt_p - p)) < epsilon)
Example #29
def PlotXference_IND():
    global EyeData

    global Events
    plt.style.use("ggplot")

    fig = plt.figure()
    fig.suptitle("Individual trials X coordinates")
    inf = fig.add_subplot(121)
    nof = fig.add_subplot(122)

    inference = []
    noference = []
    inf_cross = []
    nof_cross = []

    for idx in range(0, len(Events)):
        inf_slices, _inf_cross = FilterSlices(EyeData[idx], Events[idx], "Inference", "typeB", 0)
        inference.append(inf_slices)
        inf_cross.append(_inf_cross)
        nof_slices, _nof_cross = FilterSlices(EyeData[idx], Events[idx], "Noference", "typeA", 0)
        noference.append(nof_slices)
        nof_cross.append(_nof_cross)

        for trial in range(0, len(inference[idx])):
            inf.plot(inference[idx][trial, :])

        for trial in range(0, len(noference[idx])):
            nof.plot(noference[idx][trial, :])

    nof.set_xlabel("No inference trials")
    inf.set_xlabel("Inference trials")
    inf.set_ylabel("Changes in gaze X position")
    inf.axhline(y=1200, color="black", linestyle="--")
    nof.axhline(y=1200, color="black", linestyle="--")

    ticks = inf.get_xticks() * 16
    inf.set_xticklabels(ticks.astype(int))
    nof.set_xticklabels(ticks.astype(int))

    t, p = stats.ttest_rel(inf_cross, nof_cross)
    print(t)
    print(p)

    fig2 = plt.figure()
    box = plt.subplot(111)
    fig2.suptitle("Average number of center crossing per subject", fontweight="bold")

    bp1 = box.boxplot([inf_cross, nof_cross], patch_artist=True)

    bp1["boxes"][0].set(color="b", linewidth=0, alpha=0.5)
    bp1["boxes"][1].set(color="m", linewidth=0, alpha=0.5)

    box.set_xticklabels(["Inference trials", "no inference trials"])

    box.set_ylabel("Average amount of crossing per subject", fontweight="bold")

    box.set_ylim(-0.01, 2.01)
    return (inf_cross, nof_cross)
Example #30
def ttest():
    SVM = array(
        [
            [61.3636, 60.6, 62.23, 64.9, 57.8],
            [62.1818, 61.96, 62.4, 63.1, 61.3],
            [60.3636, 59.6, 61.3, 64.4, 56.4],
            [58.6364, 58.6, 58.6, 58.7, 58.5],
            [64, 63.8, 64.2, 64.5, 63.5],
            [61.2727, 60.2, 62.6, 66.5, 56],
            [62.1818, 60.8, 64, 68.5, 55.8],
            [62, 62, 62, 62.2, 61.8],
            [62.5455, 65.9, 63.2, 65.1, 60],
            [62, 61.1, 63.1, 65.6, 58.4],
        ]
    )

    Bayes = array(
        [
            [58.3636, 60.5, 57, 48.4, 68.4],
            [57.9, 60.9, 56.2, 42.5, 71.6],
            [57, 59.1, 55.7, 45.5, 68.5],
            [55.5, 57.4, 54.4, 42.9, 68.2],
            [58.1, 59.7, 56.9, 49.8, 66.4],
            [57.72, 59.4, 56.5, 48.7, 66.7],
            [60.4, 63.3, 58.5, 49.3, 71.5],
            [58.9, 61.1, 57.4, 49.1, 68.7],
            [59.2, 61.8, 57.5, 48, 70.4],
            [57.5, 57.5, 56.1, 44.7, 70.4],
        ]
    )

    DeTree = array(
        [
            [60.3, 61.5, 59.3, 54.9, 65.6],
            [59.4, 59.5, 59.2, 58.5, 60.2],
            [57.9, 58.2, 57.7, 56.4, 59.5],
            [55.8, 55.9, 55.7, 54.7, 56.9],
            [60.2, 60.9, 59.5, 58.4, 63.6],
            [58.2, 58.1, 58.3, 58.9, 57.5],
            [58.5, 58.8, 58.1, 56.4, 60.5],
            [58.7, 59.1, 58.4, 56.7, 60.7],
            [58.2, 58.2, 57.9, 56.4, 60],
            [59.2, 59.3, 59, 57.8, 60],
        ]
    )

    for i in range(0, 10):
        # print("======SVM and Bayes=======")
        # r = stats.ttest_rel(SVM[i], Bayes[i])
        # print(r[1])

        # print("======SVM and Decision Tree=======")
        # r = stats.ttest_rel(SVM[i], DeTree[i])
        # print(r[1])

        # print("======Bayes and Decision Tree=======")
        r = stats.ttest_rel(Bayes[i], DeTree[i])
        print(r[1])
Example #31
def calculatePairedTTest(jsonCollect, model, embed_type):
    random.seed(116)
    initialRand = random.getstate()
    rPattern = r'https:\/\/stackoverflow\.com\/questions\/\d+'
    rPattern = r'https:\/\/stackoverflow\.com\/questions\/\d+'
    if sample:
        jsonCollect = jsonCollect[:100]

    urlMapping = {}
    urlList = []

    linkedDists = []
    foreignDists = []

    differences = []

    for jsonObject in jsonCollect:
        qUrl = jsonObject['q_url']
        urlMapping[qUrl] = jsonObject
        urlList.append(qUrl)
    number_posts_with_stackOverflow_links = 0
    num_stackOverflow_links = []
    for idx, jsonObject in enumerate(jsonCollect):
        if idx % 1000 == 0:
            print(
                f'calculatePairedTTest: finished {idx} out of {len(jsonCollect)}'
            )
        qUrl = jsonObject['q_url']
        all_content = jsonObject['q_text']
        answerCollection = jsonObject['answers']
        for answer in answerCollection:
            answerText = answer['a_text']
            all_content += '  ' + answerText
        urls = re.findall(
            '(https://)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
            all_content)
        filtered_urls = []
        for url_parts in urls:
            url = ''.join(url_parts)
            if 'stackoverflow.com/questions' in url:
                filtered_urls.append(url)
        # q_urls = [url for url in urls if 'https://stackoverflow.com/questions/' in url]
        # urlContent = jsonObject['stackoverflow_urls']
        urlContent = list(filtered_urls)
        if len(filtered_urls) > 0:
            number_posts_with_stackOverflow_links += 1
            num_stackOverflow_links.append(len(filtered_urls))
        for potentialUrl in urlContent:
            urlMatch = re.search(rPattern, potentialUrl)
            if urlMatch is None:
                continue
            actualUrl = urlMatch.group(0)
            if actualUrl not in urlMapping or qUrl == actualUrl:
                continue
            post2Object = urlMapping[actualUrl]

            post1Url = qUrl
            post2Url = actualUrl

            post1EmbeddingArray = embed_sentences(jsonObject['q_text'], model,
                                                  embed_type)
            post2EmbeddingArray = embed_sentences(post2Object["q_text"], model,
                                                  embed_type)

            linkedDist = np.linalg.norm(post1EmbeddingArray -
                                        post2EmbeddingArray)**2
            if linkedDist <= .001:
                continue

            post3Url = random.choice(urlList)
            post4Url = random.choice(urlList)

            while post3Url == post1Url or post3Url == post2Url:
                post3Url = random.choice(urlList)

            while post4Url == post2Url or post4Url == post1Url:
                post4Url = random.choice(urlList)

            post3EmbeddingArray = embed_sentences(
                urlMapping[post3Url]["q_text"], model, embed_type)
            post4EmbeddingArray = embed_sentences(
                urlMapping[post4Url]["q_text"], model, embed_type)

            post1And3Dist = np.linalg.norm(post1EmbeddingArray -
                                           post3EmbeddingArray)**2
            post2And4Dist = np.linalg.norm(post2EmbeddingArray -
                                           post4EmbeddingArray)**2

            foreignDistAverage = (post1And3Dist + post2And4Dist) / 2

            linkedDists.append(linkedDist)
            foreignDists.append(foreignDistAverage)

            difference = foreignDistAverage - linkedDist

            differences.append(difference)

    results = stat.ttest_rel(foreignDists, linkedDists)
    random.setstate(initialRand)
    print('Result of T statistic calculation is:', results)
    print('Number of forum posts with stackoverflow links = ',
          number_posts_with_stackOverflow_links)
    print('Average number of links per post: ',
          statistics.mean(num_stackOverflow_links))
Example #32
def stat_test(box_data1, box_data2, test, **stats_params):
    test_short_name = ''
    pval = None
    formatted_output = None
    if test == 'Levene':
        stat, pval = stats.levene(box_data1, box_data2, **stats_params)
        test_short_name = 'levene'
        formatted_output = ("Levene test of variance, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 'Mann-Whitney':
        u_stat, pval = stats.mannwhitneyu(box_data1,
                                          box_data2,
                                          alternative='two-sided',
                                          **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test two-sided "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 'Mann-Whitney-gt':
        u_stat, pval = stats.mannwhitneyu(box_data1,
                                          box_data2,
                                          alternative='greater',
                                          **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test greater "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 'Mann-Whitney-ls':
        u_stat, pval = stats.mannwhitneyu(box_data1,
                                          box_data2,
                                          alternative='less',
                                          **stats_params)
        test_short_name = 'M.W.W.'
        formatted_output = ("Mann-Whitney-Wilcoxon test smaller "
                            "P_val={:.3e} U_stat={:.3e}").format(pval, u_stat)
    elif test == 't-test_ind':
        stat, pval = stats.ttest_ind(a=box_data1, b=box_data2, **stats_params)
        test_short_name = 't-test_ind'
        formatted_output = ("t-test independent samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 't-test_welch':
        stat, pval = stats.ttest_ind(a=box_data1,
                                     b=box_data2,
                                     equal_var=False,
                                     **stats_params)
        test_short_name = 't-test_welch'
        formatted_output = ("Welch's t-test independent samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 't-test_paired':
        stat, pval = stats.ttest_rel(a=box_data1, b=box_data2, **stats_params)
        test_short_name = 't-test_rel'
        formatted_output = ("t-test paired samples, "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == 'Wilcoxon':
        if "zero_method" in stats_params.keys():
            zero_method = stats_params["zero_method"]
            del stats_params["zero_method"]
        else:
            zero_method = len(box_data1) <= 20 and "pratt" or "wilcox"
        print("Using zero_method ", zero_method)
        stat, pval = stats.wilcoxon(box_data1,
                                    box_data2,
                                    zero_method=zero_method,
                                    **stats_params)
        # stat, pval = wilcoxon_exact.wilcoxon_exact(box_data1, box_data2)
        test_short_name = 'Wilcoxon'
        formatted_output = ("Wilcoxon test (paired samples), "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    elif test == "wilcoxon-exact":
        stat, pval = wilcoxon_exact(box_data1, box_data2, alternative="less")
        test_short_name = 'Wilcoxon'
        formatted_output = ("Wilcoxon test (paired samples), "
                            "P_val={:.3e} stat={:.3e}").format(pval, stat)
    return pval, formatted_output, test_short_name
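# Usage sketch (added; made-up data): request the paired branch explicitly.
#
#     pval, msg, name = stat_test([1.0, 1.2, 0.9, 1.1],
#                                 [1.3, 1.4, 1.0, 1.2], test='t-test_paired')
#     print(msg)  # "t-test paired samples, P_val=... stat=..."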
Example #33
# plot the data
setFonts(20)
plt.plot(x, 'o', ms=10, label='pre')
plt.plot(xs, 'r*', ms=12, label='post')
plt.bar(index,
        dx,
        width=0.5,
        align='center',
        color=0.75 * np.ones(3),
        label='pre-post')

# Format the plot
plt.legend(loc='upper left')
plt.axhline(0, ls='--')
plt.xlim(-0.3, 5.3)
plt.ylim(-0.2, 6.2)
plt.xlabel('Subject Nr')
plt.ylabel('Value')
plt.tight_layout()

# P-values for paired and unpaired T-tests
_, p_paired = stats.ttest_rel(x, xs)
_, p_ind = stats.ttest_ind(x, xs)
print(f'A paired comparison yields p={p_paired:.4f},' +
      f' while an unpaired T-test gives us p={p_ind:.3f}')

# Show and save figure
outFile = 'pairedTtest.png'
showData(outFile)
Example #34
def class34(filename, i):
    ''' This function performs experiment 3.4
    
    Parameters
       filename : string, the name of the npz file from Task 2
       i: int, the index of the supposed best classifier (from task 3.1)  
    '''
    # Set timer
    start = timeit.default_timer()

    kf = KFold(n_splits=5, random_state=1, shuffle=True)

    feats = np.load(filename)
    feats = feats[feats.files[0]]  # (40000,174)

    X = feats[..., :-1]  # first 173 element for all 40,000 inputs -> input
    y = feats[..., -1]  # last column of feats -> label

    output = np.zeros((5, 5))

    # Count time
    stop = timeit.default_timer()
    print('Starting the folding')
    print(stop - start)

    f = 0  # counter for fold
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for classifier in range(5):
            print('Now working on classifier ' + str(classifier))
            if classifier == 0:
                clf = SVC(kernel='linear', max_iter=10000)
            if classifier == 1:
                clf = SVC(kernel='rbf', max_iter=10000,
                          gamma=2)  # default kernel is rbf
            if classifier == 2:
                clf = RandomForestClassifier(max_depth=5, n_estimators=10)
            if classifier == 3:
                clf = MLPClassifier(alpha=0.05)
            if classifier == 4:
                clf = AdaBoostClassifier()

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            c = confusion_matrix(y_test, y_pred)
            output[f][classifier] = accuracy(c)  #adding to the output array

        stop = timeit.default_timer()
        print('Done with ' + str(f + 1) + ' fold')
        print(stop - start)
        f += 1

    iBest = i - 1
    # h[:,1]    the 2nd column only of np array h

    p_values = []
    for column in range(output.shape[1]):
        if column != iBest:
            S = stats.ttest_rel(output[:, column], output[:, iBest])
            #print(output[:,column])
            #print(output[:, iBest])
            #print(S)
            p_values.append(S[1])

    with open('./a1_3.4.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        for line in output:  # Write the results for 32K data into
            writer.writerow(line)
        writer.writerow(p_values)
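# Hypothetical invocation of class34 (the filename and index below are
# placeholders, not taken from the example above):
# class34('a1_feats.npz', 2)  # i is 1-based, so 2 selects the second classifier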
Example #35
s2 = df2['JobSatisfaction'].sample(n=50, random_state=1)

# a qq plot of sample 1
fig = sm.qqplot(df1['JobSatisfaction'], fit=True, line='45')
plt.show()

# a qq plot of sample 2
fig = sm.qqplot(df2['JobSatisfaction'], fit=True, line='45')
plt.show()

print(s1.mean())

print(s2.mean())

# let us perform our t-test
tstat, pval = stats.ttest_rel(s1, s2)
print('t statistic is: ', tstat)
print('p value is: ', pval)

# let us use a conditional to analyse our results
if pval < 0.05:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")

"""Rejecting the null hypothesis means that we have enough statistical evidence to state that, There is a statistically significant difference in satisfaction level between employees who retained and those who left.

# Conclusion

We set out to identify determinant factors that lead to staff attrition in a company. 
	#print(stc_d.data[1,:])
	#print('\n')
	dd_per_sub[subject_idx,:] = numpy.mean(stc_d.data, axis=1)
		

	stc_w = mne.read_source_estimate(TFCE_data_path.format(subject, 'dW', time_lbl, lbl_name))
	#print('dW')	
	#print(stc_w.data[26,:])
	#print(stc_w.data[1,:])
	#print('\n')
	dw_per_sub[subject_idx,:] = numpy.mean(stc_w.data, axis=1)	

	stc_d = []
	stc_w = []

t_stat, p_val = stats.ttest_rel(dw_per_sub, dd_per_sub, axis=0)
print('p_val')
print(p_val[26])
print(p_val[1])
print('\n')

#p_val inversion for STC Viewer
p_val = 1-p_val
print('1-p_val')
print(p_val[26])
print(p_val[1])
print('\n')

#binarize p_val
#p_val[p_val < 0.95] = 0 # significance threshold
#p_val[p_val > 0.95] = 1
    sns.set(style="dark", font_scale=2)
    sns.despine()
    ax.set_ylim([55, 95])

    for cycle in range(4):
        stat, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle])
        print(p)
        if p < 0.05:
            p_rounded = np.round(p, decimals=5)
            if p_rounded > 0:
                label_diff(current_cycle=cycle, p_value=p_rounded, sign_to_use="=")
            else:
                label_diff(current_cycle=cycle, p_value=0.0001, sign_to_use="<")
        print("Normality : ", shapiro(accuracies_3DC[cycle]-accuracies_myo[cycle]))
    plt.show()

    for cycle in range(4):
        print("Cycle: ", cycle+1)
        _, normality_p_value = shapiro(accuracies_myo[cycle] - accuracies_3DC[cycle])
        print("Normality p-value: ", normality_p_value)
        if normality_p_value < 0.1:
            print("p-value t-test: N.A.")
            _, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle])
            print("p-value Wilcoxon : ", p)
        else:
            _, p = ttest_rel(accuracies_3DC[cycle], accuracies_myo[cycle])
            print("p-value t-test: ", p)
            _, p = wilcoxon(accuracies_3DC[cycle], accuracies_myo[cycle])
            print("p-value Wilcoxon : ", p)

Example #38
            'Fog': lr_recall[6],
            'Lix': lr_recall[7],
            'WSF1': lr_recall[8],
            'WSF2': lr_recall[9],
            'WSF3': lr_recall[10],
            'WSF4': lr_recall[11],
            'FEAT': feat_recall,
            'BASE': random_recall
        },
        index=[n for n in range(0, 10)])
    recall_performance.mean()

    # ================================================================================================================

    # compare f1 to random baseline
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.FEAT)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.CLI)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.Fog)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.WSF3)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.WSF4)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.Lix)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.WSF2)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
                    f1_performance.WSF1)  # significant for a = .01
    stats.ttest_rel(f1_performance.BASE,
Example #39
def stats_stps(corrs1, corrs2, fisherz=True, permutation=False, iter=5000):

    """
    Conduct the statistical analysis for results of EEG-like data(for STPS)

    Parameters
    ----------
    corrs1 : array
        The correlation coefficients under condition1.
        The shape of corrs1 must be [n_subs, n_chls, n_ts]. n_subs, n_chls, n_ts represent the number of subjects, the
        number of channels and the number of time-points.
    corrs2 : array
        The correlation coefficients under condition2.
        The shape of corrs2 must be [n_subs, n_chls, n_ts]. n_subs, n_chls, n_ts represent the number of subjects, the
        number of channels and the number of time-points.
    fisherz : bool True or False. Default is True.
        Conduct Fisher-Z transform.
    permutation : bool True or False. Default is False.
        Use permutation test or not.
    iter : int. Default is 5000.
        The times for iteration.

    Returns
    -------
    stats : array
        The statistical results.
        The shape of stats is [n_chls, n_ts, 2]. n_chls, n_ts represent the number of channels and the number of
        time-points. 2 represents a t-value and a p-value.

    Notes
    -----
    n_subs must be >= 6.
    """

    if len(np.shape(corrs1)) != 3 or len(np.shape(corrs2)) != 3 or np.shape(corrs1)[1] != np.shape(corrs2)[1] or \
            np.shape(corrs1)[2] != np.shape(corrs2)[2]:

        return "Invalid input!"

    # get the number of subjects, channels & time-points
    subs, chls, ts = np.shape(corrs1)

    # subs>=6
    if subs < 6:
        return print("the number of subjects is too small!")

    # initialize the corrs
    stats = np.zeros([chls, ts, 2], dtype=float)

    # get r-map
    rs1 = corrs1
    rs2 = corrs2

    # calculate the statistical results
    for i in range(chls):
        for j in range(ts):

            if fisherz is True:

                # Fisher r to z
                zs1 = 0.5*np.log((1+rs1)/(1-rs1))
                zs2 = 0.5*np.log((1+rs2)/(1-rs2))

            # t test
            stats[i, j] = ttest_rel(rs1[:, i, j], rs2[:, i, j])

            if permutation == True:

                stats[i, j, 1] = permutation_test(zs1[:, i, j], zs2[:, i, j], iter=iter)

    return stats
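# A minimal sketch of calling stats_stps on random data with the documented
# shape [n_subs, n_chls, n_ts]; the values below are purely illustrative:
# corrs1 = np.random.uniform(-0.8, 0.8, (8, 4, 10))  # 8 subjects, 4 channels, 10 time-points
# corrs2 = np.random.uniform(-0.8, 0.8, (8, 4, 10))
# out = stats_stps(corrs1, corrs2, permutation=False)  # out.shape == (4, 10, 2)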
def find_pvalue_2smaple_paired(a, b):
    """a and b are arrays"""
    _, pval_2sided = stats.ttest_rel(a, b)
    return pval_2sided
Example #41
from scipy.stats import ttest_rel, ttest_ind

ddnn_raw = np.loadtxt("./output_ddnn_old.txt",delimiter=',')[:,:10]
sing_raw = np.loadtxt("./output_sing_old.txt",delimiter=',')
gmm_raw = np.loadtxt("./output_sing_gmm.txt",delimiter=',')

ddnn = np.mean(ddnn_raw,axis=0)
sing = np.mean(sing_raw,axis=0)
gmm = np.mean(gmm_raw,axis=0)
ys = range(0,1000,100)
print(ddnn.shape)
print(len(ys))
plot_ddnn, = plt.plot(ys,ddnn,label='Decentralized')
plot_sing, = plt.plot(ys,sing,label='Centralized')
plot_gmm, = plt.plot(ys,gmm,label='GMM')

last_DNN = ddnn_raw[:,9]
last_GMM = gmm_raw[:,9]
last_SING = sing_raw[:,9]
print(gmm.shape)
print(ddnn.shape)
print(sing.shape)
t, p = ttest_rel(last_DNN, last_GMM)  # p is the two-sided p-value
print sing[9]
print gmm[9]
plt.legend([plot_ddnn, plot_sing, plot_gmm], loc=4)
plt.xlabel("Number of epochs")
plt.ylabel("Accuracy score")
print(p)

#plt.show()
                                          data = d, 
                                          x_estimator = np.mean, 
                                          x_ci = "ci", 
                                          ci = 95,
                                          n_boot = 5000,
                                          line_kws = {'lw': 5},
                                          color = "darkgrey")

        # save only the slope
        arr[i,j] = slope

# close all figures
plt.close()

#%%

"""
Compute the t-test
"""

import scipy.stats as stats

m1, sd1 = np.round(np.mean(arr[:,0]), 1), np.round(np.std(arr[:,0]), 1)
m2, sd2 = np.round(np.mean(arr[:,1]), 1), np.round(np.std(arr[:,1]), 1)

print(m1, sd1)
print(m2, sd2)

t_stat, p_val = stats.ttest_rel(arr[:,0], arr[:,1])
print("T-statistic: {}\np-value: {}".format(np.round(t_stat, 3),
                                            np.round(p_val, 3)))
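# An illustrative follow-up (not in the original): for paired data, a common
# standardized effect size to report alongside the t-test is Cohen's d_z,
# the mean of the pairwise differences over their standard deviation.
diff = arr[:, 0] - arr[:, 1]
d_z = np.mean(diff) / np.std(diff, ddof=1)
print("Cohen's d_z: {}".format(np.round(d_z, 3)))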
Example #43
sentiment_singularity_csv = '~/git/afit_mlperf_training/sentiment_analysis/results/1xP100_sentiment_analysis_singularity.csv'
sentiment_native_csv = '~/git/afit_mlperf_training/sentiment_analysis/results/1xP100_sentiment_analysis_native.csv'

native_df = pandas.read_csv(sentiment_native_csv,
                            sep='\n',
                            names=['Native Runtime (Seconds)'])
singularity_df = pandas.read_csv(sentiment_singularity_csv,
                                 sep='\n',
                                 names=['Singularity Runtime (Seconds)'])
native_df = native_df.drop([35])
singularity_df = singularity_df.drop([18])

df = native_df.join(singularity_df).dropna()
print(df.describe())

t_val_rel = stats.ttest_rel(df.loc[:, 'Native Runtime (Seconds)'],
                            df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(df.loc[:, 'Native Runtime (Seconds)'],
                            df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()

df.plot(kind='hist', y='Native Runtime (Seconds)', color='red', ax=ax)
df.plot(kind='hist', y='Singularity Runtime (Seconds)', color='blue', ax=ax)

plt.savefig('P100_Histogram.png')
plt.show()
Example #44
def plot_mean_podf(po, sz=200, typ='cp', typek='', pos=(0, 4, 4, 8)):
    global freq, df
    ax = py.subplot(gs[pos[2]:pos[3], pos[0]:pos[1]])
    set_axis(ax, -0.05, 1.1, letter=letters[po])
    # df = pd.read_excel(saveDir+'g_and_h_mjh.xlsx')
    results = pd.read_excel(saveDir + 'results.xlsx')
    df = pd.read_excel(saveDir + 'g_and_h.xlsx')
    df2 = pd.read_excel(saveDir + 'Injection times.xlsx')
    if 'xyl' in typek: color = 'purple'
    else: color = 'red'
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plot_len = 55
    min_minus = 20
    min_plus = 35
    okno = 60
    time_gh = np.linspace(-20, min_plus, plot_len)
    hfo = np.zeros((len(lista_rats), plot_len))
    gamma = np.zeros((len(lista_rats), plot_len))
    for i in range(len(lista_rats)):
        row = df2.loc[df2['RAT'] == int(lista_rats[i])]
        start = int(row[typek].values[0] / (okno))
        # print(start)
        hfo[i] = df[lista_rats[i] + 'HFO_' + typ +
                    typek].values[start - min_minus:start + min_plus]
        gamma[i] = df[lista_rats[i] + 'gamma_' + typ +
                      typek].values[start - min_minus:start + min_plus]
        # py.plot(hfo[i], color='indianred')
        # py.plot(gamma[i], color = 'blue')
    sem = len(lista_rats)**(1 / 2)
    m_hfo = hfo.mean(axis=0)
    s_hfo = hfo.std(axis=0) / sem
    m_gamma = gamma.mean(axis=0)
    s_gamma = gamma.std(axis=0) / sem
    py.plot(time_gh, m_gamma, color='blue')
    py.fill_between(time_gh,
                    m_gamma - s_gamma,
                    m_gamma + s_gamma,
                    alpha=0.3,
                    color='blue')
    py.plot(time_gh, m_hfo, color=color)
    py.fill_between(time_gh,
                    m_hfo - s_hfo,
                    m_hfo + s_hfo,
                    alpha=0.3,
                    color=color)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    py.ylabel('Power of dom. freq.($mV^2$)', fontsize=fsize)
    py.yscale('log')
    py.xlabel('Time (min)', fontsize=fsize)
    ax = py.subplot(gs[pos[2]:pos[3], pos[1] + 1:pos[1] + 5])
    set_axis(ax, -0.05, 1.1, letter=letters[po + 1])
    bef_gamma, rly_gamma, lat_gamma = [], [], []
    bef_hfo, rly_hfo, lat_hfo = [], [], []
    for i in range(len(lista_rats)):
        bef_gamma.append(gamma[i, bs - 5:bs].mean())
        rly_gamma.append(gamma[i, early_s:early_f].mean())
        lat_gamma.append(gamma[i, late_s:late_f].mean())
        bef_hfo.append(hfo[i, bs - 5:bs].mean())
        rly_hfo.append(hfo[i, early_s:early_f].mean())
        lat_hfo.append(hfo[i, late_s:late_f].mean())
        # if typek=='xyl':
        #     py.plot([hfo[i, bs-3:bs].mean(), hfo[i,early_s:early_f].mean(), hfo[i, late_s:late_f].mean()], marker = 'o', color = 'indianred')
        # else:
        py.plot([
            hfo[i, bs - 5:bs].mean(), hfo[i, early_s:early_f].mean(),
            hfo[i, late_s:late_f].mean()
        ],
                marker='o',
                color=color)
        py.plot([
            gamma[i, bs - 5:bs].mean(), gamma[i, early_s:early_f].mean(),
            gamma[i, late_s:late_f].mean()
        ],
                marker='o',
                color='blue')
        # py.text(-.1, hfo[i, bs-3:bs].mean(), lista_rats[i])

    results[typek + 'bef_gamma'] = bef_gamma
    results[typek + 'rly_gamma'] = rly_gamma
    results[typek + 'lat_gamma'] = lat_gamma

    results[typek + 'bef_hfo'] = bef_hfo
    results[typek + 'rly_hfo'] = rly_hfo
    results[typek + 'lat_hfo'] = lat_hfo

    results['rats'] = lista_rats
    results.to_excel(saveDir + 'results.xlsx',
                     sheet_name='sheet1',
                     index=False)

    shift = np.asarray(rly_hfo).mean() / 10
    max_ind = np.max(np.array([rly_hfo, lat_hfo]))
    print('shap', st.shapiro(bef_gamma)[1])
    print('shap', st.shapiro(rly_gamma)[1])
    print('shap', st.shapiro(lat_gamma)[1])
    pvalue = st.ttest_rel(bef_gamma, rly_gamma)[1]
    print('gamma pval', pvalue)
    py.text(.9, max_ind + shift, pval(pvalue), color='blue')
    pvalue = st.ttest_rel(bef_gamma, lat_gamma)[1]
    py.text(1.9, max_ind + shift, pval(pvalue), color='blue')

    shift = np.asarray(rly_hfo).mean() * 2
    print('shap', st.shapiro(bef_hfo)[1])
    print('shap', st.shapiro(rly_hfo)[1])
    print('shap', st.shapiro(lat_hfo)[1])
    pvalue = st.ttest_rel(bef_hfo, rly_hfo)[1]
    print('hfo pval', pvalue)
    py.text(.9, max_ind + shift, pval(pvalue), color=color)
    pvalue = st.ttest_rel(bef_hfo, lat_hfo)[1]
    py.text(1.9, max_ind + shift, pval(pvalue), color=color)

    py.ylabel('Power of dom. freq.($mV^2$)', fontsize=fsize)
    py.yscale('log')
    py.xticks([0, 1, 2], ['base', 'early Ket', 'late Ket'], fontsize=fsize)
    if typek == 'xyl':
        py.xticks([0, 1, 2], ['base', 'early KX', 'late KX'], fontsize=fsize)
    else:
        ket = mpatches.Patch(color='red', label='HFO after Ket.')
        kx = mpatches.Patch(color='purple', label='HFO after KX')
        gam = mpatches.Patch(color='blue', label='Gamma 30-65 Hz')
        ax.legend(handles=[ket, kx, gam],
                  loc='center',
                  bbox_to_anchor=(1.7, 0.5),
                  ncol=1,
                  frameon=True,
                  fontsize=20)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    py.xlim(-.2, 2.2)
Example #45
def main():
    if len(sys.argv) < 4:
        print("You did not give enough arguments\n ")
        sys.exit(1)
    filename_A = sys.argv[1]
    filename_B = sys.argv[2]
    alpha = sys.argv[3]

    with open(filename_A) as f:
        data_A = f.read().splitlines()

    with open(filename_B) as f:
        data_B = f.read().splitlines()

    data_A = list(map(float, data_A))
    data_B = list(map(float, data_B))

    print(
        "\nPossible statistical tests: Shapiro-Wilk, Anderson-Darling, Kolmogorov-Smirnov, t-test, Wilcoxon, McNemar, Permutation, Bootstrap"
    )
    name = input("\nEnter name of statistical test: ")

    ### Normality Check
    if (name == "Shapiro-Wilk" or name == "Anderson-Darling"
            or name == "Kolmogorov-Smirnov"):
        output = normality_check(data_A, data_B, name, alpha)

        if (float(output) > float(alpha)):
            answer = input(
                "\nThe normal test is significant, would you like to perform a t-test for checking significance of difference between results? (Y/N) "
            )
            if (answer == 'Y'):
                # two sided t-test
                t_results = stats.ttest_rel(data_A, data_B)
                # correct for one sided test
                pval = t_results[1] / 2
                if (float(pval) <= float(alpha)):
                    print(
                        "\nTest result is significant with p-value: {}".format(
                            pval))
                    return
                else:
                    print("\nTest result is not significant with p-value: {}".
                          format(pval))
                    return
            else:
                answer2 = input(
                    "\nWould you like to perform a different test (permutation or bootstrap)? If so enter name of test, otherwise type 'N' "
                )
                if (answer2 == 'N'):
                    print("\nbye-bye")
                    return
                else:
                    name = answer2
        else:
            answer = input(
                "\nThe normal test is not significant, would you like to perform a non-parametric test for checking significance of difference between results? (Y/N) "
            )
            if (answer == 'Y'):
                answer2 = input(
                    "\nWhich test (Permutation or Bootstrap)? ")
                name = answer2
            else:
                print("\nbye-bye")
                return

    ### Statistical tests

    # Paired Student's t-test: calculate the t-test on TWO RELATED samples of scores, a and b. For a one-sided test we halve the p-value.
    if (name == "t-test"):
        t_results = stats.ttest_rel(data_A, data_B)
        # correct for one sided test
        pval = float(t_results[1]) / 2
        if (float(pval) <= float(alpha)):
            print("\nTest result is significant with p-value: {}".format(pval))
            return
        else:
            print("\nTest result is not significant with p-value: {}".format(
                pval))
            return

    # Wilcoxon: Calculate the Wilcoxon signed-rank test.
    if (name == "Wilcoxon"):
        wilcoxon_results = stats.wilcoxon(data_A, data_B)
        if (float(wilcoxon_results[1]) <= float(alpha)):
            print("\nTest result is significant with p-value: {}".format(
                wilcoxon_results[1]))
            return
        else:
            print("\nTest result is not significant with p-value: {}".format(
                wilcoxon_results[1]))
            return

    if (name == "McNemar"):
        print(
            "\nThis test requires the results to be binary : A[1, 0, 0, 1, ...], B[1, 0, 1, 1, ...] for success or failure on the i-th example."
        )
        f_obs = calculateContingency(data_A, data_B, len(data_A))
        mcnemar_results = mcNemar(f_obs)
        if (float(mcnemar_results) <= float(alpha)):
            print("\nTest result is significant with p-value: {}".format(
                mcnemar_results))
            return
        else:
            print("\nTest result is not significant with p-value: {}".format(
                mcnemar_results))
            return

    if (name == "Permutation"):
        R = max(10000, int(len(data_A) * (1 / float(alpha))))
        pval = rand_permutation(data_A, data_B, len(data_A), R)
        if (float(pval) <= float(alpha)):
            print("\nTest result is significant with p-value: {}".format(pval))
            return
        else:
            print("\nTest result is not significant with p-value: {}".format(
                pval))
            return

    if (name == "Bootstrap"):
        R = max(10000, int(len(data_A) * (1 / float(alpha))))
        pval = Bootstrap(data_A, data_B, len(data_A), R)
        if (float(pval) <= float(alpha)):
            print("\nTest result is significant with p-value: {}".format(pval))
            return
        else:
            print("\nTest result is not significant with p-value: {}".format(
                pval))
            return

    else:
        print("\nInvalid name of statistical test")
        sys.exit(1)
    for j in range(1, len(algo1[0])):
        algo1_temp.append(float(algo1[i][j]))
        algo2_temp.append(float(algo2[i][j]))
    #print(algo1_temp)
    #print(algo2_temp)
    temp1 = [0 for i in range(10)]
    temp2 = [0 for i in range(10)]
    for j in range(10):
        for k in range(10):
            temp1[j] += algo1_temp[10 * j + k]
            temp2[j] += algo2_temp[10 * j + k]
        temp1[j] /= 10
        temp2[j] /= 10
    #print(temp1)
    #print(temp2)
    t_test = stats.ttest_rel(temp1, temp2)
    #print(t_test)

    t_test_value = t_test[0]

    if t_test[1] > 0.05:
        t_test_value = 0

    index = int(algo1[i][0])

    # If the t-test statistic is positive, algo1 is better; if negative, algo2 is better; if 0, it is a draw.
    if t_test_value == 0:
        label[index] = 0
    elif t_test_value > 0:
        label[index] = 1
    elif t_test_value < 0:
Example #47
def get_different_from_best(results_df,
                            raw_results_df,
                            metric='aupr',
                            id_name='gene'):
    """Identify best-performing data types for each gene.

    As an alternative to just identifying the data type with the best average
    performance, we want to also identify data types that are "statistically
    equivalent" to the best performer. For each gene, we do the following:

    1) get all data types that significantly outperform the permuted baseline
       ("well-performing" data types)
    2) do pairwise t-tests comparing the best performing data types with
       other well-performing data types
    3) apply an FDR correction for the total number of t-tests

    In each case where the null hypothesis is accepted, we say both data types
    are statistically equivalent. If the null is rejected, the relevant data
    type does not provide statistically equivalent performance to the best
    performing data type.
    """
    from scipy.stats import ttest_rel

    comparison_pvals = []
    for identifier in results_df[id_name].unique():
        # compare best with other data types that are significant from
        # baseline, using pairwise t-tests
        # null hypothesis = each pair of results distributions is the same

        # get best data type
        best_data_ix = (results_df[results_df[id_name] ==
                                   identifier].loc[:, 'delta_mean'].idxmax())
        best_data_type = results_df.iloc[best_data_ix, :].training_data

        # get other significant data types
        other_data_types = (
            results_df[(results_df[id_name] == identifier)
                       & (results_df.training_data != best_data_type) &
                       (results_df.reject_null)])['training_data'].values

        best_data_dist = (
            raw_results_df[(raw_results_df.identifier == identifier)
                           & (raw_results_df.training_data == best_data_type) &
                           (raw_results_df.signal == 'signal') &
                           (raw_results_df.data_type == 'test')]).sort_values(
                               by=['seed', 'fold'])[metric].values

        if len(other_data_types) == 0:
            continue

        for other_data_type in other_data_types:
            # do pairwise t-tests
            other_data_dist = (raw_results_df[
                (raw_results_df.identifier == identifier)
                & (raw_results_df.training_data == other_data_type) &
                (raw_results_df.signal == 'signal') &
                (raw_results_df.data_type == 'test')]).sort_values(
                    by=['seed', 'fold'])[metric].values

            p_value = ttest_rel(best_data_dist, other_data_dist)[1]

            best_id = '{}, {}'.format(identifier, best_data_type)
            other_id = '{}, {}'.format(identifier, other_data_type)

            comparison_pvals.append(
                [identifier, best_data_type, other_data_type, p_value])

    comparison_df = pd.DataFrame(
        comparison_pvals,
        columns=[id_name, 'best_data_type', 'other_data_type', 'p_value'])

    # apply multiple testing correction and identify significant similarities
    from statsmodels.stats.multitest import multipletests
    corr = multipletests(comparison_df['p_value'], alpha=0.05, method='fdr_bh')
    comparison_df = comparison_df.assign(corr_pval=corr[1],
                                         accept_null=~corr[0])

    # add column to results_df for statistically equal to best
    equal_to_best = []
    for _, vals in results_df.iterrows():
        if not vals['reject_null']:
            equal_to_best.append(False)
        else:
            comp_gene_df = comparison_df[comparison_df[id_name] ==
                                         vals[id_name]]
            if vals['training_data'] in comp_gene_df.best_data_type.values:
                equal_to_best.append(True)
            elif vals['training_data'] in comp_gene_df.other_data_type.values:
                # reject null = means are significantly different
                # accept null = means are statistically the same
                # so accept null = alternate data type is statistically the
                # same as the best data type
                equal_to_best.append(
                    comp_gene_df[comp_gene_df.other_data_type ==
                                 vals['training_data']].accept_null.values[0])
            else:
                # this happens when the data type is the only significant one
                equal_to_best.append(True)

    results_df = results_df.assign(equal_to_best=equal_to_best)
    return results_df
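# A small, self-contained illustration (with invented p-values) of the fdr_bh
# correction used above: multipletests returns a reject mask plus the
# corrected p-values, so accept_null is just the negated mask.
from statsmodels.stats.multitest import multipletests
reject, corr_pvals, _, _ = multipletests([0.001, 0.03, 0.2], alpha=0.05, method='fdr_bh')
print(reject)      # [ True  True False]
print(corr_pvals)  # [0.003 0.045 0.2  ]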
Example #48
import pandas as pd
from scipy import stats

measurements = pd.read_csv('data/hawaii_measurements.csv')
measurements['month'] = pd.to_datetime(measurements.date).dt.month
june = measurements[measurements['month'] == 6]
december = measurements[measurements['month'] == 12]
june_grp = june.groupby('station')
december_grp = december.groupby('station')
june_avg = june_grp.tobs.mean()
december_avg = december_grp.tobs.mean()
print(stats.ttest_rel(june_avg, december_avg))
print(
    '''I did a paired t-test because these are temperature observations from the same stations across two different months. The results return a small p-value (p < 0.05) which indicates a statistically significant difference between June and December temperatures across all years.'''
)
def main():
    (current_work_dir_path, asset_dir_path, program_dir_path,
     conda_program_dir_path) = utils.get_dir_paths()
    num_of_threads = multiprocessing.cpu_count()
    gammas = [2.**i for i in range(-7, 11)]
    mafft_xinsi_params = []
    consalifold_params = []
    posterior_consalifold_params = []
    rna_seq_dir_path = asset_dir_path + "/compiled_rna_fams_test"
    mafft_xinsi_dir_path = asset_dir_path + "/mafft_xinsi"
    mafft_xinsi_plus_consalifold_dir_path = asset_dir_path + "/mafft_xinsi_plus_consalifold"
    posterior_mafft_xinsi_plus_consalifold_dir_path = asset_dir_path + "/posterior_mafft_xinsi_plus_consalifold"
    if not os.path.isdir(mafft_xinsi_plus_consalifold_dir_path):
        os.mkdir(mafft_xinsi_plus_consalifold_dir_path)
    if not os.path.isdir(posterior_mafft_xinsi_plus_consalifold_dir_path):
        os.mkdir(posterior_mafft_xinsi_plus_consalifold_dir_path)
    sub_thread_num = 4
    for rna_seq_file in os.listdir(rna_seq_dir_path):
        if not rna_seq_file.endswith(".fa"):
            continue
        rna_seq_file_path = os.path.join(rna_seq_dir_path, rna_seq_file)
        (rna_family_name, extension) = os.path.splitext(rna_seq_file)
        mafft_xinsi_output_file_path = os.path.join(mafft_xinsi_dir_path,
                                                    rna_family_name + ".aln")
        mafft_xinsi_plus_consalifold_output_dir_path = os.path.join(
            mafft_xinsi_plus_consalifold_dir_path, rna_family_name)
        posterior_mafft_xinsi_plus_consalifold_output_dir_path = os.path.join(
            posterior_mafft_xinsi_plus_consalifold_dir_path, rna_family_name)
        if not os.path.isdir(mafft_xinsi_plus_consalifold_output_dir_path):
            os.mkdir(mafft_xinsi_plus_consalifold_output_dir_path)
        if not os.path.isdir(
                posterior_mafft_xinsi_plus_consalifold_output_dir_path):
            os.mkdir(posterior_mafft_xinsi_plus_consalifold_output_dir_path)
        consalifold_params.insert(
            0, (sub_thread_num, mafft_xinsi_output_file_path,
                mafft_xinsi_plus_consalifold_output_dir_path, False))
        posterior_consalifold_params.insert(
            0, (sub_thread_num, mafft_xinsi_output_file_path,
                posterior_mafft_xinsi_plus_consalifold_output_dir_path, True))
    # ConsAliFold's execution.
    pool = multiprocessing.Pool(int(num_of_threads / sub_thread_num))
    consalifold_results = pool.map(bench_consalifold, consalifold_params)
    consalifold_output_file_path = asset_dir_path + "/consalifold_running_times_turner.dat"
    write_consalifold_results(consalifold_results,
                              consalifold_output_file_path)
    data_turner = read_consalifold_results(consalifold_output_file_path)
    posterior_consalifold_results = pool.map(bench_consalifold,
                                             posterior_consalifold_params)
    posterior_consalifold_output_file_path = asset_dir_path + "/consalifold_running_times_posterior.dat"
    write_consalifold_results(posterior_consalifold_results,
                              posterior_consalifold_output_file_path)
    data_posterior = read_consalifold_results(
        posterior_consalifold_output_file_path)
    data = {
        "Running time (s)":
        data_turner + data_posterior,
        "Pair-matching probability inference method":
        ["ConsProb"] * len(data_turner) +
        ["LocARNA-P + our PCT"] * len(data_posterior)
    }
    data_frame = pandas.DataFrame(data=data)
    ax = seaborn.boxplot(x="Pair-matching probability inference method",
                         y="Running time (s)",
                         data=data_frame,
                         sym="")
    fig = ax.get_figure()
    fig.tight_layout()
    image_dir_path = asset_dir_path + "/images"
    if not os.path.exists(image_dir_path):
        os.mkdir(image_dir_path)
    fig.savefig(image_dir_path +
                "/consalifold_model_comparison_running_time.eps",
                bbox_inches="tight")
    fig.clf()
    print("Running time significance test: ",
          stats.ttest_rel(data_turner, data_posterior))
                        ) or experiment_name.startswith('original'):
                            print(
                                'this element is the baseline and we do not evaluate with respect to itself'
                            )
                        elif 'spsa' in experiment_name or 'zoo' in experiment_name:
                            print('SPSA and ZOO are not under evaluation')
                        else:
                            # Take the baseline and compare with the model
                            baseline = ttest_map[baseline]
                            actual_experiment = ttest_map[experiment_name]

                            base = []
                            test = []

                            for user_id in actual_experiment.keys():
                                base.append(baseline[user_id])
                                test.append(actual_experiment[user_id])

                            p = stats.ttest_rel(base, test).pvalue
                            star = ''
                            if p <= 0.05:
                                star = '*'
                            else:
                                star = '#'

                            line = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
                                dataset_name, an_metric, analyzed_k,
                                experiment_name, p, star)
                            f.writelines(line)
                            print(line)
# A researcher noted the number of chocolate chips consumed by 10 rats, with and without electrical stimulation.
# The data set s1 represents consumption with stimulation, and s2 without stimulation.
s1 = [12, 7, 3, 11, 8, 5, 14, 7, 9, 10]
s2 = [8, 7, 4, 14, 6, 7, 12, 5, 5, 8]

# Compute t-statistic for the above samples, and display the t-score and p-value in separate lines.
# Hint: Use the ttest_rel function available in scipy.
import numpy as np
from scipy import stats

a = np.array(s1)
b = np.array(s2)

## Checking with the internal scipy function
t2, p2 = stats.ttest_rel(a, b)
print("t = " + str(t2))
print("p = " + str(p2))
from scipy import stats
SIGNIFICANCE_LEVEL = 0.05

H0 = 'As the tree depth increases, the mean accuracies for both DT and BT remain the same i.e. their performance does not change with respect to each other.'
H1 = 'As the tree depth increases, the mean accuracies of DT != mean accuracies of BT i.e. their performance differs with respect to each other.'

print('H0: {}'.format(H0))
print('H1: {}'.format(H1))

# Accuracies from the 10-Fold Cross Validation for each depth and each model
dt_accs_depth_3 = [0.72, 0.77, 0.74, 0.69, 0.79, 0.75, 0.75, 0.76, 0.73, 0.7]
dt_accs_depth_5 = [0.75, 0.75, 0.72, 0.7, 0.78, 0.75, 0.72, 0.76, 0.75, 0.7]
dt_accs_depth_7 = [0.75, 0.73, 0.71, 0.68, 0.74, 0.75, 0.72, 0.78, 0.77, 0.7]
dt_accs_depth_9 = [0.76, 0.73, 0.71, 0.67, 0.73, 0.72, 0.71, 0.8, 0.74, 0.72]
dt_accs = [dt_accs_depth_3, dt_accs_depth_5, dt_accs_depth_7, dt_accs_depth_9]

bt_accs_depth_3 = [0.73, 0.77, 0.73, 0.7, 0.79, 0.75, 0.75, 0.76, 0.73, 0.7]
bt_accs_depth_5 = [0.74, 0.78, 0.72, 0.7, 0.78, 0.75, 0.75, 0.77, 0.73, 0.7]
bt_accs_depth_7 = [0.76, 0.78, 0.74, 0.72, 0.79, 0.75, 0.74, 0.77, 0.76, 0.7]
bt_accs_depth_9 = [0.75, 0.77, 0.75, 0.72, 0.76, 0.75, 0.75, 0.8, 0.76, 0.71]
bt_accs = [bt_accs_depth_3, bt_accs_depth_5, bt_accs_depth_7, bt_accs_depth_9]

depths = [3, 5, 7, 9]
for i in range(len(depths)):
    depth = depths[i]
    t_val, p_val = stats.ttest_rel(dt_accs[i], bt_accs[i])
    print(
        'Depth: {} H0 for DT and BT: t-statistics = {}, p-value = {} Reject with significance level of {}? {}'
        .format(depth, t_val, p_val, SIGNIFICANCE_LEVEL,
                (p_val < SIGNIFICANCE_LEVEL)))
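# One t-test is run per depth above; an illustrative (not original) Bonferroni
# adjustment would compare each p-value against alpha divided by the number of
# comparisons:
print('Bonferroni-adjusted significance level: {}'.format(
    SIGNIFICANCE_LEVEL / len(depths)))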
Example #53
def stats_stpsfmri(corrs1, corrs2, fisherz=True, permutation=False, iter=5000):

    """
    Conduct the statistical analysis for results of fMRI data (STPS searchlight)

    Parameters
    ----------
    corrs1 : array
        The correlation coefficients under condition1.
        The shape of corrs1 must be [n_subs, n_x, n_y, n_z]. n_subs, n_x, n_y, n_z represent the number of subjects,
        the number of calculation units for searchlight along the x, y, z axis.
    corrs2 : array
        The correlation coefficients under condition2.
        The shape of corrs2 must be [n_subs, n_x, n_y, n_z]. n_subs, n_x, n_y, n_z represent the number of subjects,
        the number of calculation units for searchlight along the x, y, z axis.
    fisherz : bool True or False. Default is True.
        Conduct Fisher-Z transform.
    permutation : bool True or False. Default is False.
        Use permutation test or not.
    iter : int. Default is 5000.
        The times for iteration.

    Returns
    -------
    stats : array
        The statistical results.
        The shape of stats is [n_x, n_y, n_z, 2]. n_x, n_y, n_z represent the number of calculation units for
        searchlight along the x, y, z axis and 2 represents a t-value and a p-value.

    Notes
    -----
    n_subs must be >= 6.
    """

    if len(np.shape(corrs1)) != 4 or len(np.shape(corrs2)) != 4 or np.shape(corrs1)[1] != np.shape(corrs2)[1] \
            or np.shape(corrs1)[2] != np.shape(corrs2)[2] or np.shape(corrs1)[3] != np.shape(corrs2)[3]:

        return "Invalid input!"

    # get the number of subjects
    subs = np.shape(corrs1)[0]

    # subs>=6
    if subs < 6:
        return print("the number of subjects is too small!")

    # get the number of the calculation units in the x, y, z directions
    n_x, n_y, n_z = np.shape(corrs1)[1:]

    # initialize the corrs
    stats = np.zeros([n_x, n_y, n_z, 2], dtype=float)

    # get r-map
    rs1 = corrs1
    rs2 = corrs2

    # calculate the statistical results
    for i in range(n_x):
        for j in range(n_y):
            for k in range(n_z):

                if fisherz is True:

                    # Fisher r to z
                    zs1 = 0.5 * np.log((1 + rs1) / (1 - rs1))
                    zs2 = 0.5 * np.log((1 + rs2) / (1 - rs2))

                # t test
                stats[i, j, k] = ttest_rel(rs1[:, i, j, k], rs2[:, i, j, k])

                if permutation == True:
                    stats[i, j, k, 1] = permutation_test(zs1[:, i, j, k], zs2[:, i, j, k], iter=iter)

    return stats
Example #54
while (count2 > 30):
    r = random.randint(0, count2 - 1)
    sampled_singularity_df = sampled_singularity_df.drop(
        sampled_singularity_df.index[r])
    count2 = count2 - 1
print('After Sampling:\n')
print(sampled_native_df.describe())
print(sampled_singularity_df.describe())
#df = native_df.merge(singularity_df, how='left')
#print(df.describe())
print('significance level:\t 0.05\n')
print('degrees of freedom:\t ~60\n')
print('Critical t-val:\t 2.0\n')

t_val_rel = stats.ttest_rel(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()

sampled_native_df.plot(kind='hist',
                       y='Native Runtime (Seconds)',
                       color='red',
                       ax=ax)
sampled_singularity_df.plot(kind='hist',
                            y='Singularity Runtime (Seconds)',
Example #55
# Data stored in form of xlsx with contents:
"""
   group  data
0      1    34
1      1    37
2      1    28
3      1    36
4      1    30
5      2    43
6      2    45
7      2    47
8      2    49
9      2    39
"""

# Assume these data are paired sample.

# ------------------------------------------------------------------------------

import pandas as pd
from scipy.stats import ttest_rel

IS_t_test = pd.read_excel('E:\\IS_t_test.xlsx')

Group1 = IS_t_test[IS_t_test['group'] == 1]['data']
Group2 = IS_t_test[IS_t_test['group'] == 2]['data']

print(ttest_rel(Group1, Group2))
"""
(-5.6873679190073361, 0.00471961872448184)
"""
# The first element from output is the value of t
# The second element from output is p-value
Example #56
df1 = pd.DataFrame({'sample':[6,5,5,4,6,7,6,4,5,6,4,5,5,6,4,8,6,5,6,7]})
df2 = pd.DataFrame({'sample':[7,5,7,8,7,8,8,5,7,6,5,5,6,6,5,7,9,7,7,8]})
t_result = stats.ttest_ind(df1, df2)
t, p = t_result.statistic.round(3), t_result.pvalue.round(3)
print("2-Sample t-test")
print("t검정통계량:{}".format(t))
print("p-value:{}".format(p))


# In[14]:


# page 67: t-test practice
df1 = pd.DataFrame({'before':[720,589,780,648,720,589,780,648,780,648]})
df2 = pd.DataFrame({'after':[810,670,790,712,810,670,790,712,790,712]})
t_test = stats.ttest_rel(df1, df2)
t, p = t_test.statistic.round(3), t_test.pvalue.round(3)
print("paired t-test")
print("t:{}".format(t))
print("p:{}".format(p))


# In[15]:


# page 68: paired t-test practice
df1 = pd.DataFrame({'before':[720,589,780,648,720,589,780,648,780,648]})
df2 = pd.DataFrame({'after':[710,580,787,712,750,600,782,670,790,680]})
t_test = stats.ttest_rel(df1, df2)
t, p = t_test.statistic.round(3), t_test.pvalue.round(3)
print("paired t-test")
Example #57
    data1 = df_eval1['r_avg'][:400].to_numpy()
    data2 = df_eval2['r_avg'][:400].to_numpy()
    data3 = df_eval3['r_avg'][:400].to_numpy()
    stat, p = stats.f_oneway(data1, data2, data3)
    print('P-Value R_AVG: %0.5f %0.5f' % (stat, p))

    data1 = df_eval1['f1_avg'][:400].to_numpy()
    data2 = df_eval2['f1_avg'][:400].to_numpy()
    data3 = df_eval3['f1_avg'][:400].to_numpy()
    stat, p = stats.f_oneway(data1, data2, data3)
    print('P-Value F1_AVG: %0.5f %0.5f' % (stat, p))

    print("--- T-TEST Metode Pemilihan Kalimat ---")
    data1 = df_eval3['jc_avg'][:400].to_numpy()
    data2 = df_st_eval3['jc_avg'][:400].to_numpy()
    stat, p = stats.ttest_rel(data1, data2)
    print('P-Value JC_AVG: %0.5f %0.5f' % (stat, p))

    data1 = df_eval3['p_avg'][:400].to_numpy()
    data2 = df_st_eval3['p_avg'][:400].to_numpy()
    stat, p = stats.ttest_rel(data1, data2)
    print('P-Value P_AVG: %0.5f %0.5f' % (stat, p))

    data1 = df_eval3['r_avg'][:400].to_numpy()
    data2 = df_st_eval3['r_avg'][:400].to_numpy()
    stat, p = stats.ttest_rel(data1, data2)
    print('P-Value R_AVG: %0.5f %0.5f' % (stat, p))

    data1 = df_eval3['f1_avg'][:400].to_numpy()
    data2 = df_st_eval3['f1_avg'][:400].to_numpy()
    stat, p = stats.ttest_rel(data1, data2)
def main():
    (current_work_dir_path, asset_dir_path, program_dir_path,
     conda_program_dir_path) = utils.get_dir_paths()
    num_of_threads = multiprocessing.cpu_count()
    mafft_plus_consalifold_ppvs = []
    mafft_plus_consalifold_senss = []
    mafft_plus_consalifold_fprs = []
    mafft_plus_consalifold_f1_scores = []
    mafft_plus_consalifold_mccs = []
    probcons_plus_consalifold_ppvs = []
    probcons_plus_consalifold_senss = []
    probcons_plus_consalifold_fprs = []
    probcons_plus_consalifold_f1_scores = []
    probcons_plus_consalifold_mccs = []
    clustalw_plus_consalifold_ppvs = []
    clustalw_plus_consalifold_senss = []
    clustalw_plus_consalifold_fprs = []
    clustalw_plus_consalifold_f1_scores = []
    clustalw_plus_consalifold_mccs = []
    mafft_xinsi_plus_consalifold_ppvs = []
    mafft_xinsi_plus_consalifold_senss = []
    mafft_xinsi_plus_consalifold_fprs = []
    mafft_xinsi_plus_consalifold_f1_scores = []
    mafft_xinsi_plus_consalifold_mccs = []
    ref_sa_plus_consalifold_ppvs = []
    ref_sa_plus_consalifold_senss = []
    ref_sa_plus_consalifold_fprs = []
    ref_sa_plus_consalifold_f1_scores = []
    ref_sa_plus_consalifold_mccs = []
    contra_probcons_plus_consalifold_ppvs = []
    contra_probcons_plus_consalifold_senss = []
    contra_probcons_plus_consalifold_fprs = []
    contra_probcons_plus_consalifold_f1_scores = []
    contra_probcons_plus_consalifold_mccs = []
    contra_clustalw_plus_consalifold_ppvs = []
    contra_clustalw_plus_consalifold_senss = []
    contra_clustalw_plus_consalifold_fprs = []
    contra_clustalw_plus_consalifold_f1_scores = []
    contra_clustalw_plus_consalifold_mccs = []
    contra_mafft_plus_consalifold_ppvs = []
    contra_mafft_plus_consalifold_senss = []
    contra_mafft_plus_consalifold_fprs = []
    contra_mafft_plus_consalifold_f1_scores = []
    contra_mafft_plus_consalifold_mccs = []
    contra_mafft_xinsi_plus_consalifold_ppvs = []
    contra_mafft_xinsi_plus_consalifold_senss = []
    contra_mafft_xinsi_plus_consalifold_fprs = []
    contra_mafft_xinsi_plus_consalifold_f1_scores = []
    contra_mafft_xinsi_plus_consalifold_mccs = []
    contra_ref_sa_plus_consalifold_ppvs = []
    contra_ref_sa_plus_consalifold_senss = []
    contra_ref_sa_plus_consalifold_fprs = []
    contra_ref_sa_plus_consalifold_f1_scores = []
    contra_ref_sa_plus_consalifold_mccs = []
    gammas = [2.**i for i in range(min_gamma, max_gamma + 1)]
    rna_fam_dir_path = asset_dir_path + "/compiled_rna_fams_test"
    ref_sa_dir_path = asset_dir_path + "/ref_sas_test"
    mafft_plus_consalifold_css_dir_path = asset_dir_path + "/mafft_plus_consalifold"
    probcons_plus_consalifold_css_dir_path = asset_dir_path + "/probcons_plus_consalifold"
    clustalw_plus_consalifold_css_dir_path = asset_dir_path + "/clustalw_plus_consalifold"
    mafft_xinsi_plus_consalifold_css_dir_path = asset_dir_path + "/mafft_xinsi_plus_consalifold"
    ref_sa_plus_consalifold_css_dir_path = asset_dir_path + "/ref_sa_plus_consalifold"
    contra_probcons_plus_consalifold_css_dir_path = asset_dir_path + "/contra_probcons_plus_consalifold"
    contra_clustalw_plus_consalifold_css_dir_path = asset_dir_path + "/contra_clustalw_plus_consalifold"
    contra_mafft_plus_consalifold_css_dir_path = asset_dir_path + "/contra_mafft_plus_consalifold"
    contra_mafft_xinsi_plus_consalifold_css_dir_path = asset_dir_path + "/contra_mafft_xinsi_plus_consalifold"
    contra_ref_sa_plus_consalifold_css_dir_path = asset_dir_path + "/contra_ref_sa_plus_consalifold"
    pool = multiprocessing.Pool(num_of_threads)
    for gamma in gammas:
        mafft_plus_consalifold_count_params = []
        clustalw_plus_consalifold_count_params = []
        mafft_xinsi_plus_consalifold_count_params = []
        ref_sa_plus_consalifold_count_params = []
        probcons_plus_consalifold_count_params = []
        contra_probcons_plus_consalifold_count_params = []
        contra_clustalw_plus_consalifold_count_params = []
        contra_mafft_plus_consalifold_count_params = []
        contra_mafft_xinsi_plus_consalifold_count_params = []
        contra_ref_sa_plus_consalifold_count_params = []
        gamma_str = str(gamma) if gamma < 1 else str(int(gamma))
        for rna_fam_file in os.listdir(rna_fam_dir_path):
            if not rna_fam_file.endswith(".fa"):
                continue
            rna_seq_file_path = os.path.join(rna_fam_dir_path, rna_fam_file)
            rna_seq_lens = [
                len(rna_seq.seq)
                for rna_seq in SeqIO.parse(rna_seq_file_path, "fasta")
            ]
            num_of_rnas = len(rna_seq_lens)
            (rna_fam_name, extension) = os.path.splitext(rna_fam_file)
            ref_css_file_path = os.path.join(ref_sa_dir_path,
                                             rna_fam_name + ".sth")
            ref_css = utils.get_css(ref_css_file_path)
            mafft_plus_consalifold_estimated_css_dir_path = os.path.join(
                mafft_plus_consalifold_css_dir_path, rna_fam_name)
            probcons_plus_consalifold_estimated_css_dir_path = os.path.join(
                probcons_plus_consalifold_css_dir_path, rna_fam_name)
            clustalw_plus_consalifold_estimated_css_dir_path = os.path.join(
                clustalw_plus_consalifold_css_dir_path, rna_fam_name)
            mafft_xinsi_plus_consalifold_estimated_css_dir_path = os.path.join(
                mafft_xinsi_plus_consalifold_css_dir_path, rna_fam_name)
            ref_sa_plus_consalifold_estimated_css_dir_path = os.path.join(
                ref_sa_plus_consalifold_css_dir_path, rna_fam_name)
            contra_probcons_plus_consalifold_estimated_css_dir_path = os.path.join(
                contra_probcons_plus_consalifold_css_dir_path, rna_fam_name)
            contra_clustalw_plus_consalifold_estimated_css_dir_path = os.path.join(
                contra_clustalw_plus_consalifold_css_dir_path, rna_fam_name)
            contra_mafft_plus_consalifold_estimated_css_dir_path = os.path.join(
                contra_mafft_plus_consalifold_css_dir_path, rna_fam_name)
            contra_mafft_xinsi_plus_consalifold_estimated_css_dir_path = os.path.join(
                contra_mafft_xinsi_plus_consalifold_css_dir_path, rna_fam_name)
            contra_ref_sa_plus_consalifold_estimated_css_dir_path = os.path.join(
                contra_ref_sa_plus_consalifold_css_dir_path, rna_fam_name)
            mafft_plus_consalifold_estimated_css_file_path = os.path.join(
                mafft_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                mafft_plus_consalifold_estimated_css_file_path)
            mafft_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            probcons_plus_consalifold_estimated_css_file_path = os.path.join(
                probcons_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                probcons_plus_consalifold_estimated_css_file_path)
            probcons_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            clustalw_plus_consalifold_estimated_css_file_path = os.path.join(
                clustalw_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                clustalw_plus_consalifold_estimated_css_file_path)
            clustalw_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            mafft_xinsi_plus_consalifold_estimated_css_file_path = os.path.join(
                mafft_xinsi_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                mafft_xinsi_plus_consalifold_estimated_css_file_path)
            mafft_xinsi_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            ref_sa_plus_consalifold_estimated_css_file_path = os.path.join(
                ref_sa_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                ref_sa_plus_consalifold_estimated_css_file_path)
            ref_sa_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            contra_probcons_plus_consalifold_estimated_css_file_path = os.path.join(
                contra_probcons_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                contra_probcons_plus_consalifold_estimated_css_file_path)
            contra_probcons_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            contra_clustalw_plus_consalifold_estimated_css_file_path = os.path.join(
                contra_clustalw_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                contra_clustalw_plus_consalifold_estimated_css_file_path)
            contra_clustalw_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            contra_mafft_plus_consalifold_estimated_css_file_path = os.path.join(
                contra_mafft_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                contra_mafft_plus_consalifold_estimated_css_file_path)
            contra_mafft_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            contra_mafft_xinsi_plus_consalifold_estimated_css_file_path = os.path.join(
                contra_mafft_xinsi_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                contra_mafft_xinsi_plus_consalifold_estimated_css_file_path)
            contra_mafft_xinsi_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
            contra_ref_sa_plus_consalifold_estimated_css_file_path = os.path.join(
                contra_ref_sa_plus_consalifold_estimated_css_dir_path,
                "gamma=" + gamma_str + ".sth")
            estimated_css = utils.get_css(
                contra_ref_sa_plus_consalifold_estimated_css_file_path)
            contra_ref_sa_plus_consalifold_count_params.insert(
                0, (rna_seq_lens, estimated_css, ref_css))
        results = pool.map(get_bin_counts,
                           probcons_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        probcons_plus_consalifold_ppvs.insert(0, ppv)
        probcons_plus_consalifold_senss.insert(0, sens)
        probcons_plus_consalifold_fprs.insert(0, fpr)
        probcons_plus_consalifold_f1_scores.append(f1_score)
        probcons_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           clustalw_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        clustalw_plus_consalifold_ppvs.insert(0, ppv)
        clustalw_plus_consalifold_senss.insert(0, sens)
        clustalw_plus_consalifold_fprs.insert(0, fpr)
        clustalw_plus_consalifold_f1_scores.append(f1_score)
        clustalw_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts, mafft_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        mafft_plus_consalifold_ppvs.insert(0, ppv)
        mafft_plus_consalifold_senss.insert(0, sens)
        mafft_plus_consalifold_fprs.insert(0, fpr)
        mafft_plus_consalifold_f1_scores.append(f1_score)
        mafft_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           mafft_xinsi_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        mafft_xinsi_plus_consalifold_ppvs.insert(0, ppv)
        mafft_xinsi_plus_consalifold_senss.insert(0, sens)
        mafft_xinsi_plus_consalifold_fprs.insert(0, fpr)
        mafft_xinsi_plus_consalifold_f1_scores.append(f1_score)
        mafft_xinsi_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           ref_sa_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        ref_sa_plus_consalifold_ppvs.insert(0, ppv)
        ref_sa_plus_consalifold_senss.insert(0, sens)
        ref_sa_plus_consalifold_fprs.insert(0, fpr)
        ref_sa_plus_consalifold_f1_scores.append(f1_score)
        ref_sa_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           contra_probcons_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        contra_probcons_plus_consalifold_ppvs.insert(0, ppv)
        contra_probcons_plus_consalifold_senss.insert(0, sens)
        contra_probcons_plus_consalifold_fprs.insert(0, fpr)
        contra_probcons_plus_consalifold_f1_scores.append(f1_score)
        contra_probcons_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           contra_clustalw_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        contra_clustalw_plus_consalifold_ppvs.insert(0, ppv)
        contra_clustalw_plus_consalifold_senss.insert(0, sens)
        contra_clustalw_plus_consalifold_fprs.insert(0, fpr)
        contra_clustalw_plus_consalifold_f1_scores.append(f1_score)
        contra_clustalw_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           contra_mafft_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        contra_mafft_plus_consalifold_ppvs.insert(0, ppv)
        contra_mafft_plus_consalifold_senss.insert(0, sens)
        contra_mafft_plus_consalifold_fprs.insert(0, fpr)
        contra_mafft_plus_consalifold_f1_scores.append(f1_score)
        contra_mafft_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           contra_mafft_xinsi_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        contra_mafft_xinsi_plus_consalifold_ppvs.insert(0, ppv)
        contra_mafft_xinsi_plus_consalifold_senss.insert(0, sens)
        contra_mafft_xinsi_plus_consalifold_fprs.insert(0, fpr)
        contra_mafft_xinsi_plus_consalifold_f1_scores.append(f1_score)
        contra_mafft_xinsi_plus_consalifold_mccs.append(mcc)
        results = pool.map(get_bin_counts,
                           contra_ref_sa_plus_consalifold_count_params)
        ppv, sens, fpr, f1_score, mcc = get_metrics(final_sum(results))
        contra_ref_sa_plus_consalifold_ppvs.insert(0, ppv)
        contra_ref_sa_plus_consalifold_senss.insert(0, sens)
        contra_ref_sa_plus_consalifold_fprs.insert(0, fpr)
        contra_ref_sa_plus_consalifold_f1_scores.append(f1_score)
        contra_ref_sa_plus_consalifold_mccs.append(mcc)
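    # All gammas processed: create the image directory and plot the metrics
    # averaged over gammas, comparing the Turner and CONTRAfold models.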
    image_dir_path = asset_dir_path + "/images"
    if not os.path.exists(image_dir_path):
        os.mkdir(image_dir_path)
    consalifold_avg_mccs = [
        numpy.mean(clustalw_plus_consalifold_mccs),
        numpy.mean(mafft_plus_consalifold_mccs),
        numpy.mean(probcons_plus_consalifold_mccs),
        numpy.mean(mafft_xinsi_plus_consalifold_mccs),
        numpy.mean(ref_sa_plus_consalifold_mccs)
    ]
    contra_consalifold_avg_mccs = [
        numpy.mean(contra_clustalw_plus_consalifold_mccs),
        numpy.mean(contra_mafft_plus_consalifold_mccs),
        numpy.mean(contra_probcons_plus_consalifold_mccs),
        numpy.mean(contra_mafft_xinsi_plus_consalifold_mccs),
        numpy.mean(contra_ref_sa_plus_consalifold_mccs)
    ]
    avg_mccs = consalifold_avg_mccs + contra_consalifold_avg_mccs
    data = {
        "Average Matthews correlation coefficient":
        avg_mccs,
        "Alignment probability inference method":
        ["Turner"] * 5 + ["CONTRAfold"] * 5,
        "Sequence alignment source":
        ["ClustalW", "MAFFT", "ProbCons-RNA ", "MAFFT X-INS-i", "Reference"] *
        2
    }
    data_frame = pandas.DataFrame(data=data)
    ax = seaborn.barplot(x="Sequence alignment source",
                         y="Average Matthews correlation coefficient",
                         hue="Alignment probability inference method",
                         data=data_frame)
    ax.legend_.remove()
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(image_dir_path + "/consalifold_model_comparison_mcc_2.eps",
                bbox_inches="tight")
    fig.clf()
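    # Repeat the same grouped bar plot for the average F1 scores.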
    consalifold_avg_f1_scores = [
        numpy.mean(clustalw_plus_consalifold_f1_scores),
        numpy.mean(mafft_plus_consalifold_f1_scores),
        numpy.mean(probcons_plus_consalifold_f1_scores),
        numpy.mean(mafft_xinsi_plus_consalifold_f1_scores),
        numpy.mean(ref_sa_plus_consalifold_f1_scores)
    ]
    contra_consalifold_avg_f1_scores = [
        numpy.mean(contra_clustalw_plus_consalifold_f1_scores),
        numpy.mean(contra_mafft_plus_consalifold_f1_scores),
        numpy.mean(contra_probcons_plus_consalifold_f1_scores),
        numpy.mean(contra_mafft_xinsi_plus_consalifold_f1_scores),
        numpy.mean(contra_ref_sa_plus_consalifold_f1_scores)
    ]
    avg_f1_scores = consalifold_avg_f1_scores + contra_consalifold_avg_f1_scores
    data = {
        "Average F1 score":
        avg_f1_scores,
        "Alignment probability inference method":
        ["Turner"] * 5 + ["CONTRAfold"] * 5,
        "Sequence alignment source":
        ["ClustalW", "MAFFT", "ProbCons-RNA ", "MAFFT X-INS-i", "Reference"] *
        2
    }
    data_frame = pandas.DataFrame(data=data)
    ax = seaborn.barplot(x="Sequence alignment source",
                         y="Average F1 score",
                         hue="Alignment probability inference method",
                         data=data_frame)
    pyplot.ylim(0, 0.75)
    ax.legend(loc="upper left")
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(image_dir_path +
                "/consalifold_model_comparison_f1_score_2.eps",
                bbox_inches="tight")
    fig.clf()
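    # Pool the per-gamma metrics across all aligners so the two scoring
    # models can be compared with paired t-tests.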
    consalifold_mccs = clustalw_plus_consalifold_mccs + mafft_plus_consalifold_mccs + probcons_plus_consalifold_mccs + mafft_xinsi_plus_consalifold_mccs + ref_sa_plus_consalifold_mccs
    contra_consalifold_mccs = contra_clustalw_plus_consalifold_mccs + contra_mafft_plus_consalifold_mccs + contra_probcons_plus_consalifold_mccs + contra_mafft_xinsi_plus_consalifold_mccs + contra_ref_sa_plus_consalifold_mccs
    consalifold_f1_scores = clustalw_plus_consalifold_f1_scores + mafft_plus_consalifold_f1_scores + probcons_plus_consalifold_f1_scores + mafft_xinsi_plus_consalifold_f1_scores + ref_sa_plus_consalifold_f1_scores
    contra_consalifold_f1_scores = contra_clustalw_plus_consalifold_f1_scores + contra_mafft_plus_consalifold_f1_scores + contra_probcons_plus_consalifold_f1_scores + contra_mafft_xinsi_plus_consalifold_f1_scores + contra_ref_sa_plus_consalifold_f1_scores
    print("MCC-based paired t-test:",
          stats.ttest_rel(consalifold_mccs, contra_consalifold_mccs))
    print("F1 score-based paired t-test:",
          stats.ttest_rel(consalifold_f1_scores, contra_consalifold_f1_scores))
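A minimal sketch of how the paired-test output printed above can be read; the sample values are illustrative, not taken from the example:

from scipy import stats

# One MCC per dataset for each scoring model (toy numbers for illustration).
turner_mccs = [0.61, 0.58, 0.70, 0.64]
contrafold_mccs = [0.65, 0.60, 0.73, 0.66]

# ttest_rel pairs the i-th entries; `statistic` is the t-value and
# `pvalue` is the two-sided p-value of the mean paired difference.
result = stats.ttest_rel(turner_mccs, contrafold_mccs)
print("t = %.3f, p = %.4f" % (result.statistic, result.pvalue))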
Example #59
def compare_images(label_nii_filename,
                   image1_nii_filename,
                   image2_nii_filename,
                   requested_labels,
                   min_volume,
                   verbose_flag=False):

    # Load arrays

    label_nii = label_stats.read_nifti_file(label_nii_filename,
                                            'Label file does not exist')
    image1_nii = label_stats.read_nifti_file(image1_nii_filename,
                                             'Image file does not exist')
    image2_nii = label_stats.read_nifti_file(image2_nii_filename,
                                             'Image file does not exist')

    # Sanity checks: verify that the array sizes and dimensions are compatible

    # get_fdata() is the current nibabel accessor; get_data() was deprecated
    # and later removed.
    image1_array = image1_nii.get_fdata()
    image2_array = image2_nii.get_fdata()
    label_array = label_nii.get_fdata()

    label_stats.image_shape_check(image1_array)
    label_stats.image_shape_check(image2_array)

    if not image1_array.shape == image2_array.shape:
        sys.exit('Image arrays must have the same shape')

    if not len(label_array.shape) == 3:
        sys.exit('Only supports 3D label arrays')

    if not image1_array.shape[0:len(label_array.shape)] == label_array.shape:
        sys.exit(
            'Image array and label array do not have the same voxel dimensions'
        )

    # Find a set of acceptable labels

    labels = label_stats.get_labels(requested_labels, label_array)

    # Permute or expand the arrays so the volume index lies along the first dimension

    image1_array, nVolumes = label_stats.permute_image_array(image1_array)
    image2_array, nVolumes = label_stats.permute_image_array(image2_array)

    # Gather stats

    df_stats = pd.DataFrame(
        columns=('label_number', 'time_index', 'label_volume',
                 'boundary_image1_mean', 'boundary_image1_std',
                 'boundary_image1_min', 'boundary_image1_max',
                 'boundary_image2_mean', 'boundary_image2_std',
                 'boundary_image2_min', 'boundary_image2_max', 'image1_mean',
                 'image1_std', 'image1_min', 'image1_max', 'image2_mean',
                 'image2_std', 'image2_min', 'image2_max', 'scale',
                 'p_rel_scaled'))

    for ii, ii_label in enumerate(labels):

        mask = label_array == ii_label
        boundary_mask = binary_dilation(mask, structure=np.ones((3, 3, 3)))
        boundary_mask &= ~mask  # keep only the dilated shell; boolean '-' is not supported by modern NumPy

        label_volume = np.sum(mask)

        if label_volume >= min_volume:  # Only perform paired t-test for volumes of a minimum size

            for jj in range(0, nVolumes):

                # Calculate signal intensity of boundary pixels

                boundary_image1_mean, boundary_image1_std, boundary_image1_min, boundary_image1_max = label_stats.individual_image_stats(
                    image1_array[jj][boundary_mask])
                boundary_image2_mean, boundary_image2_std, boundary_image2_min, boundary_image2_max = label_stats.individual_image_stats(
                    image2_array[jj][boundary_mask])

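                # Ratio of boundary means used to scale image2 to image1's
                # intensity range (assumes a nonzero image2 boundary mean).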
                scale = boundary_image1_mean / boundary_image2_mean

                # Scale image to match boundary pixels

                image1_mean, image1_std, image1_min, image1_max = label_stats.individual_image_stats(
                    image1_array[jj][mask])
                image2_mean, image2_std, image2_min, image2_max = label_stats.individual_image_stats(
                    scale * image2_array[jj][mask])

                # Calculate paired t-test from region of interest across two images
                t_rel, p_rel_scaled = stats.ttest_rel(
                    image1_array[jj][mask], scale * image2_array[jj][mask])

                # Save stats
                image_array_stats = [
                    ii_label, jj, label_volume, boundary_image1_mean,
                    boundary_image1_std, boundary_image1_min,
                    boundary_image1_max, boundary_image2_mean,
                    boundary_image2_std, boundary_image2_min,
                    boundary_image2_max, image1_mean, image1_std, image1_min,
                    image1_max, image2_mean, image2_std, image2_min,
                    image2_max, scale, p_rel_scaled
                ]

                if verbose_flag:
                    print(image_array_stats)

                df_stats.loc[len(df_stats)] = image_array_stats

    return df_stats
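A hypothetical invocation of compare_images; the file names and label list are placeholders, not taken from the source:

# Placeholder inputs: compare two co-registered scans over labels 1-3,
# skipping any label smaller than 10 voxels.
stats_df = compare_images('labels.nii.gz', 'scan_a.nii.gz', 'scan_b.nii.gz',
                          requested_labels=[1, 2, 3], min_volume=10,
                          verbose_flag=True)
print(stats_df[['label_number', 'scale', 'p_rel_scaled']])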
Example #60
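# Note: the opening of this example is truncated in the source; the two
# appends below evidently run inside a loop pairing healthy and cancer rows,
# e.g. `for a, b in zip(healthy_rows, cancer_rows):` (hypothetical names).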
        healthy_without_Zeros.append(a[2:])
        cancer_without_Zeros.append(b[2:])
healthy_without_Zeros = [
    list(map(float, sublist)) for sublist in healthy_without_Zeros
]
cancer_without_Zeros = [
    list(map(float, sublist)) for sublist in cancer_without_Zeros
]
# print(cancer_without_Zeros)
# print(len(cancer_without_Zeros))
# print(Canser_names)

# Get the p-values when the samples are paired.
p_val_pair = []
for x, y in zip(healthy_without_Zeros, cancer_without_Zeros):
    p_val_pair.append(stats.ttest_rel(x, y).pvalue)

# Get the p-values when the samples are independent.
p_val_ind = []
for x, y in zip(healthy_without_Zeros, cancer_without_Zeros):
    p_val_ind.append(stats.ttest_ind(x, y).pvalue)
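# Note: ttest_rel assumes the two samples are paired measurements of the same
# subjects, while ttest_ind treats them as unrelated groups.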

# Apply the FDR (Benjamini-Hochberg) multiple-test correction to the paired samples.
# multipletests returns a tuple: (reject flags -- True where the hypothesis can
# be rejected at the given alpha, the corrected p-values, the corrected alpha
# for the Sidak method, and the corrected alpha for the Bonferroni method).
corrected_p_valpair_rej = multipletests(p_val_pair,
                                        alpha=0.05,
                                        method='fdr_bh')[0]
corrected_p_val_pair = multipletests(p_val_pair, alpha=0.05,