def in_out_mask_ttest_from_conn_map(wpth,rseedz, imsk, omsk):

    oldpth = pthswp(wpth)

    incde = codegen(6)
    outcde = codegen(6)

    print 'generating data masks....'
    inmsk = img_2_maskd_array(imsk)
    outmsk = img_2_maskd_array(omsk)

    print 'beginning tests'
    ind = []
    for i in range(len(rseedz)):
        ind.append((i+1))
    df = pandas.DataFrame(np.zeros((len(rseedz),4)),index = ind, columns =['t','p','wt','wp'])
    for i,seed in enumerate(rseedz):
        os.system('fslmaths %s -mas %s %s'%(seed,imsk,incde))
        os.system('fslmaths %s -mas %s %s'%(seed,omsk,outcde))
        invals = np.ma.masked_array(ni.load('%s.nii.gz'%(incde)).get_data(),mask = inmsk).flatten()
        outvals = np.ma.masked_array(ni.load('%s.nii.gz'%(outcde)).get_data(),mask = outmsk).flatten()
        t,p = st.ttest_ind(invals,outvals)
        wt,wp = st.ttest_ind(invals,outvals,equal_var = False)
        df.ix[(i+1),'t'] = t 
        df.ix[(i+1),'p'] = p
        df.ix[(i+1),'wt'] = wt
        df.ix[(i+1),'wp'] = wp
        print 'finished with seed %s'%(seed)

    os.system('rm %s* %s*'%(incde,outcde))
    os.chdir(oldpth)

    return df
Example #2
def test_continuous(a, b):
    # simple t-test
    try:
        p_value = stats.ttest_ind(a, b)[1]
    except:
        p_value = 1
    return p_value, "%.2g" % mean(a)
Example #3
def print_table():
    print >> sys.stderr, "printing table now"
    for i in range(len(Table[0])):
        a = bin2pos(i)
        if ifSlim:
            s = 0
            for j in range(len(Table)):
                s = s + Table[j][i]
            if s == 0:
                continue
        print a.chr, "\t", a.start, "\t", a.stop,
        if ifCompare:
            x = []
            y = []
            for j in range(case_number):
                x.append(Table[j][i])
            for j in range(case_number, len(Table)):
                y.append(Table[j][i])
            (t, p) = ttest_ind(x, y)
            print "\t", t, "\t", p,
        for j in range(len(Table)):
            if ifNormalize:
                print "\t%.2f" % Table[j][i],
            else:
                print "\t", Table[j][i],
        for j in range(len(AnnoTable)):
            print "\t", AnnoTable[j][i],
        print
Example #4
def permutation_ttest(W, B):
    p_value = permutation_test(W, B,
                               method='approximate',
                               num_rounds=100,
                               func=lambda W, B: stats.ttest_ind(W, B),
                               seed=0)
    return 1 if p_value < 0.05 else 0
Example #5
def _ttest(orig_score, rep_score, rpd=True, pbar=False):
    """

    @param orig_score: The original scores.
    @param rep_score: The reproduced/replicated scores.
    @param rpd: Boolean indicating if the evaluated runs are reproduced.
    @param pbar: Boolean value indicating if progress bar should be printed.
    @return: Generator with p-values.
    """
    if rpd:  # paired two-tailed t-test
        topic_scores_orig = topic_scores(orig_score)
        topic_scores_rep = topic_scores(rep_score)

        generator = tqdm(
            topic_scores_orig.items()) if pbar else topic_scores_orig.items()

        for measure, scores in generator:
            yield measure, ttest_rel(scores,
                                     topic_scores_rep.get(measure)).pvalue

    else:  # else unpaired two-tailed t-test
        topic_scores_orig = topic_scores(orig_score)
        topic_scores_rep = topic_scores(rep_score)

        generator = tqdm(
            topic_scores_orig.items()) if pbar else topic_scores_orig.items()

        for measure, scores in generator:
            yield measure, ttest_ind(scores,
                                     topic_scores_rep.get(measure)).pvalue
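# Hedged illustration of the paired-vs-unpaired choice made above: given per-topic
# scores of an original and a reproduced run (toy numbers), ttest_rel pairs the
# scores topic by topic, while ttest_ind treats the two lists as independent samples.
from scipy.stats import ttest_rel, ttest_ind

orig_map = [0.31, 0.28, 0.40, 0.35]   # per-topic MAP of the original run (toy data)
rep_map = [0.29, 0.30, 0.38, 0.36]    # per-topic MAP of the reproduced run (toy data)
print("paired   p =", ttest_rel(orig_map, rep_map).pvalue)
print("unpaired p =", ttest_ind(orig_map, rep_map).pvalue)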
Example #8
def jitter_MWU(values, start, mid, end):
    """
    RETURN A BETTER MIDPOINT, ACCOUNTING FOR t-test RESULTS
    """

    # ADD SOME CONSTRAINTS TO THE RANGE OF VALUES TESTED
    m_start = min(mid, max(start + MIN_POINTS, mid - JITTER))
    m_end = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if m_start == m_end:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(m_start, m_end))

    # MWU SCORES
    try:
        m_score = np.array([
            stats.mannwhitneyu(
                values[max(start, m - MAX_POINTS):m],
                values[m:min(end, m + MAX_POINTS)],
                use_continuity=True,
                alternative="two-sided",
            ) for m in mids
        ])

        t_score = np.array([
            stats.ttest_ind(
                values[max(start, m - MAX_POINTS):m],
                values[m:min(end, m + MAX_POINTS)],
                equal_var=False,
            ) for m in mids
        ])

    except Exception as e:
        e = Except.wrap(e)
        if "All numbers are identical" in e:
            return no_good_edge, no_good_edge, mids[0]
        raise e

    # TOTAL SUM-OF-SQUARES
    # DO NOT KNOW WHAT THIS WAS DOING
    # if m_start - start == 0:
    #     # WE CAN NOT OFFSET BY ONE, SO WE ADD A DUMMY VALUE
    #     v_prefix = np.array([np.nan] + list(not_right(cumSS(values[start:m_end]), 1)))
    # else:
    #     # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
    #     v_prefix = not_right(
    #         not_left(cumSS(values[start:m_end]), m_start - start - 1), 1
    #     )
    # v_suffix = not_right(cumSS(values[m_start:end][::-1])[::-1], end - m_end)
    # v_score = v_prefix + v_suffix
    # pvalue = np.sqrt(m_score[:, 1] * v_score)  # GEOMEAN OF SCORES

    # PICK LOWEST
    pvalue = np.sqrt(m_score[:, 1] * t_score[:, 1])
    best = np.argmin(pvalue)

    return Data(pvalue=m_score[best, 1]), Data(pvalue=t_score[best,
                                                              1]), mids[best]
Example #9
 def select(self, n, array, elems, result, ref):
     if n == 0:
         t2, p2 = stats.ttest_ind(elems, ref)
         result.append(p2)
     else:
         for i in range(len(array)):
             result = self.select(n - 1, array[i + 1:], elems + [array[i]],
                                  result, ref)
     return result
Example #10
 def perform_ttest_on_averages(self):
     """
     Performs t-test on the average cos-sim score of each user
     :return:
     """
     control_scores = self.get_avg_scores('control')
     patient_scores = self.get_avg_scores('patients')
     ttest = stats.ttest_ind(control_scores, patient_scores)
     return ttest
Example #11
def ttest_f(train, trainLabel, loop):
    (a, b) = np.shape(train)
    score = np.zeros(b)
    target = trainLabel
    for i in range(b):
        fea = train[:, i]
        val = ttest_ind(fea, target)[0]
        score[i] = val
    ranking = np.argsort(score)[::-1]
    return ranking[0:loop]
Example #12
def roi_ttest():
    """
    compare rsfc difference between ROIs
    scheme: hemi-separately network-wise
    """
    import numpy as np
    import pickle as pkl
    import pandas as pd
    from scipy.stats.stats import ttest_ind
    from cxy_hcp_ffa.lib.predefine import net2label_cole
    from commontool.stats import EffectSize

    # parameters
    hemis = ('lh', 'rh')
    roi_pair = ('pFus-face', 'mFus-face')
    data_file = pjoin(work_dir, 'rsfc_individual2Cole_{}.pkl')
    compare_name = f"{roi_pair[0].split('-')[0]}_vs_" \
                   f"{roi_pair[1].split('-')[0]}"

    # outputs
    out_file = pjoin(work_dir,
                     f"rsfc_individual2Cole_{compare_name}_ttest.csv")

    # start
    trg_names = list(net2label_cole.keys())
    trg_labels = list(net2label_cole.values())
    out_data = {'network': trg_names}
    es = EffectSize()
    for hemi in hemis:
        data = pkl.load(open(data_file.format(hemi), 'rb'))
        assert data['trg_label'] == trg_labels

        out_data[f'CohenD_{hemi}'] = []
        out_data[f't_{hemi}'] = []
        out_data[f'P_{hemi}'] = []
        for trg_idx, trg_name in enumerate(trg_names):
            sample1 = data[roi_pair[0]][:, trg_idx]
            sample2 = data[roi_pair[1]][:, trg_idx]
            nan_vec1 = np.isnan(sample1)
            nan_vec2 = np.isnan(sample2)
            print(f'#NAN in sample1:', np.sum(nan_vec1))
            print(f'#NAN in sample2:', np.sum(nan_vec2))
            sample1 = sample1[~nan_vec1]
            sample2 = sample2[~nan_vec2]
            d = es.cohen_d(sample1, sample2)
            t, p = ttest_ind(sample1, sample2)
            out_data[f'CohenD_{hemi}'].append(d)
            out_data[f't_{hemi}'].append(t)
            out_data[f'P_{hemi}'].append(p)

    # save out
    out_data = pd.DataFrame(out_data)
    out_data.to_csv(out_file, index=False)
Example #14
def ttest_variables_targetBinary(data, X, y=None, method='data'):
    """
    Calculates the effect of feature variables on target variable
    using ttest and save it on a DataFrame
    :param
        data - DataFrame with relevant columns or dictionary with statistics (mean, std)
        X - feature columns
        y - target variable name,  only binary y is accepted for now
        method - ttest to be used, either 'data' or 'statistics' (when only mean and std are available)
    :return
        result - DataFrame with column name, statistic value and p-value
    """
    result = pd.DataFrame(columns=['col', 'statistic_value', 'pvalue'])
    if method == 'data':
        assert isinstance(data, pd.DataFrame), "Data is not a DataFrame"
        assert y is not None, "Target variable name not provided"
        y_levels = data[y].unique()
        assert len(y_levels) == 2, "More than two levels in target variable"
        for col in X:
            data_col = data.dropna(subset=[col])
            level_0 = data_col.loc[data_col[y] == y_levels[0], col]
            level_1 = data_col.loc[data_col[y] == y_levels[1], col]
            stat, p = ttest_ind(level_0, level_1)
            to_append = pd.DataFrame([{
                'col': col,
                'statistic_value': stat,
                'pvalue': p
            }])
            result = result.append(to_append, sort=False, ignore_index=True)
    elif method == 'statistics':
        assert isinstance(data, dict), "Data is not a Dictionary"
        for col in X:
            assert col in data, "{} not in data".format(col)
            data_col = data[col]
            stats_ = ['mean1', 'mean2', 'std1', 'std2', 'nob1', 'nob2']
            for stat in stats_:
                assert stat in data_col, "{} not in {} data".format(stat, col)
            mean1, std1, nob1 = data_col['mean1'], data_col['std1'], data_col[
                'nob1']
            mean2, std2, nob2 = data_col['mean2'], data_col['std2'], data_col[
                'nob2']
            stat, p = ttest_ind_from_stats(mean1, std1, nob1, mean2, std2,
                                           nob2)
            to_append = pd.DataFrame([{
                'col': col,
                'statistic_value': stat,
                'pvalue': p
            }])
            result = result.append(to_append, sort=False, ignore_index=True)
    return result
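# Minimal usage sketch of the function above with toy data; the column names
# ('age', 'income', 'churn') and the summary numbers are made up for illustration.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'age': np.r_[np.random.normal(40, 5, 50), np.random.normal(45, 5, 50)],
    'income': np.r_[np.random.normal(30, 8, 50), np.random.normal(33, 8, 50)],
    'churn': [0] * 50 + [1] * 50,
})
print(ttest_variables_targetBinary(toy, X=['age', 'income'], y='churn', method='data'))

# The 'statistics' path needs only per-column summary statistics:
summary = {'age': {'mean1': 40.0, 'mean2': 45.0, 'std1': 5.0, 'std2': 5.0,
                   'nob1': 50, 'nob2': 50}}
print(ttest_variables_targetBinary(summary, X=['age'], method='statistics'))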
Example #15
def jitter_MWU(values, start, mid, end):
    # ADD SOME CONSTRAINTS TO THE RANGE OF VALUES TESTED
    m_start = min(mid, max(start + MIN_POINTS, mid - JITTER))
    m_end = max(mid, min(mid + JITTER, end - MIN_POINTS))
    if m_start == m_end:
        return no_good_edge, no_good_edge, mid
    mids = np.array(range(m_start, m_end))

    # MWU SCORES
    m_score = np.array(
        [
            stats.mannwhitneyu(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                use_continuity=True,
                alternative="two-sided",
            )
            for m in mids
        ]
    )

    t_score = np.array(
        [
            stats.ttest_ind(
                values[max(start, m - MAX_POINTS) : m],
                values[m : min(end, m + MAX_POINTS)],
                equal_var=False,
            )
            for m in mids
        ]
    )

    # TOTAL SUM-OF-SQUARES
    if m_start - start == 0:
        # WE CAN NOT OFFSET BY ONE, SO WE ADD A DUMMY VALUE
        v_prefix = np.array([np.nan] + list(not_right(cumSS(values[start:m_end]), 1)))
    else:
        # OFFSET BY ONE, WE WANT cumSS OF ALL **PREVIOUS** VALUES
        v_prefix = not_right(
            not_left(cumSS(values[start:m_end]), m_start - start - 1), 1
        )
    v_suffix = not_right(cumSS(values[m_start:end][::-1])[::-1], end - m_end)
    v_score = v_prefix + v_suffix

    # PICK LOWEST
    pvalue = np.sqrt(m_score[:, 1] * v_score)  # GEOMEAN OF SCORES
    best = np.argmin(pvalue)

    return Data(pvalue=m_score[best, 1]), Data(pvalue=t_score[best, 1]), mids[best]
Example #16
def t_test_ind(dataset, target_col, protected_col, equal_var=0):
    """
    performs the independent two-sample t-Test, or Welch's test if equality of the variances is not
    given

    @param dataset:
    @param target_col:      name of the column that contains the classifier results
    @param protected_col:   name of the column that contains the protection status
    @param equal_var:       if True, perform a standard independent 2 sample test that
                            assumes equal population variances. If False (default), perform Welch’s t-test,
                            which does not assume equal population variance

    @return: calculated t-statistic and two-tailed p-value

    """
    protected_targets = dataset.get_all_targets_of_group(
        target_col, protected_col, 1)
    nonprotected_targets = dataset.get_all_targets_of_group(
        target_col, protected_col, 0)
    return ttest_ind(protected_targets, nonprotected_targets, equal_var=equal_var)
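# The equal_var switch in a nutshell (toy arrays): Student's t pools the group
# variances, Welch's t does not, so the two can disagree when the variances differ.
from scipy.stats import ttest_ind

a = [12.1, 11.8, 12.5, 12.0, 11.9]
b = [10.0, 14.2, 9.5, 14.8, 10.1]
print('Student:', ttest_ind(a, b, equal_var=True))
print('Welch:  ', ttest_ind(a, b, equal_var=False))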
Example #17
    def transform(self, labels):

        ds = self.dataset
        conditions = self.conditions
        single_value = self.sample_value

        if conditions == single_value == None:
            raise ValueError()
        elif len(conditions) > 2:
            raise ValueError()

        if single_value != None:
            t, p = ttest_1samp(ds, single_value, axis=0)
            return t, p

        t, p = ttest_ind(ds[labels == conditions[0]],
                         ds[labels == conditions[1]],
                         axis=0)
        #print ds.shape
        #print t.shape
        t[np.isnan(t)] = 1
        return t
Example #18
 def run(self, labels):
     
     ds = self.dataset
     conditions = self.conditions
     single_value = self.sample_value
     
     if conditions == single_value == None:
         raise ValueError()
     elif len(conditions)>2:
         raise ValueError()
 
     if single_value != None:
         t, p = ttest_1samp(ds, single_value, axis=0)
         return t, p
 
 
     t, p = ttest_ind(ds[labels == conditions[0]],
                  ds[labels == conditions[1]],
                  axis=0
                  )
     #print ds.shape
     #print t.shape
     t[np.isnan(t)] = 1
     return t
Example #19
def main(symbol_dict):

    # Here's where I put it all together. Once the slope of each fund
    # have been calculated, I run a two-tailed independent t-test
    # comparing the sets of slopes for the two types of funds. The
    # test returns a tuple of the form (test statistic, p-value)

    # The results require us to set a statistical significance level (alpha).
    # (when I was analyzing astroparticle physics data, we used
    # alpha = 5.7x10^(-5), but for this, alpha = 0.05 or 0.01 should be
    # sufficient.) The results can be interpreted as follows:

    # p > alpha/2: There is insufficient evidence to reject the claim
    # that the two samples have the same mean (i.e. there is no difference
    # between the leveraged bonds and the emerging market stocks)

    # p < alpha/2 and t < 0: the average increase in value of an emerging
    # market fund is likely greater than that of a leveraged bond fund over the
    # same period

    # p < alpha/2 and t > 0: the average increase in value of an emerging
    # market fund is likely less than that of a leveraged bond fund over the
    # same period

    # Note: I am running a Welch's t-test rather than a Student's t-test
    # because there is no reason to assume that the two populations have equal
    # variances.

    slopes = {}
    for key in symbol_dict:
        for bond in symbol_dict[key]:
            pull_historical_data(bond, key)
        slopes[key] = get_slopes(symbol_dict[key], key)
    return stats.ttest_ind(slopes['US_bond'],
                           slopes['emerging_market'],
                           equal_var=False)
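# A small helper mirroring the interpretation rules spelled out in the comments above
# (two-sided p split by the sign of t). alpha = 0.05 is an illustrative choice, and the
# wording assumes the ttest_ind(US_bond, emerging_market) argument order used above.
def interpret_slope_test(t, p, alpha=0.05):
    if p >= alpha / 2:
        return 'insufficient evidence that the mean slopes differ'
    if t < 0:
        return 'emerging-market funds likely grew faster than leveraged bond funds'
    return 'leveraged bond funds likely grew faster than emerging-market funds'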
Example #20
def student_t_test_ind(approaches, accuracy_values, save_path):
    # calculate the two sided unpaired students t-test from scipy
    # it compare all approaches with each other
    # calculate the T-test for the means of two independent samples of scores
    student_t_test_ind_frame = pd.DataFrame()
    for i in range(len(approaches)):
        for j in range(i, len(approaches), 1):
            # iterate through approaches
            approach_i = approaches[i]
            approach_j = approaches[j]
            values_i = accuracy_values.loc[:, approach_i]
            values_j = accuracy_values.loc[:, approach_j]
            t_statistic, two_tailed_p_test = stats.ttest_ind(values_i, values_j)
            student_t_test_ind_frame.at[approach_i, approach_j] = two_tailed_p_test

    save_path.mkdir(parents=True, exist_ok=True)
    fig = plt.figure(figsize=(4, 2))
    ax = fig.subplots()
    ax = sns.heatmap(student_t_test_ind_frame, ax=ax, annot=True, fmt="0.3f", cmap="autumn", vmin=0, vmax=0.05)
    plt.xticks(rotation=45)
    fig.canvas.start_event_loop(sys.float_info.min)
    path = save_path / 'students-test_scipy_ind.png'
    fig.savefig(path, bbox_inches='tight', dpi=100)
    plt.close(fig)
Example #21
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				except:
					pvalue=1.0
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
Example #22
####Pearson test for correlations
print('Correlation test for continuous variables')
for i in range(0, len(continuous)):
    for j in range(i + 1, len(continuous)):
        print('corr between', continuous[i], 'and', continuous[j], ' : r=',
              round(pearsonr(stud[continuous[i]], stud[continuous[j]])[0], 3),
              ' p=',
              round(pearsonr(stud[continuous[i]], stud[continuous[j]])[1], 3))

####T-test for G3 mean: binary features
for var in binary:
    print('G3 mean analysis by ' + var)
    print(stud.groupby([var]).mean()['G3'])
    x = stud.where(stud[var] == np.unique(stud[var])[0]).dropna()['G3']
    y = stud.where(stud[var] == np.unique(stud[var])[1]).dropna()['G3']
    print(ttest_ind(x, y))
    plt.figure()
    sns.boxplot(x=var, y='G3', data=stud)
    plt.show()

#The null hypothesis is not rejected in the variables:
#famsize, parents cohabitation, scholar support, family support,
#paid, extracurricular activities, and nursery.

####Spearman rho test between G3 and ordinal features
for var in ordinal:
    print('Spearman rho test between G3 and' + ' ' + var)
    print(spearmanr(stud.G3, stud[var]))

#The null hypothesis of null correlation is not
#rejected in the variables: famrel
Example #23
        numsamples=500
        samp_size=10
        alpha=0.05
        print "\n",lith[f],round(np.mean(curRh),0)
        smd.append(SMD_analysis(curUCS,numsamples,samp_size,alpha))
        curax=plt.gca()
        if f==len(lith)-1:
            curax.set_xlabel("UCS, MPa")
        plt.setp(curax.get_yticklabels(), visible=False)
        plt.tight_layout()

    

    for f in range(len(lith)):
        cursmd=smd[f]
        print ""
        for g in range(f+1,len(lith)):
            comsmd=smd[g]

            T,pT=st.ttest_ind(cursmd,comsmd)
            KS,pKS=st.ks_2samp(cursmd,comsmd)
            print [lith[f], lith[g], round(pT,3), round(pKS,3)]
                          
                          
plt.show()
Example #24
plt.title('Difference between charges of a smoker and a non-smoker')
plt.show()
plt.close()
# No apparent relationship between gender and charges

###########################################################################
# 1. T-test to check the dependancy of smoking and charges
h0 = "Charges of smoker and non-smoker are the same"
h1 = "Charges of smoker and non-smoker are not the same"

# selecting charges corresponding to smokers as an array
x = np.array(insurance_df[insurance_df['smoker'] == 'yes']['charges'])
# selecting charges corresponding to non-smokers as an array
y = np.array(insurance_df[insurance_df['smoker'] == 'no']['charges'])

t, p_value = stats.ttest_ind(x, y, axis=0)

# For significance level of 5%

if p_value < 0.05:
    print(f'{h1} as the p_value {p_value.__round__(3)} < 0.05')
else:
    print(f'{h0} as the p_value {p_value.__round__(3)} > 0.05')

print(
    'Analysis: Charges of smoker and non-smoker are not the same as p_value < 0.05'
)
#############################################################################

# 2.  BMI of males differ from females significantly
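# Sketch of the announced second test: a two-sample t-test of BMI by gender, assuming
# insurance_df has 'sex' and 'bmi' columns (the column names are an assumption; they
# are not shown in the snippet above).
bmi_male = np.array(insurance_df[insurance_df['sex'] == 'male']['bmi'])
bmi_female = np.array(insurance_df[insurance_df['sex'] == 'female']['bmi'])
t_bmi, p_bmi = stats.ttest_ind(bmi_male, bmi_female, axis=0)
print(f'BMI by gender: t = {round(t_bmi, 3)}, p = {round(p_bmi, 3)}')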
Example #25
nTrialsSuffix = "-nTrials(1)";
wholeSuffix=  truncatedSuffix + controlledErrorSuffix + nTrialsSuffix;
# trainEvalData = [pickle.load(open(folder + "trainingSetIDs " + str(i) + "truncated-trainSetEvaluated.dat")) for i in setNumbers)];
simpleTestSampleData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-evaluated.dat")) for i in setNumbers];
simpleShuffledData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomizedRSEvaluated.dat")) for i in setNumbers];
simpleRandomANNData = [pickle.load(open("simpleData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomANNEvaluated.dat")) for i in setNumbers];

complexTestSampleData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-evaluated.dat")) for i in setNumbers];
complexShuffledData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomizedRSEvaluated.dat")) for i in setNumbers];
complexRandomANNData = [pickle.load(open("complexData/" + "trainingSetIDs " + str(i) + wholeSuffix + "-randomANNEvaluated.dat")) for i in setNumbers];


ind = np.arange(N);  # the x locations for the groups
width = 0.2;       # the width of the bars

print("Ttest simple Exp. vs Permuted", ttest_ind(simpleTestSampleData, simpleShuffledData));
print("Ttest simple Exp. vs Random", ttest_ind(simpleTestSampleData, simpleRandomANNData));


print("Ttest complex Exp. vs Permuted", ttest_ind(complexTestSampleData, complexShuffledData));
print("Ttest complex Exp. vs Random", ttest_ind(complexTestSampleData, complexRandomANNData));


fig, ax = plt.subplots();

x = [.15, .25, .35, .55, .65, .75]
point0 = ax.errorbar(x[2], np.mean(simpleTestSampleData),
			yerr = np.std(simpleTestSampleData), 
			fmt = 'o', markersize = 5, color = 'g', ecolor = 'g');
point1 = ax.errorbar(x[1], np.mean(simpleShuffledData),
			yerr = np.std(simpleShuffledData), 
Example #26
def poetry_analysis():

    poems_needed = [
        'i-will-sing-you-one-o.txt', 'place-for-a-third.txt',
        'the-runaway.txt', 'wild-grapes.txt', 'a-winter-eden.txt',
        'sitting-by-a-bush-in-broad-sunlight.txt', 'new-hampshire.txt',
        'pea-brush.txt', 'the-most-of-it.txt', 'the-times-table.txt'
    ]

    calculate_cli = []
    calculate_ari = []
    calculate_fkgl = []

    for file in poems_needed:
        with open(os.path.join("allpoems", file), "r") as infile:
            all_text = infile.readlines()
            entry = ''.join(all_text)
            split_words = entry.split()
            no_punc = re.sub(r"[^\w\d\s\.]*", "", entry)
            no_lines_punc = no_punc.replace('\n', ' ')

            ###########################################
            # CALCULATE NUMBER OF LINES
            ###########################################
            def count_lines():
                line_count = 0
                for line in entry:
                    if line == '\n':
                        line_count += 1
                return line_count

            ###########################################
            # CALCULATE NUMBER OF WORDS
            ###########################################
            def count_words():
                count = len(split_words)
                return (count)

            ###########################################
            # CALCULATE NUMBER OF CHARACTERS
            ###########################################
            def count_char():
                return len(no_lines_punc)

            ##########################################
            # CALCULATE NUMBER OF SYLLABLES IN POEMS
            #########################################
            no_newline = no_punc.replace("\n", " ")
            no_extra = re.sub(pattern=r"\s{2,}", repl=" ", string=no_newline)
            the_words = no_extra.split(
                ' ')  # a list of every word in the poem, for every poem
            word_count = len(the_words)
            syllables = 0

            for word in the_words:
                syllables += syllable_count(word)

            ##########################################
            # CALCULATE NUMBER OF FKGL, CLI, ARI
            ##########################################

            FKGL = float(0.39 * (count_words() / count_lines()) + 11.8 *
                         (syllables / count_words()) - 15.59)
            calculate_fkgl.append(FKGL)
            CLI = (5.89 * (count_char() / count_words())) - (
                0.3 * (count_lines() / count_words())) - 15.8
            calculate_cli.append(CLI)
            ARI = (4.71 * ((count_char() / count_words())) + 0.5 *
                   ((count_words() / count_lines())) - 21.43)
            calculate_ari.append(ARI)

    # ##########################################
    # # CREATING A CSV FILE
    # ##########################################

    df = pd.read_csv('poem_info.csv')
    df.to_csv('poem_data.csv', header=True, index=False)
    poem_data = pd.read_csv('poem_data.csv')
    poem_data = df.loc[[2, 7, 12, 13, 14, 17, 27, 29, 30, 32]]
    poem_data.rename(columns={'poemname': 'poemid'}, inplace=True)
    poem_data['fkgl'] = [float(item) for item in calculate_fkgl]
    poem_data['cli'] = [float(item) for item in calculate_cli]
    poem_data['ari'] = [float(item) for item in calculate_ari]
    poem_data.to_csv('poem_data.csv', header=True, index=False)

    # pprint(poem_data)

    def zprint(*args, **kwargs):
        print(*args, **kwargs, end='\n\n')

    # ##########################################
    # SHOW MEAN
    # ##########################################
    zprint('___MEANS___:\n', poem_data.groupby('poemsize').mean())

    # ##########################################
    # SHOW STATS
    # ##########################################
    small_medium = stats.ttest_ind(
        poem_data.query('poemsize=="small"')['cli'],
        poem_data.query('poemsize=="medium"')['cli'],
        equal_var=False)
    medium_large = stats.ttest_ind(
        poem_data.query('poemsize=="medium"')['cli'],
        poem_data.query('poemsize=="large"')['cli'],
        equal_var=False)
    small_large = stats.ttest_ind(poem_data.query('poemsize=="small"')['cli'],
                                  poem_data.query('poemsize=="large"')['cli'],
                                  equal_var=False)
    N = len(poem_data.index) - 1
    print('t({})={:0.2f}, p={:0.2f}'.format(N, small_medium.statistic,
                                            small_medium.pvalue))
    print('t({})={:0.2f}, p={:0.2f}'.format(N, medium_large.statistic,
                                            medium_large.pvalue))
    print('t({})={:0.2f}, p={:0.2f}'.format(N, small_large.statistic,
                                            small_large.pvalue))
    fig = seaborn.factorplot(x='poemsize',
                             y='ari',
                             data=poem_data,
                             kind='bar',
                             size=5)
    pyplot.show(fig)
Example #27
        s1 += [0]
    
    if rnd.random() < p2:
        s2 += [1]
    else:
        s2 += [0]

a1 = []
a2 = []
for i in xrange(0, num_tosses+1):
    a1 += [ average( s1[ : (i+1)] ) ]
    a2 += [ average( s2[ : (i+1)] ) ]

ps = []
for i in xrange(0, num_tosses+1):
    statistic, p = ttest_ind(s1[ : i+1], s2[ : i+1])
    ps += [p]

subplot(211)
line1 = plot(range(0, num_tosses+1), a1, color='blue')
line2 = plot(range(0, num_tosses+1), a2, color='black')
legend((line1, line2), ('p1', 'p2'))
axis([0, num_tosses, 0, 1])
xlabel('Number of tosses')
ylabel('Average heads')
title('Estimated probability of heads')

subplot(212)
pline = plot(range(0, num_tosses+1), ps, color='green')
threshold = plot(range(0, num_tosses+1), [0.05] * (num_tosses+1), color='red')
legend((pline, threshold), ('ttest', '0.05 threshold'))
Example #28
def plotExpBox_Main(inputFile,header,cols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!


	fin=generic_istream(inputFile)
	
	plotData=[]	
	xtickLabels=[]
	for col in cols:
		plotData.append([])
		xtickLabels.append(header[col])

	colIndices=range(0,len(cols))

	lino=0
	for lin in fin:
		lino+=1
		if lino<startRow:
			continue		
		fields=lin.rstrip("\r\n").split(sep)
		
		for idx,col in zip(colIndices,cols):
			try:
				value=float(fields[col])			
				plotData[idx].append(value)
			except:
				pass		
	fin.close()

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)


	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				print >> stdout,pvalue, #mann-whiteney need to mul by 2 (one tail to two tail)
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)

	if plotPvalueCluster:
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Example #29
def dropout_pred(model,
                 ref,
                 ref_rc,
                 alt,
                 alt_rc,
                 mutation_positions,
                 out_annotation_all_outputs,
                 output_filter_mask=None,
                 out_annotation=None,
                 dropout_iterations=30):
    """Dropout-based variant effect prediction

        This method is based on the ideas in [Gal et al.](https://arxiv.org/pdf/1506.02142.pdf) where dropout
        layers are also activated in the model prediction phase in order to estimate model uncertainty. The
        advantage of this method is that instead of a point estimate of the model output the distribution of
        the model output is estimated.

        # Arguments
            model: Keras model
            ref: Input sequence with the reference genotype in the mutation position
            ref_rc: Reverse complement of the 'ref' argument
            alt: Input sequence with the alternative genotype in the mutation position
            alt_rc: Reverse complement of the 'alt' argument
            mutation_positions: Position on which the mutation was placed in the forward sequences
            out_annotation_all_outputs: Output labels of the model.
            output_filter_mask: Mask of boolean values indicating which model outputs should be used.
                Use this or 'out_annotation'
            out_annotation: List of outputs labels for which of the outputs (in case of a multi-task model) the
                predictions should be calculated.
            dropout_iterations: Number of prediction iterations to be performed in order to estimate the
                output distribution. Values greater than 30 are recommended to get a reliable p-value.

        # Returns

            Dictionary with a set of measures of the model uncertainty in the variant position. The ones of interest are:

            - do_{ref, alt}_mean: Mean of the model predictions given the respective input sequence and dropout.
                - Forward or reverse-complement sequences are chosen as for 'do_pv'.
            - do_{ref, alt}_var: Variance of the model predictions given the respective input sequence and dropout.
                - Forward or reverse-complement sequences are chosen as for 'do_pv'.
            - do_diff: 'do_alt_mean' - 'do_ref_mean', which is an estimate similar to ISM using diff_type "diff".
            - do_pv: P-value of a paired t-test, comparing the predictions of ref with the ones of alt. Forward or
                - reverse-complement sequences are chosen based on which pair has the lower p-value.
        """
    prefix = "do"

    seqs = {"ref": ref, "ref_rc": ref_rc, "alt": alt, "alt_rc": alt_rc}

    assert np.all([
        np.array(get_seq_len(ref)) == np.array(get_seq_len(seqs[k]))
        for k in seqs.keys() if k != "ref"
    ])
    assert get_seq_len(ref)[0] == mutation_positions.shape[0]
    assert len(mutation_positions.shape) == 1

    # determine which outputs should be selected
    if output_filter_mask is None:
        if out_annotation is None:
            output_filter_mask = np.arange(out_annotation_all_outputs.shape[0])
        else:
            output_filter_mask = np.where(
                np.in1d(out_annotation_all_outputs, out_annotation))[0]

    # make sure the labels are assigned correctly
    out_annotation = out_annotation_all_outputs[output_filter_mask]

    # Instead of loading the model from a json file I will transfer the model architecture + weights in memory
    model_config = model._updated_config()
    alt_config = replace_dict_values(model_config, u"Dropout", u"BiDropout")

    # Custom objects have to be registered beforehand for this to work correctly!
    alt_model = keras.layers.deserialize(alt_config)

    # Transfer weights and biases
    alt_model.set_weights(model.get_weights())

    # ANALOGOUS TO ISM:
    # predict
    preds = {}
    for k in seqs:
        preds[k] = pred_do(alt_model,
                           seqs[k],
                           output_filter_mask=output_filter_mask,
                           dropout_iterations=dropout_iterations)

    t, prob = ttest_ind(preds["ref"], preds["alt"], axis=0)
    t_rc, prob_rc = ttest_ind(preds["ref_rc"], preds["alt_rc"], axis=0)

    logit_prob = None
    logit_prob_rc = None
    pred_range = get_range(preds)
    # In case the predictions are bound to [0,1] it might make sense to use logit on the data, as the model output
    # could be probabilities
    if np.all([(pred_range[k] >= 0) and (pred_range[k] <= 1)
               for k in pred_range]):
        logit_preds = apply_over_single(preds, logit)
        logit_prob = apply_over_double(logit_preds["ref"],
                                       logit_preds["alt"],
                                       apply_func=ttest_ind,
                                       select_return_elm=1,
                                       axis=0)
        logit_prob_rc = apply_over_double(logit_preds["ref_rc"],
                                          logit_preds["alt_rc"],
                                          apply_func=ttest_ind,
                                          select_return_elm=1,
                                          axis=0)
    # fwd and rc are independent here... so this can be done differently here...

    sel = (np.abs(prob) > np.abs(prob_rc)).astype(
        np.int)  # Select the LOWER p-value among fwd and rc

    out_dict = {}

    out_dict["%s_pv" % prefix] = pd.DataFrame(overwite_by(prob, prob_rc, sel),
                                              columns=out_annotation)

    if logit_prob is not None:
        logit_sel = (np.abs(logit_prob) > np.abs(logit_prob_rc)).astype(np.int)
        out_dict["%s_logit_pv" % prefix] = pd.DataFrame(overwite_by(
            logit_prob, logit_prob_rc, logit_sel),
                                                        columns=out_annotation)

    pred_means = {}
    pred_vars = {}
    pred_cvar2 = {}
    for k in preds:
        pred_means[k] = np.mean(preds[k], axis=0)
        pred_vars[k] = np.var(preds[k], axis=0)
        pred_cvar2[k] = pred_vars[k] / (pred_means[k]**2)

    mean_cvar = np.sqrt((pred_cvar2["ref"] + pred_cvar2["alt"]) / 2)
    mean_cvar_rc = np.sqrt((pred_cvar2["ref_rc"] + pred_cvar2["alt_rc"]) / 2)

    mean_cvar = overwite_by(mean_cvar, mean_cvar_rc, sel)
    ref_mean = overwite_by(pred_means["ref"], pred_means["ref_rc"], sel)
    alt_mean = overwite_by(pred_means["alt"], pred_means["alt_rc"], sel)
    ref_var = overwite_by(pred_vars["ref"], pred_vars["ref_rc"], sel)
    alt_var = overwite_by(pred_vars["alt"], pred_vars["alt_rc"], sel)

    out_dict["%s_ref_mean" % prefix] = pd.DataFrame(ref_mean,
                                                    columns=out_annotation)
    out_dict["%s_alt_mean" % prefix] = pd.DataFrame(alt_mean,
                                                    columns=out_annotation)

    out_dict["%s_ref_var" % prefix] = pd.DataFrame(ref_var,
                                                   columns=out_annotation)
    out_dict["%s_alt_var" % prefix] = pd.DataFrame(alt_var,
                                                   columns=out_annotation)

    out_dict["%s_cvar" % prefix] = pd.DataFrame(mean_cvar,
                                                columns=out_annotation)

    out_dict["%s_diff" %
             prefix] = out_dict["%s_alt_mean" %
                                prefix] - out_dict["%s_ref_mean" % prefix]

    return out_dict
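# Stripped-down illustration of the idea behind dropout_pred (without the Keras-specific
# machinery above): run a stochastic forward pass repeatedly for the ref and alt inputs
# and compare the two prediction distributions with an independent t-test.
# `predict_stochastic` is a hypothetical stand-in for a model with dropout kept active.
import numpy as np
from scipy.stats import ttest_ind


def dropout_pvalue(predict_stochastic, ref, alt, dropout_iterations=30):
    ref_preds = np.array([predict_stochastic(ref) for _ in range(dropout_iterations)])
    alt_preds = np.array([predict_stochastic(alt) for _ in range(dropout_iterations)])
    return ttest_ind(ref_preds, alt_preds, axis=0).pvalue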
Example #30
def one_side_test(first, second):
    value, p = ttest_ind(first, second, equal_var=False)
    if value < 0:
        return 0.0
    else:
        return 1 - p / 2
Example #31
t_stat, df, cv, p = independent_ttest(data1, data2, alpha)
print('t=%.3f, df=%d, cv=%.3f, p=%.3f' % (t_stat, df, cv, p))
# interpret via critical value
if abs(t_stat) <= cv:
    print('Accept null hypothesis that the means are equal.')
else:
    print('Reject the null hypothesis that the means are equal.')
# interpret via p-value
if p > alpha:
    print('Accept null hypothesis that the means are equal.')
else:
    print('Reject the null hypothesis that the means are equal.')
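# A minimal sketch of what the independent_ttest(data1, data2, alpha) helper used above
# might look like (its definition is not shown here); an independent t-statistic with
# df = n1 + n2 - 2, a one-tailed critical value, and a two-tailed p-value are assumed,
# matching the four values unpacked above.
from math import sqrt

import numpy as np
from scipy.stats import sem, t


def independent_ttest(data1, data2, alpha):
    mean1, mean2 = np.mean(data1), np.mean(data2)
    se1, se2 = sem(data1), sem(data2)           # standard errors of the two means
    sed = sqrt(se1 ** 2.0 + se2 ** 2.0)         # standard error of the difference
    t_stat = (mean1 - mean2) / sed
    df = len(data1) + len(data2) - 2
    cv = t.ppf(1.0 - alpha, df)                 # critical value for the chosen alpha
    p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0    # two-tailed p-value
    return t_stat, df, cv, p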

# In[6]:

twosample_results = stats.ttest_ind(data1, data2)
twosample_results

# In[7]:

matrix_twosample = [['', 'Test Statistic', 'p-value'],
                    [
                        'Sample Data', twosample_results[0],
                        twosample_results[1]
                    ]]
matrix_twosample

# In[8]:

twosample_table = ff.create_table(matrix_twosample, index=True)
twosample_table
Example #32
    def compute(self, chromosome, start, end, additional=None):
        part_type, g_one, g_two, grouping = self.unpack_params(additional)

        expression_data = Gene_data(start,
                                    end,
                                    chromosome,
                                    measurements=self.gene_types)
        corr_list = []

        group_one, group_two = self.partion(part_type, g_one)
        exp_group_one = Gene_data(start,
                                  end,
                                  chromosome,
                                  measurements=group_one.to_dict('records'))

        group_one = [c for c in exp_group_one.columns if "_" in c]
        group_one = self.to_list_of_dict(group_one)

        if part_type is not None:
            exp_group_two = Gene_data(
                start,
                end,
                chromosome,
                measurements=group_two.to_dict('records'))
            group_two = [c for c in exp_group_two.columns if "_" in c]
            group_two = self.to_list_of_dict(group_two)
            group_pairs = [(x, y) for x in group_one for y in group_two]
        else:
            group_pairs = itertools.combinations(group_one, 2)

        # all combinations of gene expressions
        # TODO (Kyle?): simplify the above code
        group_pairs = itertools.combinations(group_one + group_two, 2)
        # pvalue_list = []
        #for data_source_one, data_source_two in itertools.combinations(self.gene_types, 2):
        for data_source_one, data_source_two in group_pairs:
            exp1 = data_source_one['id']
            exp2 = data_source_two['id']

            if exp1 in expression_data.columns and exp2 in expression_data.columns:

                col_one = expression_data[exp1]
                col_two = expression_data[exp2]

                correlation_coefficient = pearsonr(col_one, col_two)
                corr_obj = build_obj('correlation', 'expression', 'expression',
                                     True, data_source_one, data_source_two,
                                     correlation_coefficient[0],
                                     correlation_coefficient[1])
                corr_list.append(corr_obj)

                t_value, p_value = ttest_ind(col_one, col_two, equal_var=False)

        corr_list = sorted(corr_list, key=lambda x: x['value'], reverse=True)
        corr_res = pd.Series(corr_list)
        corr_res = corr_res.apply(pd.Series)

        parse_res = corr_res
        # corr_res = corr_res.to_json(orient='records')
        # parse_res = json.loads(corr_res)

        return parse_res
Example #33
def in_out_mask_ttest_from_glm_file(wpth,scale_path,contrast,scale,imsk,omsk,parcel_img,eff='ttest',membership=[],conndf=''):

    oldpth = pthswp(wpth)
    tdf = pandas.DataFrame(np.zeros((scale,4)), columns=['t','p','wt','wp'])

    if not membership:
        #determine which seeds are in which mask
        inseedz = []
        outseedz = []
        cde = codegen(6)
        print 'determining parcel membership...'

        for i in range(1,(scale+1)):
            os.system('fslmaths %s -thr %s -uthr %s %s'%(parcel_img,i,i,cde))
            os.system('fslmaths %s.nii.gz -mas %s %s1'%(cde,imsk,cde))
            os.system('fslmaths %s.nii.gz -mas %s %s2'%(cde,omsk,cde))
            ival = subprocess.check_output('fslstats %s1.nii.gz -V'%(cde),shell = True).rsplit()[0]
            oval = subprocess.check_output('fslstats %s2.nii.gz -V'%(cde),shell = True).rsplit()[0]
            # determine membership via winner-takes-all
            if int(ival) > int(oval):
                inseedz.append(i)
                print 'seed %s going inside mask'%(i)
            elif int(ival) < int(oval):
                outseedz.append(i)
                print 'seed %s going outside mask'%(i)
            else:
                print 'could not resolve seed %s. In vox = %s, out vox = %s. Excluding from analysis'%(i,ival,oval)
        os.system('rm %s*'%(cde))
    else:
        inseedz = membership[0]
        outseedz = membership[1]


    print 'preparing connectivity map...'

    if type(conndf) == pandas.core.frame.DataFrame:
        df = conndf
    else:
        df =jni.create_df_from_mat(scale_path,scale,pval=0.1,eff_tp=eff,mat_tp = 'glm')

    for i in range(scale):
        print 'calculating values for seed %s'%(i+1)
        ivalz = []
        ovalz = []
        indz = []
        for ind in df.index.tolist():
            for x in ind:
                if x == (i+1):
                    indz.append(ind)
        indz.remove(indz[i])
        for y in indz:
            if y[0] == (i+1):
                conn = y[1]
            else:
                conn = y[0]
            if conn in inseedz:
                ivalz.append(df.ix[y,eff])
            elif conn in outseedz:
                ovalz.append(df.ix[y,eff])
        invec = np.array(ivalz)
        outvec = np.array(ovalz)
        t,p = st.ttest_ind(invec,outvec)
        wt,wp = st.ttest_ind(invec,outvec,equal_var = False)
        tdf.ix[(i+1),'t'] = t 
        tdf.ix[(i+1),'p'] = p
        tdf.ix[(i+1),'wt'] = wt
        tdf.ix[(i+1),'wp'] = wp
        tdf.ix[(i+1),'gof'] = np.mean(invec) / np.mean(outvec)

    os.chdir(oldpth)

    return tdf,df,inseedz,outseedz
Example #34
###############################################
# Variance homogeneity
# H0 = Variance is homogeneous
# H1 = Variance is not homogeneous.

stats.levene(AB['T_Purchase'], AB['C_Purchase'])

# LeveneResult(statistic=2.6392694728747363, pvalue=0.10828588271874791)
# h0 isn't rejected.

######################################################
# Hypothesis Testing

test_statistics, pvalue = stats.ttest_ind(AB['T_Purchase'],
                                          AB['C_Purchase'],
                                          equal_var=True)
print('test statistics = %.4f, p-value = %.4f' % (test_statistics, pvalue))

#test statistics = 0.9416, p-value = 0.3493
# h0 isn't rejected.

#######################################
# # Create hypothesis (Earning)
#######################################

# Control and Test Purchase mean:
# C_Earning:    1908.5683
# T_Earning:   2514.890733

# H0: M1 = M2 There is no statistically significant difference between the maximum bidding and average bidding.
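# Sketch of the corresponding earnings test, following the same recipe as above.
# The column names 'T_Earning' and 'C_Earning' are taken from the comments and are
# assumed to exist in AB; equal_var should be set according to the Levene result.
stats.levene(AB['T_Earning'], AB['C_Earning'])
test_statistics, pvalue = stats.ttest_ind(AB['T_Earning'], AB['C_Earning'],
                                          equal_var=True)
print('test statistics = %.4f, p-value = %.4f' % (test_statistics, pvalue))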
Example #35
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])			
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		if len(relabels)!=len(xtickLabels):
			print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
			exit()

		xtickLabels=relabels
		
	

	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]


	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				pvalue=ttest_ind(plotData[x],plotData[y])[1]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				print >> stdout,pvalue, #Mann-Whitney p-value is doubled to convert the one-tailed result to two-tailed
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
		
	if len(titl)==0:
		titl=outputFile

	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	#ylim([0,200])

	savefig(outputFile,bbox_inches="tight")
Example #36
0
def get_difference(X, Y):
    return 1 - stats.ttest_ind(X, Y)[1]
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta,showViolin,showBox,firstColAnnot,plotTrend,showLegend,makePzfxFile,makeBinMatrix,writeDataSummaryStat,summaryStatRange,minuslog10pvalue,minNDataToKeep,vfacecolor,valpha,outXYZPvalues,dividePlots):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif


	
	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	trendData={}
	annot={}
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)
		
		if firstColAnnot:
			colAnnot=cols[0]
			cols=cols[1:]
			annotThisFile=[]
			annot[startIdx]=annotThisFile
		else:
			colAnnot=-1
			annotThisFile=None
			
		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))
		
		if plotTrend:
			#print >> stderr,"plotTrend"
			trendDataThisFile=[]
			trendData[startIdx]=trendDataThisFile
		else:
			trendDataThisFile=None
			
			
		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
			
			if plotTrend:
				#print >> stderr,"a"
				trendDataThisLine=[]
			else:
				trendDataThisLine=None
			
			allDataOKThisLine=True
			
			if colAnnot>=0:
				annotThisFile.append(fields[colAnnot])
			
			for idx,col in zip(colIndices,cols):
				try:
					value=float(fields[col])
					if logb!=0:
						if value==0.0:
							raise ValueError
						value=log(value)/logb							
					plotData[idx].append(value)
					
					if plotTrend:
						trendDataThisLine.append(value)
						#print >> stderr,"value:",value
					
				except:
					allDataOKThisLine=False	
				
			if plotTrend:
				if allDataOKThisLine:
					trendDataThisFile.append(trendDataThisLine)
				else:
					trendDataThisFile.append(None)
			
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()
		print >> stderr,xtickLabels
		print >> stderr,relabels
		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])<minNDataToKeep:
			print >> stderr,xtickLabels[c],"discarded because has only",len(plotData[c]),"data points <",minNDataToKeep
			del plotData[c]
			del xtickLabels[c]

	if not skipStat: # skipStat is expected to be defined in the enclosing scope; it is not among this function's parameters
		print >> stdout,"student t-test (1 sample; mean=0)"
		print >> stdout,"sample","mean","p-val","median"
	
		if writeDataSummaryStat:
			fDSS=open(writeDataSummaryStat,"w")
			print >> fDSS,"sample\tmean\tvar\tsd\tmin\tmax\tN\tNInRange["+str(summaryStatRange[0])+","+str(summaryStatRange[1])+"]\t%NInRange\tNbelowRange\t%Nbelow\tNAboveRange\t%NAbove"
			
		for x in range(0,len(plotData)):
			#print >> stderr, len(plotData[x])
			try:
				print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1],median(plotData[x])
			except:
				print >> stdout, xtickLabels[x],mean(plotData[x]),"NA",median(plotData[x])
			
			if writeDataSummaryStat:
				sumData,N,NIN,NBelow,NAbove=filterDataInRangeInclusive(plotData[x],summaryStatRange[0],summaryStatRange[1])
				
				if NIN>1:
					#print >> stderr,"sumData=",sumData
					#print >> stderr,mean
					mea=mean2(sumData)
					DDOF=1
					sd=std(sumData,ddof=DDOF)
					var=sd*sd
					mi=min(sumData)
					ma=max(sumData)
				else:
					mea="NA"
					sd="NA"
					var="NA"
					mi="NA"
					ma="NA"
				
			
					
				print >> fDSS,xtickLabels[x]+"\t"+str(mea)+"\t"+str(var)+"\t"+str(sd)+"\t"+str(mi)+"\t"+str(ma)+"\t"+str(N)+"\t"+str(NIN)+"\t"+str(float(NIN)*100/N)+"\t"+str(NBelow)+"\t"+str(float(NBelow)*100/N)+"\t"+str(NAbove)+"\t"+str(float(NAbove)*100/N)
			
	
		pvalueM=[]
		
		if writeDataSummaryStat:
			fDSS.close()
		
		print >> stdout,""
		
		print >> stdout,"student t-test (2 samples)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
	
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					try:
						pvalue=ttest_ind(plotData[x],plotData[y])[1]
					except:
						pvalue=1.0
					
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";	
	
		
		print >> stdout,""
	
		
	
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)
	
	
			
		pvalueM=[]
	
		print >> stdout,"welch t-test"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
						
				else:
					try:
						pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
					
					print >> stdout, str(pvalue),
					pvalueRow.append(pvalue)
			print >> stdout,"";
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_Welch.xyz",xtickLabels,pvalueM)
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)
	
		
		print >> stdout,""
		print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
		print >> stdout,"p-val",
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
					except:
						pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue, #Mann-Whitney p-value is doubled to convert the one-tailed result to two-tailed
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
	
		if outXYZPvalues:
			writeXYZPvalues(outXYZPvalues+"_U.xyz",xtickLabels,pvalueM)
		
	
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
		
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Ansari-Bradley Two-sample Test for difference in scale parameters " 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=ansari(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
						#pvalue=1.0
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_Ansari_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_Ansari",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Fligner's Two-sample Test for equal variance (non-parametrics)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=fligner(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_fligner_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_fligner",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Levene's Two-sample Test for equal variance" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=levene(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
						
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_levene_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_levene",xtickLabels,pvalueM,methodCluster)	
		
		
		#####
	
		#####now the variance tests
		
		print >> stdout,""
		print >> stdout,"Bartlett's Two-sample Test for equal variance (for normal distributions)" 
		print >> stdout,"p-val",
		
		
		for x in range(0,len(plotData)):
			print >> stdout,xtickLabels[x],
		
	
		pvalueM=[]
	
		print >> stdout,""
		for x in range(0,len(plotData)):
			pvalueRow=[]
			pvalueM.append(pvalueRow)
			print >> stdout, xtickLabels[x],
			for y in range(0,len(plotData)):
				if y<=x:
					print >> stdout, "",
					if x==y:
						if minuslog10pvalue:
							pvalueRow.append(0.0)
						else:
							pvalueRow.append(1.0)
					else:
						pvalueRow.append(pvalueM[y][x])
				else:
					#if max(len(plotData[x]),len(plotData[y]))<=20:
					try:
						pvalue=bartlett(plotData[x],plotData[y])[1]		
					except:
						pvalue="NA"
						#pvalue=1.0
	
					if minuslog10pvalue and str(pvalue)!="NA":
						try:
							pvalue=-1*log(pvalue,10)
						except:
							pvalue=-1000.0
	
	
					print >> stdout,pvalue,
					pvalueRow.append(pvalue)
					#else:
					#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
			print >> stdout,"";	
		
		if plotPvalueCluster:
			makePValueRawPlot(outputClusterPrefix+"_bartlett_raw",xtickLabels,pvalueM)
			makePValueClusterPlot(outputClusterPrefix+"_bartlett",xtickLabels,pvalueM,methodCluster)	
		
		
		#####

	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes,showViolin,showBox,annot,trendData,showLegend,makePzfxFile,makeBinMatrix,dividePlots)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
		drawDensigram(plotHistogramToFile+".density.png",plotData,xtickLabels)
Example #38
0
# Set random seed to get the same result or remove for different each time
np.random.seed(123)

# Initialize effect_size, sample_size, control_mean, control_sd
effect_size, sample_size, control_mean, control_sd = 0.1, 50, 1, 0.5

sims = 1000

'''
INSTRUCTIONS

*   For the time spent random variables, set the size such that it has shape sample_size × sims.
*   Calculate power as a fraction of p-values less than 0.05 (statistically significant).
*   If power is greater than or equal to 80%, break out of the while loop. Else, keep incrementing sample_size by 10.
'''

sample_size = 50  # starting sample size for the search

# Keep incrementing the sample size by 10 until we reach the required power
while True:
    control_time_spent = np.random.normal(loc=control_mean, scale=control_sd, size=(sample_size, sims))
    treatment_time_spent = np.random.normal(loc=control_mean*(1+effect_size), scale=control_sd, size=(sample_size, sims))
    t, p = st.ttest_ind(treatment_time_spent, control_time_spent)
    
    # Power is the fraction of times in the simulation when the p-value was less than 0.05
    power = (p < 0.05).sum()/sims
    if power >= 0.8: 
        break
    else: 
        sample_size += 10
print("For 80% power, sample size required = {}".format(sample_size))
Example #39
0
def plotExpBox_Main(inputFiles,headers,valcols,outputFile,sep,startRow,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,plotPvalueCluster,outputClusterPrefix,methodCluster,xlegendrotation,xlabe,ylabe,figsz,titl,showSampleSizes,trimToMinSize,relabels,logb,plotHistogramToFile,plotMedianForGroups,botta):

	#if plotPvalueCluster:
		#if pvalue cluster is needed:
	#	from Bio.Cluster.cluster import *
	#	from Bio.Cluster import *
		#endif



	#the real deal!
	plotData=[]	
	xtickLabels=[]
	
	
	minSize=-1

	for inputFile,header,cols in zip(inputFiles,headers,valcols):
		fin=generic_istream(inputFile)
		
		startIdx=len(plotData)

		for col in cols:
			plotData.append([])
			xtickLabels.append(header[col])

		colIndices=range(startIdx,startIdx+len(cols))

		lino=0
		for lin in fin:
			lino+=1
			if lino<startRow:
				continue		
			fields=lin.rstrip("\r\n").split(sep)
		
			for idx,col in zip(colIndices,cols):
				try:
					
					value=float(fields[col])
					if logb!=0:
						value=log(value)/logb	
						if value<-100000:
							raise ValueError						
					plotData[idx].append(value)
					
				except:
					pass		
		fin.close()
	
		
		if minSize==-1:
			minSize=len(plotData[idx]) #or startIDX?
		else:
			minSize=min([minSize,len(plotData[idx])])
		

	if trimToMinSize:
		print >> stderr,"trimming to min size =",minSize
		trimData(plotData,minSize)

	if len(relabels)>0:
		#if len(relabels)!=len(xtickLabels):
		#	print >> stderr,"relabels doesn't have the same length as original label vectors",xtickLabels,"=>",relabels
		#	exit()

		for i,relabel in zip(range(0,len(relabels)),relabels):
			xtickLabels[i]=relabel
		
	
	for i in range(0,len(plotMedianForGroups)):
		plotMedianForGroups[i]=getCol0ListFromCol1ListStringAdv(xtickLabels,plotMedianForGroups[i])
			
	
	#drawing medians:
	medianToDraw=[]
	for mediangrouper in plotMedianForGroups:
		curD=[]		
		for c in mediangrouper:
			curD.extend(plotData[c])
		medianToDraw.append(median(curD))


	for c in range(len(plotData)-1,-1,-1):
		if len(plotData[c])==0:
			print >> stderr,xtickLabels[c],"discarded"
			del plotData[c]
			del xtickLabels[c]


	print >> stdout,"student t-test (1 sample; mean=0)"
	print >> stdout,"sample","mean","p-val"


	for x in range(0,len(plotData)):
		#print >> stderr, len(plotData[x])
		try:
			print >> stdout, xtickLabels[x],mean(plotData[x]),ttest_1samp(plotData[x],0)[1]
		except:
			print >> stdout, xtickLabels[x],"NA","NA"

	pvalueM=[]
	
	print >> stdout,""
	
	print >> stdout,"student t-test (2 samples)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""

	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				try:
					pvalue=ttest_ind(plotData[x],plotData[y])[1]
				except:
					pvalue=1.0

				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";	

	
	print >> stdout,""

	


	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_t_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_t",xtickLabels,pvalueM,methodCluster)

	pvalueM=[]

	print >> stdout,"welch t-test"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	
	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
					
			else:
				pvalue=welchs_approximate_ttest_arr(plotData[x],plotData[y])[3]
				print >> stdout, str(pvalue),
				pvalueRow.append(pvalue)
		print >> stdout,"";

	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_Welch_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_Welch",xtickLabels,pvalueM,methodCluster)

	
	print >> stdout,""
	print >> stdout,"non-parametric (Mann-Whitney U)" #"non-parametric (Mann-Whitney U if larger n<=20 else Wilcoxon)"
	print >> stdout,"p-val",
	for x in range(0,len(plotData)):
		print >> stdout,xtickLabels[x],
	

	pvalueM=[]

	print >> stdout,""
	for x in range(0,len(plotData)):
		pvalueRow=[]
		pvalueM.append(pvalueRow)
		print >> stdout, xtickLabels[x],
		for y in range(0,len(plotData)):
			if y<=x:
				print >> stdout, "",
				if x==y:
					pvalueRow.append(1.0)
				else:
					pvalueRow.append(pvalueM[y][x])
			else:
				#if max(len(plotData[x]),len(plotData[y]))<=20:
				try:
					pvalue=mannwhitneyu(plotData[x],plotData[y])[1]*2				
				except:
					pvalue=1.0
				print >> stdout,pvalue, #Mann-Whitney p-value is doubled to convert the one-tailed result to two-tailed
				pvalueRow.append(pvalue)
				#else:
				#	print >>  stdout,wilcoxon(plotData[x],plotData[y])[1], # this is two-tailed already stdout, "", #
		print >> stdout,"";	
	
	

	if plotPvalueCluster:
		makePValueRawPlot(outputClusterPrefix+"_U_raw",xtickLabels,pvalueM)
		makePValueClusterPlot(outputClusterPrefix+"_U",xtickLabels,pvalueM,methodCluster)
	
	figure(figsize=figsz)
	subplots_adjust(top=0.9, bottom=botta, left=0.2, right=0.8)
	
	if len(titl)==0:
		titl=outputFile


	plotExpBox(plotData,xtickLabels,showIndPoints,mark,markMean,showMean,notch,whisker,outliers,xlegendrotation,xlabe,ylabe,titl,showSampleSizes)
	
	#ylim([0,200])
	for m in medianToDraw:
		axhline(y=m,linestyle=':',color='gray')

	savefig(outputFile,bbox_inches="tight")

	if len(plotHistogramToFile)>0:
		drawHistogram(plotHistogramToFile,plotData,xtickLabels)
Example #40
0
# - Gene symbol map
proteomics = proteomics[[
    i.split(';')[0] in umap for i in proteomics['uniprot']
]]
proteomics['genesymbol'] = [
    umap[i.split(';')[0]] for i in proteomics['uniprot']
]

# - Log fold-change
proteomics = proteomics.groupby('genesymbol').mean()

# - Differential protein abundance
de_proteomics = {}
for i in proteomics.index:
    t, p = ttest_ind(proteomics.ix[i, ko], proteomics.ix[i, wt])
    de_proteomics[i] = {
        'fc': proteomics.ix[i, ko].mean() - proteomics.ix[i, wt].mean(),
        't': t,
        'pval': p
    }
de_proteomics = DataFrame(de_proteomics).T.dropna()

# - FDR correction
de_proteomics['fdr'] = multipletests(de_proteomics['pval'], method='fdr_bh')[1]

# - Export protein level proteomics
de_proteomics.to_csv('./data/uok262_proteomics_labelfree_processed_fc.csv')
# de_proteomics = read_csv('./data/uok262_proteomics_labelfree_processed_fc.csv', index_col=0)
print de_proteomics.sort_values('fdr')
Example #41
0
    ).reset_index(name='Average value (Treatment villages)'),
             progresa_df_edited[progresa_df_edited.progresa == '0'].mean().
             reset_index(name='Average value (Control villages)'),
             on=['index']))

#Creating empty lists to collect the t value, p value and statistical significance
t_value = []
p_value = []
stats_significant = []

#Iterating over the df to calculate the t value, p value and statistical significance
#(the test is run once per column and its statistic and p-value reused)
for i in list(progresa_treatment_control['index']):
    result = stats.ttest_ind(
        progresa_df_edited[progresa_df_edited.progresa == 'basal'][i],
        progresa_df_edited[progresa_df_edited.progresa == '0'][i],
        nan_policy='omit')
    t_value.append(result.statistic)
    p_value.append(result.pvalue)
    if result.pvalue < 0.05:
        stats_significant.append('TRUE')
    else:
        stats_significant.append('FALSE')
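
# A possible next step (sketch, not from the original snippet): attach the collected
# statistics to the comparison table built above; the new column names are illustrative.
progresa_treatment_control['t_value'] = t_value
progresa_treatment_control['p_value'] = p_value
progresa_treatment_control['statistically_significant'] = stats_significant
print(progresa_treatment_control)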

print()
print("------------------------------------------")
print("------ STATISTICS FOR MOM HEURISTIC ------")
print("------------------------------------------")
print()

difficulty = ["Simple", "Easy", "Intermediate", "Expert"]
# statistics
for d in range(4):
    print()
    print("Testing for difficulty: " + difficulty[d])

    print('T-test for baseline vs. naked-pairs:')
    x = stats.ttest_ind(normal_splits[d], pairs_splits[d])
    print(x)

    print('T-test for baseline vs. naked-triples:')
    x = stats.ttest_ind(normal_splits[d], triple_splits[d])
    print(x)

    print('T-test for baseline vs. x-wing:')
    x = stats.ttest_ind(normal_splits[d], x_splits[d])
    print(x)

    print('T-test for baseline vs. all:')
    x = stats.ttest_ind(normal_splits[d], all_splits[d])
    print(x)