Example #1
def calcPairwisePvalues(tdf, factcol, groupcol=None, onetailed=False):
    pvals = []
    if groupcol is not None:
        for grp in tdf[groupcol].unique():
            lof = tdf[(tdf.Vclass == "LoF") & (tdf[groupcol] == grp)][factcol].tolist()
            rand = tdf[(tdf.Vclass == "Random") & (tdf[groupcol] == grp)][factcol].tolist()
            maxY = max(lof + rand)
            ts, pv = ttest_ind(lof, rand)
            # one-tailed: significant when p/2 < alpha and t < 0
            # if onetailed: print("Todo")
            pvals.append(Series(data=[grp, pv, ts, "-2", maxY],
                                index=[groupcol, "pvalue", "T-statistic", "x", "y"]))
    else:
        lof = tdf[(tdf.Vclass == "LoF")][factcol].tolist()
        rand = tdf[(tdf.Vclass == "Random")][factcol].tolist()
        maxY = max(lof + rand)
        ts, pv = ttest_ind(lof, rand)
        # one-tailed: significant when p/2 < alpha and t < 0
        # if onetailed: print("Todo")
        pvals.append(Series(data=[pv, ts, "-2", maxY],
                            index=["pvalue", "T-statistic", "x", "y"]))
    pvals = DataFrame(pvals)
    pvals["Pvalue"] = ["P-value: %.3g\nT-statistic: %.2f" %(x,y) 
                       for x,y in pvals[["pvalue","T-statistic"]].values]

    return pvals
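
The commented-out onetailed branch above is left as a TODO. One way to derive a one-tailed p-value from scipy's two-tailed ttest_ind result is to halve p and check the sign of t, as the comment hints; a minimal sketch under that assumption (testing mean(lof) < mean(rand)):

from scipy.stats import ttest_ind

def one_tailed_pvalue(sample_a, sample_b):
    # scipy returns a two-tailed p; fold it onto the "a < b" tail:
    # p/2 when t is negative, 1 - p/2 otherwise.
    t, p = ttest_ind(sample_a, sample_b)
    return p / 2 if t < 0 else 1 - p / 2
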
Example #2
import numpy as np
from scipy.stats import ttest_ind

def linreg2_err(t, x, wleft=5, wright=5, hop=None, use_l=True, use_r=True):
    if hop is None:
        hop = 1

    zs = np.zeros(len(x))
    wm = 0 if wright < 0 else wright
    for ii in range(wleft,len(x)-wm,hop):
        ts=[]
        xl = x[ii-wleft:ii]
        tl = t[ii-wleft:ii]
        xr = x[ii:ii+wright]
        tr = t[ii:ii+wright]

        if use_l>0:
            pl = np.polyfit(tl,xl,1)
            residll = xl-np.polyval(pl,tl)
            stdll = np.std(residll)
            residlr = xr-np.polyval(pl,tr)
            stdlr = np.std(residlr)
            ttl,pvl = ttest_ind(residll,residlr)
            ts.append(ttl)

        if use_r>0:
            pr = np.polyfit(tr,xr,1)
            residrr = xr-np.polyval(pr,tr)
            stdrr = np.std(residrr)
            residrl = xl-np.polyval(pr,tl)
            stdrl = np.std(residrl)
            ttr,pvr = ttest_ind(residrr,residrl)
            ts.append(-ttr)
        zs[ii] = np.mean(ts)
    return zs
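
A hypothetical usage sketch: linreg2_err slides left/right regression windows along the series and t-tests each window's residuals against the other side's fit, so |zs| peaks near a slope break. The synthetic piecewise-linear data below is an illustrative assumption:

np.random.seed(0)
t = np.arange(200, dtype=float)
x = np.where(t < 100, 0.5 * t, 50.0 + 2.0 * (t - 100)) + np.random.randn(200)

zs = linreg2_err(t, x, wleft=20, wright=20)
print(np.argmax(np.abs(zs)))  # index near the breakpoint at t = 100
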
def results_plots(name, min_n=2000):
    
    frame = pd.read_csv(os.path.join(DATA_FOLDER, "storks_exp_rep", "out", name), index_col=0)
    frame = frame[frame.N >= min_n]

    # 0) Sig tests
    behav_semi_sup = frame.AF_factor
    juv_idx = frame.status == 'Juv'
    adult_idx = frame.status == 'Adult'
    print("semi: ", np.array([1, .5])*ttest_ind(behav_semi_sup[juv_idx], behav_semi_sup[adult_idx])) # [1, .5]*[t, p]
    behav_sup = frame.AF_true_frac
    print("sup: ", np.array([1, .5])*ttest_ind(behav_sup[juv_idx], behav_sup[adult_idx]))

    # 1) scatter for AF_true_frac & AF_factor 
    frame.plot(kind='scatter', x='AF_true_frac', y='AF_factor')
    plt.show()

    # 2) compare Juv/Adults with AF_factor
    m = frame.groupby(frame['status'])['AF_factor'].mean()
    s = frame.groupby(frame['status'])['AF_factor'].sem()
    m.plot(kind='bar', yerr=s)
    plt.show() 

    # 3) compare Juv/Adults with AF_true_frac
    m = frame.groupby(frame['status'])['AF_true_frac'].mean()
    s = frame.groupby(frame['status'])['AF_true_frac'].sem()
    m.plot(kind='bar', yerr=s)
    plt.show()
Example #4
def calc_ttest(data, exp_set, control_set, tags=()):
    d = [st.ttest_ind(data.loc[probeset, list(exp_set.filenames)],
                      data.loc[probeset, list(control_set.filenames)],
                      equal_var=False)
         for probeset in data.index]
    rs = pandas.DataFrame(index=data.index, data=d,
                          columns=[tm.e(tags + (("st", "t"), ("tt", "welch ttest"))),
                                   tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))])
    nominal_welch = tm.e(tags + (("st", "pval"), ("tt", "welch ttest"), ("mc", "nominal")))
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bonf")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(rs.loc[:, nominal_welch], method="bonferroni")[1]
    rs[tm.e(tags + (("tt", "welch ttest"), ("st", "pval"), ("mc", "bh")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(rs.loc[:, nominal_welch], method="fdr_bh")[1]

    d = [st.ttest_ind(data.loc[probeset, list(exp_set.filenames)],
                      data.loc[probeset, list(control_set.filenames)],
                      equal_var=True)
         for probeset in data.index]

    rs[tm.e(tags + (("st", "t"), ("tt", "student ttest")))] = [v[0] for v in d]
    nominal_student = tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "nominal")))
    rs[nominal_student] = [v[1] for v in d]

    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bonf")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(rs.loc[:, nominal_student], method="bonferroni")[1]
    rs[tm.e(tags + (("st", "pval"), ("tt", "student ttest"), ("mc", "bh")))] = \
        statsmodels.sandbox.stats.multicomp.multipletests(rs.loc[:, nominal_student], method="fdr_bh")[1]

    # do diagnostic tests for heteroskedasticity
    d = [st.levene(data.loc[probeset, list(exp_set.filenames)],
                   data.loc[probeset, list(control_set.filenames)])
         for probeset in data.index]
    rs[tm.e(tags + (("tt", "levene"), ("st", "pval")))] = [z[1] for z in d]

    # omnibus test for normality
#    d = [st.normaltest( data.ix[probeset, list(exp_set.filenames)]) for probeset in data.index ]
#    rs[ tm.e( tags + (("tt", "d-p omnibus"), ("st", "pval"), ("cg", "exp") ))] = [z[1] for z in d]

#    d = [st.normaltest( data.ix[probeset, list(control_set.filenames)]) for probeset in data.index ]
#    rs[ tm.e( tags + (("tt", "d-p omnibus"), ("st", "pval"), ("cg", "ctrl") ))] = [z[1] for z in d]

    return rs
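
The same nominal / Bonferroni / Benjamini-Hochberg pattern works without the sandbox module via statsmodels.stats.multitest; a minimal sketch with illustrative, randomly generated data (the group sizes and effect sizes are assumptions):

import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

rng = np.random.default_rng(42)
exp = rng.normal(0.0, 1.0, (100, 5))    # 100 probes x 5 samples per group
ctrl = rng.normal(0.0, 1.0, (100, 5))
exp[:10] += 2.0                         # 10 probes with a true shift

pvals = stats.ttest_ind(exp, ctrl, axis=1, equal_var=False).pvalue
bonf = multipletests(pvals, method="bonferroni")[1]
bh = multipletests(pvals, method="fdr_bh")[1]
print((bonf < 0.05).sum(), (bh < 0.05).sum())  # BH typically keeps more discoveries
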
def plotHist():
    TO, wmeanto= makeGaussian(20.74,1115,500,plot=False,dir="/Users/george/Dropbox/Astronomy/Oculus/25Oct2016/IMG00069.FIT")[3:]
    SM, wmeansm= makeGaussian(20.64,710,885,plot=False,dir="/Users/george/Dropbox/Astronomy/Oculus/25Oct2016/IMG00074.FIT")[3:]
    dif = np.median(20.74-2.5*np.log10(TO/wmeanto))-np.median(20.64-2.5*np.log10(SM/wmeansm))
    pval= stats.ttest_ind(SM, TO)[1]
    plt.ion()
    plt.clf()
    plt.figure(1)
    data = np.vstack([TO,SM]).T
    plt.xlim(2000,4000)
    plt.ylim(0,2000)
    #plt.hist(TO, bins=1000,label='Thacher Observatory',alpha=0.5,color='r')
    #plt.hist(SM, bins=1000,label='Sulfur Mountain',alpha=0.5,color='b')
    plt.axvline(x=rb.mean(TO), color ='red', linewidth = 2)
    plt.axvline(x=rb.mean(SM), color = 'red', linewidth = 2)
    plt.annotate(r'$dif$=%.2f mags/arcsec'u'\u00B2' %dif, [.01,.93], horizontalalignment='left', xycoords='axes fraction', fontsize='large', backgroundcolor='white')
    plt.annotate(r'$\bar{{\sigma}_T}_O$=%.2f flux/px' %rb.mean(TO), [.01,0.86], horizontalalignment='left', xycoords='axes fraction', fontsize="large", color='midnightblue')
    plt.annotate(r'$\bar{{\sigma}_S}_M$=%.2f flux/px'%rb.mean(SM), [0.01,0.79], horizontalalignment='left', xycoords='axes fraction', fontsize="large", color='darkgreen')
    plt.annotate(r'$p-val$=%.2E' %pval, [.01,.72], horizontalalignment='left', xycoords='axes fraction', fontsize='large')
    plt.hist(data, bins=1000,label=['Thacher Observatory (TO)','Sulphur Mountain (SM)'],alpha=0.5, width=40)
    plt.title("Sky brightness")
    plt.xlabel("Flux Value")
    plt.ylabel("Frequency")
    plt.legend(loc='upper right')
    plt.show()
    inds, = np.where(dif <= 0)
    pcent = len(inds)
    ttest = stats.ttest_ind(TO, SM)
    # returns: T-statistic ((estimate - hypothesized value) / standard error)
    # and p-value (probability of the observed result if the null hypothesis is true)
    print(dif, pcent, ttest)
Example #6
def runTTest(col, df):
    print("t-Test for %s" % col)
    d1 = df[df[col] == True]["rating"]
    d2 = df[df[col] != True]["rating"]
    print("Number of Samples: (%d, %d)" % (d1.shape[0], d2.shape[0]))
    print("Means: (%f, %f)" % (d1.mean(), d2.mean()))
    print(stats.ttest_ind(d1, d2))
def ttest_PTC_divergence_different(divergence_file, PTC_file, blast_file):
    '''
    (file, file, file) -> None
    Performs a t-test to compare the mean divergence between PTC and non-PTC
    genes, assuming unequal variance, and prints the results to screen
    '''

    divergence_data = PTC_divergence(divergence_file, PTC_file, blast_file)
    divergence = divergence_data[0]
    zero_dS = divergence_data[1]
    omega_greater_1 = divergence_data[2]

    from scipy import stats

    # make a list of divergence values for PTC and non-PTC genes
    dN_PTC = []
    dN_non_PTC = []

    dS_PTC = []
    dS_non_PTC = []

    omega_PTC = []
    omega_non_PTC = []

    for gene in divergence:
        if divergence[gene][-1] == 'yes':
            dN_PTC.append(divergence[gene][0])
            dS_PTC.append(divergence[gene][1])
            omega_PTC.append(divergence[gene][2])
        else:
            dN_non_PTC.append(divergence[gene][0])
            dS_non_PTC.append(divergence[gene][1])
            omega_non_PTC.append(divergence[gene][2])
              
    dN = stats.ttest_ind(dN_PTC, dN_non_PTC, equal_var = False)
    dS = stats.ttest_ind(dS_PTC, dS_non_PTC, equal_var = False)
    omega = stats.ttest_ind(omega_PTC, omega_non_PTC, equal_var = False)

    dN_ttest = (abs(round(float(dN[0]), 4)), float(dN[1]))
    dS_ttest = (abs(round(float(dS[0]), 4)), float(dS[1]))
    omega_ttest = (abs(round(float(omega[0]), 4)), float(omega[1]))

    # compute mean and standard error
    dN_stats_PTC = compute_mean_std_error(dN_PTC)
    dN_stats_non_PTC = compute_mean_std_error(dN_non_PTC)
    dS_stats_PTC = compute_mean_std_error(dS_PTC)
    dS_stats_non_PTC = compute_mean_std_error(dS_non_PTC)
    omega_stats_PTC = compute_mean_std_error(omega_PTC)
    omega_stats_non_PTC = compute_mean_std_error(omega_non_PTC)

    print('dN: t-test =\t {0},\t p-value =\t {1}'.format(dN_ttest[0], dN_ttest[1]))
    print('dS: t-test =\t {0},\t p-value =\t {1}'.format(dS_ttest[0], dS_ttest[1]))
    print('dN/dS: t-test =\t {0},\t p-value =\t {1}'.format(omega_ttest[0], omega_ttest[1]))
    print('PTC: mean dN =\t %6.4f,\t standard error =\t %6.4f' % dN_stats_PTC)
    print('nonPTC: mean dN =\t %6.4f,\t standard error =\t %6.4f' % dN_stats_non_PTC)
    print('PTC: mean dS =\t %6.4f,\t standard error =\t %6.4f' % dS_stats_PTC)
    print('nonPTC: mean dS =\t %6.4f,\t standard error =\t %6.4f' % dS_stats_non_PTC)
    print('PTC: mean dN/dS =\t %6.4f,\t standard error =\t %6.4f' % omega_stats_PTC)
    print('nonPTC: mean dN/dS =\t %6.4f,\t standard error =\t %6.4f' % omega_stats_non_PTC)
    print('{0}\t genes with dS = 0 were excluded'.format(zero_dS))
    print('{0}\t genes with dN/dS > 1 were excluded'.format(omega_greater_1))
    def runCompare(self, objId, labelToAdd, expression1, expression2):
        fh = open(self._getPath("report.txt"),'w')

        self.experiment = self.readExperiment(self.inputExperiment.get().fnPKPD)
        x1 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression1.get(),self.labelToCompare.get())]
        x2 = [float(x) for x in self.experiment.getSubGroupLabels(self.expression2.get(),self.labelToCompare.get())]
        self.doublePrint(fh,"Values in SubGroup 1: %s"%str(x1))
        self.doublePrint(fh,"Values in SubGroup 2: %s"%str(x2))
        self.doublePrint(fh,"Testing H0: mu1=mu2")
        self.doublePrint(fh," ")

        try:
            [t,pval] = stats.ttest_ind(np.asarray(x1,np.double),np.asarray(x2,np.double),equal_var=True)
            self.doublePrint(fh,"T-test two independent samples (same variance): t-statistic=%f p-value=%f"%(t,pval))
        except:
            pass

        try:
            [t,pval] = stats.ttest_ind(x1,x2, equal_var=False)
            self.doublePrint(fh,"T-test two independent samples (different variance, Welch's test): t-statistic=%f p-value=%f"%(t,pval))
        except:
            pass

        try:
            [u,pval] = stats.mannwhitneyu(x1, x2, use_continuity=True)
            self.doublePrint(fh,"Mann-Whitney U test for two independent samples: u-statistic=%f p-value=%f"%(u,pval))
        except:
            pass

        fh.close()
def pandas_boxplot():
    '''Example from Altman "Practical statistics for medical research"'''

    # Get the data
    inFile = 'altman_94.txt'
    url_base = 'https://raw.github.com/thomas-haslwanter/statsintro/master/Data/data_altman/'
    url = url_base + inFile
    data = np.genfromtxt(urlopen(url), delimiter=',')

    # Group them into "lean" and "obese"
    lean = pd.Series(data[data[:,1]==1,0])
    obese = pd.Series(data[data[:,1]==0,0])

    # Combine them into a pandas DataFrame
    df = pd.DataFrame({'lean':lean, 'obese':obese})

    # Calculate the mean value, for each group
    print(df.mean())

    # Show a boxplot
    df.boxplot()
    plt.show()
    
    # Perform a t-test between "lean" and "obese" subjects and show the result
    print(stats.ttest_ind(lean, obese))
Example #10
def check_distance_pvals(dist_dict, group_dict, group_frac=0.5, nreps=500):

    groups = sorted(set(group_dict.values()))
    assert len(groups) == 2

    group_vals = defaultdict(list)

    for (key1, key2), dist in dist_dict.items():
        if group_dict[key1] == group_dict[key2]:
            group_vals[group_dict[key1]].append(dist)

    assert len(group_vals) == 2
    _, raw_pval = ttest_ind(*group_vals.values())

    nitems = int(group_frac*min(map(len, group_vals.values())))
    cor_vals = []
    for _ in range(nreps):
        for items in group_vals.values():
            shuffle(items)
        _, pval = ttest_ind(*[items[:nitems] for items in group_vals.values()])
        cor_vals.append(pval)

    odict = {
        'RawPval': raw_pval,
        'AdjPval': np.mean(cor_vals),
        'Group1Name': groups[0],
        'Group2Name': groups[1],
        'Group1Mean': np.mean(group_vals[groups[0]]),
        'Group2Mean': np.mean(group_vals[groups[1]]),
        'Group1Std': np.std(group_vals[groups[0]]),
        'Group2Std': np.std(group_vals[groups[1]])
    }

    return odict
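
A hypothetical usage sketch: keys of dist_dict are item-ID pairs, group_dict maps each ID to one of two groups, and only within-group distances are kept. The toy data below is an illustrative assumption (it assumes defaultdict, shuffle, ttest_ind, and numpy are in scope, as the function above requires):

from itertools import combinations
from random import gauss

ids = ['a%d' % i for i in range(10)] + ['b%d' % i for i in range(10)]
group_dict = {key: key[0] for key in ids}                  # group 'a' vs group 'b'
dist_dict = {(k1, k2): gauss(1.0 if k1[0] == 'a' else 2.0, 0.3)
             for k1, k2 in combinations(ids, 2)}           # cross-group pairs get discarded anyway

res = check_distance_pvals(dist_dict, group_dict, group_frac=0.5, nreps=100)
print(res['RawPval'], res['AdjPval'])
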
Example #11
    def test_vs_nonmasked(self):
        np.random.seed(1234567)
        outcome = np.random.randn(20, 4) + [0, 0, 1, 2]

        # 1-D inputs
        res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1])
        res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1])
        assert_allclose(res1, res2)

        # 2-D inputs
        res1 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
        res2 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], axis=None)
        assert_allclose(res1, res2)
        res1 = stats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
        res2 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:], axis=0)
        assert_allclose(res1, res2)

        # Check default is axis=0
        res3 = mstats.ttest_ind(outcome[:, :2], outcome[:, 2:])
        assert_allclose(res2, res3)

        # Check equal_var
        res4 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=True)
        res5 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=True)
        assert_allclose(res4, res5)
        res4 = stats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=False)
        res5 = mstats.ttest_ind(outcome[:, 0], outcome[:, 1], equal_var=False)
        assert_allclose(res4, res5)
Example #12
    def test(self, arr1, arr2):
        p_value = 0
        if self.statistics == "auto":
            # Levene's test for equality of variances. If variances are equal,
            if stats.levene(arr1, arr2)[1] > 0.05:
                # Shapiro-Wilk for normality. If both samples are normal,
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    p_value = stats.ttest_ind(arr1, arr2)[1]
                else:
                    # p = Mann
                    if equal(arr1, arr2):
                        p_value = 1
                    else:
                        p_value = stats.mannwhitneyu(arr1, arr2)[1]
            else:
                p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]

        elif self.statistics == "student":
            p_value = stats.ttest_ind(arr1, arr2)[1]
        elif self.statistics == "welch":
            p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]
        elif self.statistics == "mann":
            if equal(arr1, arr2):
                p_value = 1
            else:
                p_value = stats.mannwhitneyu(arr1, arr2)[1]
        return p_value
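
A minimal standalone sketch of the same selection logic, assuming only scipy (the 0.05 thresholds mirror the method above; the project's equal() helper is replaced by a plain list comparison):

from scipy import stats

def auto_pvalue(arr1, arr2, alpha=0.05):
    # Levene: if variances look equal, check normality (Shapiro-Wilk);
    # Student's t if both normal, Mann-Whitney otherwise; Welch if unequal.
    if stats.levene(arr1, arr2)[1] > alpha:
        if stats.shapiro(arr1)[1] > alpha and stats.shapiro(arr2)[1] > alpha:
            return stats.ttest_ind(arr1, arr2)[1]
        if list(arr1) == list(arr2):
            return 1.0  # mannwhitneyu is degenerate for identical samples
        return stats.mannwhitneyu(arr1, arr2)[1]
    return stats.ttest_ind(arr1, arr2, equal_var=False)[1]
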
    def t_value(self, data, matched_tr_ind_list, matched_co_ind_list, treat_col_name):
        """
        t値を計算する
        """
        ps_tr, ps_co = data.ix[matched_tr_ind_list], data.ix[matched_co_ind_list]
        rand_tr, rand_co = data.ix[data[treat_col_name]==1], data.ix[data[treat_col_name]==0]
        del ps_tr[treat_col_name]
        del ps_co[treat_col_name]
        del rand_tr[treat_col_name]
        del rand_co[treat_col_name]
        ps_t_val_dict, rand_t_val_dict = {}, {}

        for column in ps_tr.columns:
            ## t-value when matched on propensity score
            ps_tr_array, ps_co_array = numpy.array(ps_tr[column]), numpy.array(ps_co[column])
            # ps_tr_n, ps_co_n = ps_tr.shape[0], ps_co.shape[0]
            # ps_tr_mean, ps_co_mean = numpy.mean(ps_tr_array), numpy.mean(ps_co_array)
            # ps_tr_var, ps_co_var = numpy.var(ps_tr_array), numpy.var(ps_co_array)

            ps_t_val = stats.ttest_ind(ps_tr_array, ps_co_array)[0]

            ps_t_val_dict.update({column:ps_t_val})

            ## t-value without matching
            rand_tr_array, rand_co_array = numpy.array(rand_tr[column]), numpy.array(rand_co[column])
            # rand_tr_n, rand_co_n = rand_tr.shape[0], rand_co.shape[0]
            # rand_tr_mean, rand_co_mean = numpy.mean(rand_tr_array), numpy.mean(rand_co_array)
            # rand_tr_var, rand_co_var = numpy.var(rand_tr_array), numpy.var(rand_co_array)

            rand_t_val = stats.ttest_ind(rand_tr_array, rand_co_array)[0]

            rand_t_val_dict.update({column:rand_t_val})

        return ps_t_val_dict, rand_t_val_dict
def read_result2():
    result1 = pickle.load(open('data/result/lab2/case1.obj', 'rb'))
    result2 = pickle.load(open('data/result/lab2/case2.obj', 'rb'))
    result3 = pickle.load(open('data/result/lab2/case3.obj', 'rb'))
    
    print(np.mean(result1), np.var(result1))
    print(np.mean(result2), np.var(result2), stats.ttest_ind(result1, result2))
    print(np.mean(result3), np.var(result3), stats.ttest_ind(result2, result3))
def significant(array1, array2):
    try:
        arr1 = np.array(array1)
        arr2 = np.array(array2)
        pvalue = stats.ttest_ind(arr1, arr2)[1]
        print(pvalue)
        return pvalue < 0.1
    except Exception:
        print('PROBLEM!')
        return None
def do_t_tests():
	two_sample = stats.ttest_ind(sec_rating, not_rating)

	print("The t-statistic is %.3f and the p-value is %.3f." % two_sample)

	# assuming unequal population variances
	two_sample_diff_var = stats.ttest_ind(sec_rating, not_rating, equal_var=False)

	print("If we assume unequal variances, then the t-statistic is %.3f and the p-value is %.3f." % two_sample_diff_var)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("top_data_dir")
    parser.add_argument("performance_dir")
    parser.add_argument("--basemodel","-bm",default="raw_f2exp")
    parser.add_argument("--expansion_model","-em",default="raw_f2exp_fbDocs:10")
    parser.add_argument("--threshold","-t",type=float,default=0.6)
    args=parser.parse_args()

    datasets = load_data(args.top_data_dir)

    
    for collection_name in datasets:
        for year in datasets[collection_name].years:
            base = []
            expansion = []
            probs = []
            performance_file = os.path.join(args.performance_dir,year.name)
            performances = json.load(open(performance_file))
            print "for year %s" %(year)
            for day in datasets[collection_name]._prediction[year]:
                if year==Year.y2016:
                    day_string = "201608%s" %(day.zfill(2))
                elif year == Year.y2015:
                    day_string = "201507%s" %(day.zfill(2))
                elif year == Year.y2017:
                    if int(day) < 10:
                        day_string = "201708%s" %(day.zfill(2))
                    else:
                        day_string = "201707%s" %(day.zfill(2))
                else:
                    raise NotImplementedError("year %s is not implemented!" % year.name)
                for qid in datasets[collection_name]._prediction[year][day]:
                    try:
                        base_performance = performances[args.basemodel][qid][day_string]
                    except KeyError:
                        continue
                    else:
                        if datasets[collection_name].is_silent_day(year,day,qid):
                            base.append(.0)
                            expansion.append(.0)
                        else:
                            base.append(base_performance)
                            query_day_prob = datasets[collection_name].get_prob(year,day,qid)
                            probs.append(query_day_prob)
                            if query_day_prob > args.threshold:
                                expansion.append(base_performance)
                            else:
                                expansion.append(performances[args.expansion_model][qid][day_string])
            print "There are %d pairs" %(len(base))                
            base_avg = sum(base) / len(base)
            expansion_avg = sum(expansion) / len(expansion)
            print "%s: %f, %s: %f" %(args.basemodel,base_avg,
                                     args.expansion_model,expansion_avg)
            # print probs
            print ttest_ind(base,expansion)  
            print "-"*20
Example #18
def motifStatsRandMarix(data, motifSize=3, degree=10, usetotal=False):
	"""Outputs text file with stats on the motifs in data"""
			
	filename = "result/t_test_Deg-{0}_Size-{1}.txt".format(degree,motifSize)
	with open(filename,'w') as f:
		f.write("Student's T test comparing both AD/NL/MCI to Random.\n\n")
		for corr in ['corr']:
			title = "P-values for "+corr+" data set compared to random generated graphs\n"
			f.write(title)
			
			data[('MCIR', corr)] = genRandomGraphs(data[('MCI', corr)], degree, len(data[('MCI', corr)])) 
			data[('ADR', corr)] = genRandomGraphs(data[('AD', corr)], degree, len(data[('AD', corr)])) 
			data[('NLR', corr)] = genRandomGraphs(data[('NL', corr)], degree, len(data[('NL', corr)]))
			 
			motifsNL=findMotifs(data,('NL',corr), motifSize, degree, usetotal)
			motifsMCI=findMotifs(data,('MCI',corr), motifSize, degree, usetotal)
			motifsAD=findMotifs(data,('AD',corr), motifSize, degree, usetotal)
			motifsNLR=findMotifs(data,('NLR',corr), motifSize, degree, usetotal)
			motifsMCIR=findMotifs(data,('MCIR',corr), motifSize, degree, usetotal)
			motifsADR=findMotifs(data,('ADR',corr), motifSize, degree, usetotal)
		
			allMotifs = list( set(motifsNL.keys())
							& set(motifsAD.keys())
							& set(motifsMCI.keys())
							& set(motifsNLR.keys())
							& set(motifsADR.keys())
							& set(motifsMCIR.keys()) )
			allMotifs.sort()
			f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}{6:>15}{7:>15}{8:>15}{9:>15}\n".format(
				'MOTIF ID','NL', 'MCI','AD', 'NLR Mean','MCIR Mean','ADR Mean', 'NLR Std','MCIR Std', 'ADR Std'))
			
			motifStats = []
			for key in allMotifs:
				tMCI, probMCI = stats.ttest_ind(motifsMCI[key], motifsMCIR[key])
				tAD, probAD = stats.ttest_ind(motifsAD[key], motifsADR[key])
				tNL, probNL = stats.ttest_ind(motifsNL[key], motifsNLR[key])
				motifStats.append((key,probNL,probMCI,probAD))
			
			motifStats.sort(key=lambda x: min(x))
				
			for key, probNL, probMCI, probAD in motifStats:
				normRMean = motifsNLR[key].mean()
				mciRMean = motifsMCIR[key].mean()
				adRMean = motifsADR[key].mean()
				normRVar = motifsNLR[key].std()
				mciRVar = motifsMCIR[key].std()
				adRVar = motifsADR[key].std()
				if probMCI<0.01 or probAD<0.01 or probNL<0.01:
					star = "**"
				elif probMCI<0.1 or probAD<0.1 or probNL<0.1:
					star = "*"
				else:
					star = ""
				line = star+"{0:>"+str(10-len(star))+"}{1:>15.3}{2:>15.3}{3:>15.3}{4:>15.3}{5:>15.3}{6:>15.3}{7:>15.3}{8:>15.3}{9:>15.3}\n"
				f.write(line.format(str(int(key)), probNL, probMCI, probAD,normRMean,mciRMean,adRMean,normRVar,mciRVar,adRVar))
			f.write("\n\n") 
    def generate_sequence_gene_expression_statistics(self, show_species_charts=True, show_chart=True):
        i = -1
        if self.multiple_networks:
            for nw_ge_file in glob.glob(self.output_silix_nw_exp_data_folder_path + '/*.txt'):
                i += 1
                mapping_data = np.genfromtxt(nw_ge_file, delimiter=',', dtype=str)
                if len(mapping_data) > 0:
                    print('Network: ', i, mapping_data.shape)
                    x = np.array(mapping_data[:, 2], dtype=float)
                    y = np.array(mapping_data[:, 3], dtype=float)
                    ca_stat = ca_pvalue = spike_stat = spike_pvalue = ind_stat = ind_pvalue = 0

                    if not np.all(x == 0):
                        ca_stat, ca_pvalue = stats.ttest_1samp(x[x != 0], 0)
                        spike_stat, spike_pvalue = stats.ttest_1samp(y[y != 0], 0)
                        ind_stat, ind_pvalue = stats.ttest_ind(x[x != 0], y[y != 0], equal_var=False)
                    nw_number = int(re.findall(r'\d+', nw_ge_file)[0])
                    nw_statistics = (
                        [nw_number, x[x != 0].mean(), x[x != 0].var(), x[x != 0].std(), y[y != 0].mean(),
                         y[y != 0].var(), y[y != 0].std(), ca_stat, ca_pvalue,
                         spike_stat, spike_pvalue, ind_stat, ind_pvalue])
                    self.network_gene_expressions.append(nw_statistics)
        else:
            mapping_data = np.genfromtxt(self.output_silix_nw_exp_data_folder_path + self.silix_nw_exp_data_filename,
                                         delimiter=',', dtype=str)
            if len(mapping_data) > 0:
                print('Network: ', mapping_data.shape)
                x = np.array(mapping_data[:, 2], dtype=float)
                y = np.array(mapping_data[:, 3], dtype=float)
                ca_stat = ca_pvalue = spike_stat = spike_pvalue = ind_stat = ind_pvalue = 0
                if not np.all(x == 0):
                    ca_stat, ca_pvalue = stats.ttest_1samp(x[x != 0], 0)
                    spike_stat, spike_pvalue = stats.ttest_1samp(y[y != 0], 0)
                    ind_stat, ind_pvalue = stats.ttest_ind(x[x != 0], y[y != 0], equal_var=False)
                nw_statistics = (
                    [0, x[x != 0].mean(), x[x != 0].var(), x[x != 0].std(), y[y != 0].mean(),
                     y[y != 0].var(), y[y != 0].std(), ca_stat, ca_pvalue,
                     spike_stat, spike_pvalue, ind_stat, ind_pvalue])
                self.network_gene_expressions.append(nw_statistics)
        # convert list into array
        self.network_gene_expressions = np.asarray(self.network_gene_expressions)

        # Save network gene expression statistics to csv file
        gene_expression_statistics_file = self.output_silix_nw_exp_data_folder_path + 'gene_expression_statistics.csv'
        with open(gene_expression_statistics_file, 'w') as f_handle:
            f_handle.write(
                'Network, 9mM CA Mean, 9mM CA Var, 9mM CA SD, Spike Mean, Spike Var, Spike SD, 9mM CA ttest-stat, 9mM CA ttest-pvalue, Spike ttest-stat, Spike ttest-pvalue, Ind ttest-stat, Ind ttest-pvalue  \n')
            np.savetxt(f_handle, self.network_gene_expressions, delimiter=',')

        if show_species_charts:
            self.generate_species_wise_gene_expression_statistics()

        if self.multiple_networks and show_chart:
            self.plot_all_nw_gene_expr_stats_chart()
        elif show_chart and not self.multiple_networks:
            self.plot_single_network_gene_expr_stats_chart()
def ttest(exp1, exp2, exp3):
    print("=== exp1 vs exp2")
    ts, pvalue = stats.ttest_ind(exp1, exp2, equal_var=False)
    print("p value =", pvalue)
    print("=== exp1 vs exp3")
    ts, pvalue = stats.ttest_ind(exp1, exp3, equal_var=False)
    print("p value =", pvalue)
    print("=== exp2 vs exp3")
    ts, pvalue = stats.ttest_ind(exp2, exp3, equal_var=False)
    print("p value =", pvalue)
Example #21
def motifStats(data, motifSize=3, degree=10, usetotal=False):
	"""Outputs pdf file with stats on the motifs in data"""
			
	filename = "result/t_test_Deg-{0}_Size-{1}.txt".format(degree,motifSize)
	with open(filename,'w') as f:
		f.write("Student's T test comparing both MCI and AD to NL.\n\n")
		for corr in ('corr','lcorr','lacorr'):
			title = "P-values for "+corr+" data set compared to normal patients\n"
			f.write(title)
	
			
			motifsNL=findMotifs(data,('NL',corr), motifSize, degree, usetotal)
			motifsMCI=findMotifs(data,('MCI',corr), motifSize, degree, usetotal)
			motifsAD=findMotifs(data,('AD',corr), motifSize, degree, usetotal)
			
			#mats = []
			#for i in xrange(108):	
			#	x = np.random.rand(88,88)
			#	x -= np.diag(np.diag(x))
			#	mats.append(x)
			#rand = {}
			#rand['derp'] = mats
			#motifsNL=findMotifs(rand,'derp', motifSize, degree, usetotal)
		
			allMotifs = list( set(motifsNL.keys())
							& set(motifsAD.keys())
							& set(motifsMCI.keys()) )

			f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}{6:>15}{7:>15}{8:>15}\n".format(
				'MOTIF ID','MCI','AD','Norm Mean','MCI Mean','AD Mean','NORM Std','MCI Std', 'AD Std'))
			
			motifStats = []
			for key in allMotifs:
				tMCI, probMCI = stats.ttest_ind(motifsMCI[key], motifsNL[key])
				tAD, probAD = stats.ttest_ind(motifsAD[key], motifsNL[key])
				motifStats.append((key,probMCI,probAD))
			
			motifStats.sort(key=lambda x: min(x))
				
			for key, probMCI, probAD in motifStats:
				normMean = motifsNL[key].mean()
				mciMean = motifsMCI[key].mean()
				adMean = motifsAD[key].mean()
				normVar = motifsNL[key].std()
				mciVar = motifsMCI[key].std()
				adVar = motifsAD[key].std()
				if probMCI<0.01 or probAD<0.01:
					star = "**"
				elif probMCI<0.1 or probAD<0.1:
					star = "*"
				else:
					star = ""
				line = star+"{0:>"+str(10-len(star))+"}{1:>15.3}{2:>15.3}{3:>15.3}{4:>15.3}{5:>15.3}{6:>15.3}{7:>15.3}{8:>15.3}\n"
				f.write(line.format(str(int(key)), probMCI, probAD,normMean,mciMean,adMean,normVar,mciVar,adVar))
			f.write("\n\n")
Example #22
    def return_test_results(self, arr1, arr2):
        test_name = ""
        p_value = 0
        t_value = 0
        levene = stats.levene(arr1, arr2)[1]
        if self.statistics == "auto":
            # Levene's test for equality of variances. If variances are equal,
            if levene > 0.05:
                # Shapiro-Wilk for normality. If both samples are normal,
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    test_name = "Student"
                    result = stats.ttest_ind(arr1, arr2)
                    t_value = result[0]
                    p_value = result[1]
                else:
                    # p = Mann
                    test_name = "Mann"
                    if equal(arr1, arr2):
                        t_value = None
                        p_value = 1
                    else:
                        result = stats.mannwhitneyu(arr1, arr2)
                        t_value = result[0]
                        p_value = result[1]
            else:
                test_name = "Welch"
                result = stats.ttest_ind(arr1, arr2, equal_var=False)
                t_value = result[0]
                p_value = result[1]

        elif self.statistics == "student":
            test_name = "Student"
            result = stats.ttest_ind(arr1, arr2)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "welch":
            test_name = "Welch"
            result = stats.ttest_ind(arr1, arr2, equal_var=False)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "mann":
            test_name = "Mann"
            if equal(arr1, arr2):
                t_value = None
                p_value = 1
            else:
                result = stats.mannwhitneyu(arr1, arr2)
                t_value = result[0]
                p_value = result[1]

        df = len(arr1) + len(arr2) - 2

        return [test_name, t_value, p_value, df, levene]
def read_result():
    fsc0 = pickle.load(open('data/result/fsc1.obj', 'rb'))
    fsc25 = pickle.load(open('data/result/fsc2.obj', 'rb'))
    fsc50 = pickle.load(open('data/result/fsc3.obj', 'rb'))
    fsc75 = pickle.load(open('data/result/fsc4.obj', 'rb'))
    fsc100 = pickle.load(open('data/result/fsc5.obj', 'rb'))
    
    print(np.mean(fsc0), np.var(fsc0))
    print(np.mean(fsc25), np.var(fsc25), stats.ttest_ind(fsc0, fsc25))
    print(np.mean(fsc50), np.var(fsc50), stats.ttest_ind(fsc25, fsc50))
    print(np.mean(fsc75), np.var(fsc75), stats.ttest_ind(fsc50, fsc75))
    print(np.mean(fsc100), np.var(fsc100), stats.ttest_ind(fsc100, fsc75))
def table_post_processing(results_collector):
    stats_collector = []

    print('\n Summary of the analysis:')

    for line in results_collector:
        class_name, _time_stamps = classify(line[0])
        stats_collector.append([class_name, _time_stamps, line[6], line[7]])

    stats_collector = np.array(stats_collector)

    for class_name in classes:
        class_set_filter = stats_collector[:, 0] == class_name
        if any(class_set_filter):
            class_set = stats_collector[class_set_filter, :]
            print(class_name)
            final_stats_collector_x = []
            final_stats_collector_y = []
            final_stats_collector_e = []
            raw_times = {}
            for time_stamp in time_stamp_coll:
                time_stamp_filter = class_set[:, 1] == time_stamp
                if any(time_stamp_filter):
                    time_stamp_set = class_set[time_stamp_filter, :]
                    mean = np.nanmean(time_stamp_set[:, 2].astype(np.float64))
                    err = np.nanstd(time_stamp_set[:, 2].astype(np.float64)) / \
                          np.sqrt(len(time_stamp_set[:, 2]))*1.96
                    raw_times[time_stamp] = rm_nans(time_stamp_set[:, 2].astype(np.float64))
                    print('\t time: %s, mean: %s, err: %s' % (time_stamp, mean, err))
                    final_stats_collector_x.append(time_stamps[time_stamp])
                    final_stats_collector_y.append(mean)
                    final_stats_collector_e.append(err)

            time_translator = dict([(item, i) for i, item in enumerate(sorted(raw_times.keys()))])

            samples_n = len(raw_times.keys())
            print(samples_n)
            p_val_array = np.zeros((samples_n, samples_n))

            for time1, time2 in combinations(sorted(raw_times.keys()), 2):
                print(time1, time2)
                print(time_translator[time1], time_translator[time2])
                print(ttest_ind(raw_times[time1], raw_times[time2]))
                _, p_val = ttest_ind(raw_times[time1], raw_times[time2])
                p_val_array[time_translator[time1], time_translator[time2]] = p_val

            print(p_val_array)

            plt.errorbar(final_stats_collector_x, final_stats_collector_y, final_stats_collector_e,
                         label=class_name)

    plt.legend()
    plt.show()
Example #25
def determine_significance(mesa1, mesa2):
    """ Determine whether two sets of values differ significantly.

    In the best case, we can establish a normal distribution and equal
    variance. Once determined, we use the independent t-test if the values
    have equal variance. If we have normal data but the variance is unequal,
    Welch's t-test is used.
    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    In cases where we cannot establish normality, the Mann-Whitney u-test is
    preferred, but that test is only effective with more than 20 samples.
    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal, Non_normal Unknown')
    normality = Distribution.Normal
    try:
        k2, normal = stats.normaltest(mesa1)
        # FIXME: Unhardcode
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal

        k2, normal = stats.normaltest(mesa2)
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal
    except ValueError:
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, normality == Distribution.Normal,
                "t-test" if equal_variance else "Welch's")
    elif args.mannwhitney:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        return (p, len(mesa1) < 20 or len(mesa2) < 20, "Mann-Whitney")

    if normality == Distribution.Normal:
        error_handler='raise'
        if np.var(mesa1) == 0 and equal_variance:
            error_handler='ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, "t-test" if equal_variance else "Welch's")
    else:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")
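
A hypothetical usage sketch; NORMAL_CI, is_equal_variance, and args are module globals in the original project, so the sketch stubs them with assumed stand-ins (every name below is illustrative, not the project's real configuration):

import numpy as np
from enum import Enum  # determine_significance builds its Distribution enum from this
from scipy import stats

NORMAL_CI = 0.05  # assumed cutoff for the normality p-value

def is_equal_variance(a, b):
    # Stub: Levene's test at the same cutoff.
    return stats.levene(a, b)[1] > NORMAL_CI

class _Args:  # stand-in for the argparse namespace the function reads
    ttest = False
    mannwhitney = False

args = _Args()

mesa1 = np.random.randn(30) + 0.5
mesa2 = np.random.randn(30)
p, flawed, test_used = determine_significance(mesa1, mesa2)
print(test_used, p, flawed)
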
Example #26
def main():
    f27_scan = open('sim_scan27.txt', 'r')
    f27_table = open('sim_table27.txt', 'r')
    f35932_scan = open('sim_scan35932.txt', 'r')
    f35932_table = open('sim_table35932.txt', 'r')
    
    ntests = 10
    
    scan27 = [0 for i in range(ntests)]
    table27 = [0 for i in range(ntests)]
    scan35932 = [0 for i in range(ntests)]
    table35932 = [0 for i in range(ntests)]

    files = [f27_scan, f27_table, f35932_scan, f35932_table]
    arrs = [scan27, table27, scan35932, table35932]

    for i in range(ntests):
        for j in range(4):
            line = files[j].readline()
            if line.endswith('\n'):
                line = line[:-1]
            arrs[j][i] = float(line)
    for j in range(4):
        files[j].close()

    _, p27 = stats.ttest_ind(scan27, table27, equal_var=False)
    mean27scan = stats.tmean(scan27)
    mean27table = stats.tmean(table27)
    var27scan = stats.tvar(scan27)
    var27table = stats.tvar(table27)

    _, p35932 = stats.ttest_ind(scan35932, table35932, equal_var=False)
    mean35932scan = stats.tmean(scan35932)
    mean35932table = stats.tmean(table35932)
    var35932scan = stats.tvar(scan35932)
    var35932table = stats.tvar(table35932)

    f = open('sim_results_compare_scan_table.txt', 'w')
    f.write('27\n')
    f.write('scan mean: ' + str(mean27scan) + '\n')
    f.write('scan var: ' + str(var27scan) + '\n')
    f.write('table mean: ' + str(mean27table) + '\n')
    f.write('table var: ' + str(var27table) + '\n')
    f.write('p-value: ' + str(p27) + '\n\n')

    f.write('35932\n')
    f.write('scan mean: ' + str(mean35932scan) + '\n')
    f.write('scan var: ' + str(var35932scan) + '\n')
    f.write('table mean: ' + str(mean35932table) + '\n')
    f.write('table var: ' + str(var35932table) + '\n')
    f.write('p-value: ' + str(p35932) + '\n')

    f.close()
Example #27
def main(pkl_list, name_list, cut=sys.maxsize):
    pickles = plot_util.load_pickles(name_list, pkl_list)
    best_dict, idx_dict, keys = plot_util.get_best_dict(name_list, pickles,
                                                       cut=cut)

    for k in keys:
        sys.stdout.write("%10s: %s experiment(s)\n" % (k, len(best_dict[k])))

    sys.stdout.write("Unpaired t-tests-----------------------------------------------------\n")
    # TODO: replace by itertools
    for idx, k in enumerate(keys):
        if len(keys) > 1:
            for j in keys[idx+1:]:
                t_true, p_true = stats.ttest_ind(best_dict[k], best_dict[j])
                rounded_t_true, rounded_p_true = stats.ttest_ind(numpy.round(best_dict[k], 3),
                                                                 numpy.round(best_dict[j], 3))

                sys.stdout.write("%10s vs %10s\n" % (k, j))
                sys.stdout.write("Standard independent 2 sample test, equal population variance\n")
                sys.stdout.write(" "*24 + "  T: %10.5e, p-value: %10.5e (%5.3f%%) \n" %
                                (t_true, p_true, p_true*100))
                sys.stdout.write("Rounded:                ")
                sys.stdout.write("  T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                (rounded_t_true, rounded_p_true, rounded_p_true*100))
                if tuple(map(int, (scipy.__version__.split(".")))) >= (0, 11, 0):
                    # print scipy.__version__ >= '0.11.0'
                    t_false, p_false = stats.ttest_ind(best_dict[k], best_dict[j], equal_var=False)
                    rounded_t_false, rounded_p_false = stats.ttest_ind(numpy.round(best_dict[k], 3),
                                                                       numpy.round(best_dict[j], 3),
                                                                       equal_var=False)
                    sys.stdout.write("Welch's t-test, no equal population variance\n")
                    sys.stdout.write(" "*24)
                    sys.stdout.write(": T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                    (t_false, p_false, p_false*100))
                    sys.stdout.write("Rounded:                ")
                    sys.stdout.write(": T: %10.5e, p-value: %10.5e (%5.3f%%)\n" %
                                    (rounded_t_false, rounded_p_false, rounded_p_false*100))
                sys.stdout.write("\n")

    sys.stdout.write("Best Value-----------------------------------------------------------\n")
    for k in keys:
        sys.stdout.write("%10s: %10.5f (min: %10.5f, max: %10.5f, std: %5.3f)\n" %
                        (k, float(numpy.mean(best_dict[k])), float(numpy.min(best_dict[k])),
                         numpy.max(best_dict[k]), float(numpy.std(best_dict[k]))))

    sys.stdout.write("Needed Trials--------------------------------------------------------\n")
    for k in keys:
        sys.stdout.write("%10s: %10.5f (min: %10.5f, max: %10.5f, std: %5.3f)\n" %
                        (k, float(numpy.mean(idx_dict[k])), float(numpy.min(idx_dict[k])),
                         numpy.max(idx_dict[k]), float(numpy.std(idx_dict[k]))))

    sys.stdout.write("------------------------------------------------------------------------\n")
    def runCompare(self, objId1, objId2, label1, label2, paired, expression1, expression2):
        fh = open(self._getPath("report.txt"),'w')

        self.experiment1 = self.readExperiment(self.inputExperiment1.get().fnPKPD)
        self.experiment2 = self.readExperiment(self.inputExperiment2.get().fnPKPD)
        label2ToUse = self.label1.get() if self.label2.get()=="" else self.label2.get()
        if self.paired:
            x1=[]
            x2=[]
            for sampleName, sample in self.experiment1.getSubGroup(self.expression1.get()).items():
                x1.append(float(sample.descriptors[self.label1.get()]))
                if sampleName in self.experiment2.samples:
                    x2.append(float(self.experiment2.samples[sampleName].descriptors[label2ToUse]))
                else:
                    raise "Cannot find sample %s in Experiment 2"%sample.sampleName
        else:
            expression2ToUse = self.expression1.get() if self.expression2.get()=="" else self.expression2.get()
            x1 = [float(x) for x in self.experiment1.getSubGroupLabels(self.expression1.get(),self.label1.get())]
            x2 = [float(x) for x in self.experiment2.getSubGroupLabels(expression2ToUse,label2ToUse)]
        self.doublePrint(fh,"Values in SubGroup 1: %s"%str(x1))
        self.doublePrint(fh,"Values in SubGroup 2: %s"%str(x2))
        self.doublePrint(fh,"Testing H0: mu1=mu2")
        self.doublePrint(fh," ")

        try:
            if self.paired:
                [t,pval] = stats.ttest_rel(np.asarray(x1,np.double),np.asarray(x2,np.double))
                self.doublePrint(fh,"T-test two paired samples: t-statistic=%f p-value=%f"%(t,pval))
            else:
                [t,pval] = stats.ttest_ind(np.asarray(x1,np.double),np.asarray(x2,np.double),equal_var=True)
                self.doublePrint(fh,"T-test two independent samples (same variance): t-statistic=%f p-value=%f"%(t,pval))
        except:
            pass

        if not self.paired:
            try:
                [t,pval] = stats.ttest_ind(x1,x2, equal_var=False)
                self.doublePrint(fh,"T-test two independent samples (different variance, Welch's test): t-statistic=%f p-value=%f"%(t,pval))
            except:
                pass

        try:
            if self.paired:
                [w,pval] = stats.wilcoxon(x1, x2, correction=True)
                self.doublePrint(fh,"Wilcoxon signed rank test for two paired samples: w-statistic=%f p-value=%f"%(w,pval))
            else:
                [u,pval] = stats.mannwhitneyu(x1, x2, use_continuity=True)
                self.doublePrint(fh,"Mann-Whitney U test for two independent samples: u-statistic=%f p-value=%f"%(u,pval))
        except:
            pass

        fh.close()
def detect_hemizygous_markers(pop_file , coverage_file, nb_sample_required=0):
    sample2pop, pop2sample = read_pop_file(pop_file)
    if len(pop2sample)!=2:
        logging.critical('Hemizygous markers can only be searched for between two sets of samples. Edit your population file so it has two populations')
        return -1
    pop1,pop2 = pop2sample.keys()
    samples_pop1 = pop2sample.get(pop1)
    samples_pop2 = pop2sample.get(pop2)

    all_markers, all_samples, all_samples_to_norm_coverage = get_normalize_coverage(coverage_file, nb_sample_required)

    sample_errors =  set(sample2pop.keys()).difference(set(all_samples))
    if len(sample_errors)>0:
        logging.critical('%s samples (%s) from the population file not found in the coverage file'%(len(sample_errors), ', '.join(sample_errors)))
        return -2

    header = ["#consensus", "mean_%s"%(pop1), "mean_%s"%(pop2), "fold_change",
              "t_test_%s_eq_2X_%s"%(pop1,pop2), "t_test_%s_eq_%s"%(pop1,pop2)]
    all_lines = [' '.join(header)]
    for i, marker in enumerate(all_markers):
        out=[marker]
        out_pop1=[]
        out_pop2=[]
        pop1_values = []
        pop2_values = []
        for sample in samples_pop1:
            cov = all_samples_to_norm_coverage.get(sample)[i]
            pop1_values.append(cov)
            out_pop1.append(str(cov))

        for sample in samples_pop2:
            cov = all_samples_to_norm_coverage.get(sample)[i]
            pop2_values.append(cov)
            out_pop2.append(str(cov))
        pop1_nvalues = numpy.array(pop1_values)
        pop1_nvalues_2 =pop1_nvalues*2
        pop2_nvalues = numpy.array(pop2_values)
        #This compares the normalized values; we assume they should not be equal
        t_stat_eq, pvalue_eq = stats.ttest_ind(pop1_nvalues, pop2_nvalues)
        #This compares the normalized values with the normalized values * 2; we assume they should be equal
        t_stat_2X, pvalue_2X = stats.ttest_ind(pop1_nvalues_2, pop2_nvalues)
        fold=pop2_nvalues.mean()/pop1_nvalues.mean()

        if pvalue_eq<.05 and pvalue_2X >0.5 and fold < 2.2 and fold >1.8:
            out.append(str(pop1_nvalues.mean()))
            out.append(str(pop2_nvalues.mean()))
            out.append(str(fold))
            out.append(str(pvalue_2X))
            out.append(str(pvalue_eq))
            all_lines.append(' '.join(out))

    return '\n'.join(all_lines)
Example #30
    def stats(self, x, y):
        if not self.diagonal:
            xflatten = np.delete(x, [i * (x.shape[0] + 1) for i in range(x.shape[0])])
            yflatten = np.delete(y, [i * (y.shape[0] + 1) for i in range(y.shape[0])])
            p = np.corrcoef(xflatten, yflatten)
            utils.printf('Pearson\'s correlation:\n{}'.format(p))
            utils.printf('Z-Test:{}'.format(ztest(xflatten, yflatten)))
            utils.printf('T-Test:{}'.format(ttest_ind(xflatten, yflatten)))
        else:
            p = np.corrcoef(x, y)
            utils.printf('Pearson\'s correlation:\n{}'.format(p))
            utils.printf('Z-Test:{}'.format(ztest(x, y)))
            utils.printf('T-Test:{}'.format(ttest_ind(x, y)))
Example #31
        # if we leave in Aid Response all the distributions will test
        # as the same

        # comment out the next line to test with all Inc Types
        row.pop(1)

        # convert from strings
        floats = []
        for count in row:
            floats.append(float(count))

        # keep track of the values
        hoodCounts.append(floats)

for index in range(len(hoodCounts)):
    # get the current Neighborhood
    compareHood = hoodNames[index]
    compareSet = hoodCounts[index]
    for other in range(len(hoodCounts)):
        # compare to each other neighborhood
        if index != other:
            testHood = hoodNames[other]
            testSet = hoodCounts[other]

            # Run a t test on the incident distributions
            pVals = stats.ttest_ind(compareSet, testSet, equal_var=False)
            if pVals[1] < 0.05:
                print(testHood)
                print(compareHood)
                print(pVals)
import numpy as np
from scipy import stats

N = 10
a = np.random.randn(N) + 2
b = np.random.randn(N)

var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

s = np.sqrt((var_a + var_b) / 2)
  
t = ((a.mean() - b.mean()) / (s * np.sqrt(2.0 / N)))

df = 2*N - 2

p = 1 - stats.t.cdf(abs(t), df=df)  # one tail; doubled below for the two-sided p

print("t:\t", t, "p:\t", 2*p)

t2, p2 = stats.ttest_ind(a, b)

print("t2:\t", t2, "p2:\t", p2)
Example #33
# Import independent two-sample t-test
from scipy.stats import ttest_ind

# Divide `df.brain_vol` by `df.skull_vol`
df['adj_brain_vol'] = df.brain_vol / df.skull_vol

# Select brain measures by Alzheimers group
brain_alz = df.loc[df.alzheimers == True, 'adj_brain_vol']
brain_typ = df.loc[df.alzheimers == False, 'adj_brain_vol']

# Test the null hypothesis of equal adjusted brain volumes and show the result
results = ttest_ind(brain_alz, brain_typ)
print(results)
Example #34
                break

    client_seconds = time.perf_counter() - start_seconds
    print(f'client-time\t{query_index}\t{client_seconds}\t{server_seconds}')

    # Run additional profiling queries to collect profile data, but only if test times appeared to be different.
    # We have to do it after normal runs because otherwise it will affect test statistics too much
    if len(all_server_times) != 2:
        continue

    if len(all_server_times[0]) < 3:
        # Don't fail if for some reason there are not enough measurements.
        continue

    pvalue = stats.ttest_ind(all_server_times[0],
                             all_server_times[1],
                             equal_var=False).pvalue
    median = [statistics.median(t) for t in all_server_times]
    # Keep this consistent with the value used in report. Should eventually move
    # to (median[1] - median[0]) / min(median), which is compatible with "times"
    # difference we use in report (max(median) / min(median)).
    relative_diff = (median[1] - median[0]) / median[0]
    print(
        f'diff\t{query_index}\t{median[0]}\t{median[1]}\t{relative_diff}\t{pvalue}'
    )
    if abs(relative_diff) < ignored_relative_change or pvalue > 0.05:
        continue

    # Perform profile runs for fixed amount of time. Don't limit the number
    # of runs, because we also have short queries.
    profile_start_seconds = time.perf_counter()
    count2 = count2 - 1
print('After Sampling:\n')
print(sampled_native_df.describe())
print(sampled_singularity_df.describe())
#df = native_df.merge(singularity_df, how='left')
#print(df.describe())
print('p-value:\t 0.05\n')
print('degrees of freedom:\t ~60\n')
print('Critical t-val:\t 2.0\n')

t_val_rel = stats.ttest_rel(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_rel)
t_val_ind = stats.ttest_ind(
    sampled_native_df.loc[:, 'Native Runtime (Seconds)'],
    sampled_singularity_df.loc[:, 'Singularity Runtime (Seconds)'])
print(t_val_ind)

ax = plt.gca()
native_df.plot(kind='hist', y='Native Runtime (Seconds)', color='red', ax=ax)
singularity_df.plot(kind='hist',
                    y='Singularity Runtime (Seconds)',
                    color='blue',
                    ax=ax)
plt.savefig('2gpu_Histogram.png')
plt.savefig('2gpu_Histogram.eps')

plt.figure(2)
ax2 = plt.gca()
sampled_native_df.plot(kind='hist',
Example #36
    # ## Use similarity across conditions as the 4th dimension ##########################################
    print("Compute similarity via ttest.")
    condition_names = list(np.unique(haxby_labels))
    n_conds = len(condition_names)
    n_compares = n_conds * (n_conds - 1) // 2

    p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1]))
    comparison_text = []
    comparison_img = []
    idx = 0
    for i, cond in enumerate(condition_names):
        for j, cond2 in enumerate(condition_names[i + 1:]):
            print("Computing ttest for %s vs. %s." % (cond, cond2))
            _, p_vector = stats.ttest_ind(
                masked_fmri_vectors[haxby_labels == cond, :],
                masked_fmri_vectors[haxby_labels == cond2, :],
                axis=0)

            # Normalize and log-transform
            p_vector /= p_vector.max()  # normalize
            p_vector = -np.log10(p_vector)
            p_vector[np.isnan(p_vector)] = 0.
            p_vector[p_vector > 10.] = 10.

            p_img = epi_masker.inverse_transform(p_vector)
            comparison_img.append(p_img)
            comparison_text.append('%s vs. %s' % (cond, cond2))
            p_vectors[idx, :] = p_vector
            idx += 1

    # ## Convert similarities into a single subject image (like a time-course) ################
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Tabular file.')
    parser.add_argument('-o',
                        '--outfile',
                        required=True,
                        help='Path to the output file.')
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.")
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance."
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.")
    parser.add_argument("--fisher",
                        action="store_true",
                        default=False,
                        help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias"
    )
    parser.add_argument("--inclusive1",
                        action="store_true",
                        default=False,
                        help="if false,lower_limit will be ignored")
    parser.add_argument("--inclusive2",
                        action="store_true",
                        default=False,
                        help="if false,higher_limit will be ignored")
    parser.add_argument("--inclusive",
                        action="store_true",
                        default=False,
                        help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If set, raise a warning reporting how many extra points there are."
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs."
    )
    parser.add_argument("--correction",
                        action="store_true",
                        default=False,
                        help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)"
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "The number of trials. This is ignored if x gives both the number of successes and failures."
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument("--score",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5"
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of the data set to trim from each end."
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic"
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value. If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument."
    )
    parser.add_argument("--base",
                        type=float,
                        default=1.6,
                        help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

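    # Example invocation (hypothetical file names and column indices; which of
    # the extra flags matter depends on the chosen --test_id):
    #   python this_tool.py -i input.tsv -o output.tsv \
    #       --sample_one_cols 2,3,4 --sample_two_cols 5,6,7 \
    #       --test_id ttest_ind --equal_var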
    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, 'w+')
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        bartlett_samples = []
        for sample in args.sample_cols.split(';'):
            bartlett_samples.append(list(map(int, sample.split(','))))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(',')
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(',')
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split('\t')
        if sample0 == 1:
            b_samples = columns_to_values(bartlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(get_value(cols, index))
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(get_value(cols, index))
        if test_id.strip() == 'describe':
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == 'mode':
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == 'nanmean':
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == 'nanmedian':
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == 'kurtosistest':
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == 'variation':
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == 'itemfreq':
            freq = stats.itemfreq(map(float, sample_one))
            for item in freq:
                elements = ','.join(map(str, item))
                cols.append(elements)
        elif test_id.strip() == 'boxcox_llf':
            llf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(llf)
        elif test_id.strip() == 'tiecorrect':
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == 'rankdata':
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == 'nanstd':
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == 'anderson':
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for crit in critical:
                cols.append(crit)
            cols.append(',')
            for level in sig:
                cols.append(level)
        elif test_id.strip() == 'binom_test':
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == 'gmean':
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == 'hmean':
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == 'kurtosis':
            k = stats.kurtosis(map(float, sample_one),
                               axis=args.axis,
                               fisher=args.fisher,
                               bias=args.bias)
            cols.append(k)
        elif test_id.strip() == 'moment':
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == 'normaltest':
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == 'skew':
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == 'skewtest':
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == 'sem':
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == 'zscore':
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for zval in z:
                cols.append(zval)
        elif test_id.strip() == 'signaltonoise':
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == 'percentileofscore':
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == 'bayes_mvs':
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == 'sigmaclip':
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == 'kstest':
            d, p_value = stats.kstest(map(float, sample_one),
                                      cdf=args.cdf,
                                      N=args.N,
                                      alternative=args.alternative,
                                      mode=args.mode)
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == 'chi2_contingency':
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == 'tmean':
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == 'tmin':
            if mf == 0:
                tmin_val = stats.tmin(map(float, sample_one))
            else:
                tmin_val = stats.tmin(map(float, sample_one),
                                      lowerlimit=mf,
                                      inclusive=args.inclusive)
            cols.append(tmin_val)
        elif test_id.strip() == 'tmax':
            if nf == 0:
                tmax_val = stats.tmax(map(float, sample_one))
            else:
                tmax_val = stats.tmax(map(float, sample_one),
                                      upperlimit=nf,
                                      inclusive=args.inclusive)
            cols.append(tmax_val)
        elif test_id.strip() == 'tvar':
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == 'tstd':
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == 'tsem':
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == 'scoreatpercentile':
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation)
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two), (mf, nf),
                    interpolation_method=args.interpolation)
            for sc in s:
                cols.append(sc)
        elif test_id.strip() == 'relfreq':
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for r_val in rel:
                cols.append(r_val)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'binned_statistic':
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b)
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf))
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == 'threshold':
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for item in o:
                cols.append(item)
        elif test_id.strip() == 'trimboth':
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for item in o:
                cols.append(item)
        elif test_id.strip() == 'trim1':
            t1 = stats.trim1(map(float, sample_one),
                             proportiontocut=args.proportiontocut,
                             tail=args.tail)
            for item in t1:
                cols.append(item)
        elif test_id.strip() == 'histogram':
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'cumfreq':
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'boxcox_normmax':
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == 'boxcox':
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == 'histogram2':
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for item in h2:
                cols.append(item)
        elif test_id.strip() == 'ranksums':
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == 'ttest_1samp':
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for t_val in t:
                cols.append(t_val)
            for prob_val in prob:
                cols.append(prob_val)
        elif test_id.strip() == 'ansari':
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == 'linregress':
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == 'pearsonr':
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == 'pointbiserialr':
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == 'ks_2samp':
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == 'mannwhitneyu':
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == 'zmap':
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for zval in z:
                cols.append(zval)
        elif test_id.strip() == 'ttest_ind':
            t_stat, p_value = stats.ttest_ind(map(float, sample_one),
                                              map(float, sample_two),
                                              equal_var=args.equal_var)
            cols.append(t_stat)
            cols.append(p_value)
        elif test_id.strip() == 'ttest_rel':
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == 'mood':
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == 'shapiro':
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for a_val in a:
                cols.append(a_val)
        elif test_id.strip() == 'kendalltau':
            k, p_value = stats.kendalltau(map(float, sample_one),
                                          map(float, sample_two),
                                          initial_lexsort=args.initial_lexsort)
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == 'entropy':
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == 'spearmanr':
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == 'wilcoxon':
            if sample2 == 1:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            map(float, sample_two),
                                            zero_method=args.zero_method,
                                            correction=args.correction)
            else:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            zero_method=args.zero_method,
                                            correction=args.correction)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == 'chisquare':
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == 'power_divergence':
            if sample2 == 1:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       map(float, sample_two),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == 'theilslopes':
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == 'combine_pvalues':
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med,
                                                      weights=map(
                                                          float, sample_two))
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == 'obrientransform':
            ob = stats.obrientransform(*b_samples)
            for ob_arr in ob:
                elements = ','.join(map(str, ob_arr))
                cols.append(elements)
        elif test_id.strip() == 'f_oneway':
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == 'kruskal':
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == 'friedmanchisquare':
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == 'fligner':
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == 'bartlett':
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == 'levene':
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == 'median_test':
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ','.join(map(str, row))
                cols.append(elements)
        outfile.write('%s\n' % '\t'.join(map(str, cols)))
    outfile.close()
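The tool above calls two helpers, get_value and columns_to_values, that are not part of the snippet. A minimal sketch of plausible implementations, assuming 1-based column indices on tab-separated rows (the real tool may differ):

# Hypothetical reconstructions of the missing helpers; 1-based indices assumed.
def get_value(cols, index):
    # Return the raw cell at a 1-based column index of an already-split row.
    return cols[int(index) - 1]


def columns_to_values(sample_col_groups, line):
    # Convert each group of 1-based column indices into a list of floats.
    cells = line.strip().split('\t')
    return [[float(cells[i - 1]) for i in group] for group in sample_col_groups]
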
Beispiel #38
0
clmtempf = ma.filled(clmtempf, fill_value=0.)

clmtropfa = ma.masked_where(maitoatrop <= 0, clmtropfa)
clmtempfa = ma.masked_where(maitoatemp <= 0, clmtempfa)
clmtropfa = ma.filled(clmtropfa, fill_value=0.)
clmtempfa = ma.filled(clmtempfa, fill_value=0.)

clmhis = clmtropf + clmtempf
clmfuture = clmtropfa + clmtempfa
clmhis = ma.masked_where(clmhis[:, :] <= 0, clmhis)
clmfuture = ma.masked_where(clmfuture[:, :] <= 0, clmfuture)

clmhist = clmtrop + clmtemp
clmfutt = clmtropa + clmtempa

tc, pTc = ttest_ind(clmhist, clmfutt, axis=0, equal_var=False)

tc = N.flipud(tc)
pTc = N.flipud(pTc)

yieldclm = clmfuture - clmhis
yieldclm = ma.masked_where(yieldclm == 0., yieldclm)

yieldclm1 = ma.masked_where(pTc[:, :] > 0.1, yieldclm)

yieldf = N.zeros((10, 360, 720))
yieldf2 = N.zeros((10, 360, 720))
yieldfa = N.zeros((10, 360, 720))
yieldf2a = N.zeros((10, 360, 720))
yieldfb = N.zeros((10, 360, 720))
yieldf2b = N.zeros((10, 360, 720))
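The masking logic above hinges on ttest_ind's axis argument: with axis=0 it tests every grid cell across the year dimension in one call. A self-contained sketch with toy data standing in for clmhist/clmfutt:

import numpy as np
from scipy.stats import ttest_ind

# Toy stand-ins: ten years of a 360x720 gridded field each.
rng = np.random.default_rng(0)
hist = rng.normal(0.0, 1.0, size=(10, 360, 720))
futr = rng.normal(0.2, 1.0, size=(10, 360, 720))

# axis=0 yields a 360x720 map of t and p values; equal_var=False is Welch's test.
t_map, p_map = ttest_ind(hist, futr, axis=0, equal_var=False)

# Mask grid cells whose change is not significant at the 10% level, as above.
change = np.ma.masked_where(p_map > 0.1, futr.mean(axis=0) - hist.mean(axis=0))
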
Beispiel #39
0
# Plotting by genotype and sex
gensex_fig = plt.figure(2)
mouse_mask_table[['MaskVolume', 'Genotype',
                  'Sex']].boxplot(by=['Genotype', 'Sex'])
plt.ylabel('$mm^3$')
plt.xlabel('')  # clear the boxplot's automatic x-label
plt.savefig(os.path.join(analysis_path, 'Boxplot_MaskVolumes_ByGenotypeSex'))
# plt.show()

# pval calculation, equal_var for now ;)
wt_volumes = mouse_mask_table[mouse_mask_table['Genotype'] == 'WT']['MaskVolume']
ko_volumes = mouse_mask_table[mouse_mask_table['Genotype'] == 'KO']['MaskVolume']

print(ttest_ind(wt_volumes, ko_volumes, equal_var=True))

mouse_mask_table.to_csv(
    os.path.join(analysis_path, 'Mouse_maskvolume_table.csv'))


## Function to compute volumes for image
def image2volumetable(image_path, voxel_volume):
    # Compute voxel numbers and volumes and output to table
    mouse_mask_image = nib.load(image_path)
    mouse_mask_image_array = mouse_mask_image.get_fdata()
    [mouse_volume_integer, mouse_voxel_number
     ] = np.unique(np.int64(np.round(mouse_mask_image_array)),
                   return_counts=True)
    mouse_volume = mouse_voxel_number * voxel_volume
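The "equal_var for now ;)" comment earlier in this example invites the Welch variant, which drops the equal-variance assumption and is usually the safer default when group sizes or spreads differ. A one-line sketch reusing the series defined above:

# Welch's variant of the WT/KO comparison: no equal-variance assumption.
print(ttest_ind(wt_volumes, ko_volumes, equal_var=False))
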
Beispiel #40
0
# descriptive statistics for rest of world sales
df["Other_Sales"].describe()


# In[6]:


# descriptive statistics for global sales
df["Global_Sales"].describe()


# In[11]:


# T-test for North American sales
stats.ttest_ind(df["Global_Sales"], df["NA_Sales"])


# In[12]:


# T-test for European sales
stats.ttest_ind(df["Global_Sales"], df["EU_Sales"])


# In[13]:


# T-test for Japanese sales
stats.ttest_ind(df["Global_Sales"], df["JP_Sales"])
                    del current
            
            currentInds = currentInds.reset_index(drop=True)
            
            #test if dataframe is empty to continue to next drug
            if currentInds.empty:
                continue
            else:
                #separate the concentrations
                conc = np.unique(currentInds['concentration'])
                for dose in conc:
                    test = []
                    to_test = currentInds['concentration'] == dose
                    testing = currentInds[to_test]
                    for feature in currentInds.columns[0:-3]:
                        test.append(stats.ttest_ind(testing[feature], controlMeans[rep][feature]))
       
                    ps = [t[1] for t in test]  # keep just the p-values
                    ps.append(drug)
                    ps.append(dose)
        
                    temp = pd.DataFrame(ps).transpose()
                    pVals[rep] = pVals[rep].append(temp)
                    del temp, to_test, testing
            del currentInds

    #add in features
    pVals[rep].columns = feats
    pVals[rep] = pVals[rep].reset_index(drop=True)

#import module for multiple comparison correction
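The comment above is left dangling; a minimal sketch of the intended correction using statsmodels (Benjamini-Hochberg FDR), assuming the last two columns of pVals[rep] are the appended drug and dose:

from statsmodels.stats.multitest import multipletests

# Correct one replicate's feature p-values for multiple comparisons (5% FDR).
raw_p = pVals[rep].iloc[:, :-2].values.ravel().astype(float)
reject, p_adjusted, _, _ = multipletests(raw_p, alpha=0.05, method='fdr_bh')
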

def q5():
    # Return the answer to question 5 here.
    return False


# In[85]:


atletas = ["BRA", "USA", "CAN"]
amostra = athletes[athletes["nationality"].isin(atletas)]
brasileiros = athletes[athletes["nationality"].isin(["BRA"])]["height"]
americanos = athletes[athletes["nationality"].isin(["USA"])]["height"]

sct.ttest_ind(americanos.dropna(),brasileiros.dropna(), equal_var=False)


# ## Question 6
# 
# Repeat the procedure from question 5, but now between the heights of `bra` and `can`. Can we now claim that the means are statistically equal? Answer with a boolean (`True` or `False`).

# In[11]:


def q6():
    # Return the answer to question 6 here.
    return True


# In[88]:
def compara_media(str_1, str_2):
    v1 = athletes[athletes["nationality"].isin([str(str_1)])]["height"]
    v2 = athletes[athletes["nationality"].isin([str(str_2)])]["height"]

    return sct.ttest_ind(v1.dropna(), v2.dropna(), equal_var=False)
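Hypothetical usage for question 6: run Welch's test between the BRA and CAN heights and compare the p-value to the 5% significance level.

# A p-value above 0.05 means we cannot reject equality of the means,
# which is why q6 returns True.
stat, p = compara_media("BRA", "CAN")
print(bool(p > 0.05))
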
Beispiel #44
0
# print("df2.income.sum() : ", df2.income.sum())
#
# print("+++++++++중앙값 구하기++++++++++")
# print("df2.income.median() : ", df2.income.median())
#
# print("+++++++++기초통계량 요약해서 출력하기++++++++++")
# print("df2.describe() : ", df2.describe())
# print("df2.income.describe() : ", df2.income.describe())
#
# print("df2.sex.value_counts() : ", df2.sex.value_counts())
# print("df2.groupby(df2.sex).mean()", df2.groupby(df2.sex).mean())

male = df2.income[df2.sex == 'm']
female = df2.income[df2.sex == 'f']

ttest_result = stats.ttest_ind(male, female)
print(ttest_result)

print("ttest_result[0]", ttest_result[0])
print("ttest_result[1]", ttest_result[1])

if ttest_result[1] > 0.05:
    print(f'p-value is {ttest_result[1]}: not significant at the 95% level')
else:
    print(f'p-value is {ttest_result[1]}: significant at the 95% level')

corr = df2.corr(method='spearman')
print("corr:", corr)

income_stress_corr = df2.income.corr(df2.stress)
print("income_stress_corr:", income_stress_corr)
Beispiel #45
0
def return_stats(
    stock='jpm',
    commission=2,
    money=100000,
    #inc=10,- can read this argument and change code below if doing absolute share-based
    #original_shares=100, - can read this argument and change code below if doing absolute share-based
    policies=[hold, random_action, rule_based, ols, qlearner]):
    '''
    Enacts every strategy and provides summary statistics and graphs

    Inputs
    stock: ticker symbol of the stock to trade
    money: original cash held
    inc: increment of buy/sell permitted
    original_shares: original number of shares held

    Output
    None

    Provides numerous summary statistics and visualizations
    '''

    original_money = money

    # generate stock table
    stock_table = read_stock(stock, start, end)

    # note stock name
    stock_name = stock.upper()

    # approximate 50/50 split in money-stock
    original_shares = round(money / 2 / stock_table.values[0])

    # recalculate money accordingly
    money -= (stock_table.values[0] * original_shares)

    # make share increment about 1% of original share holdings
    inc = m.ceil(original_shares / 100)

    # generate results
    results = {
        policy.__name__: policy(stock_table,
                                money=money,
                                inc=inc,
                                original_shares=original_shares,
                                commission=commission)
        for policy in policies
    }

    # plot qtables only for qlearner (or any other strategies with Q table)
    for policy in policies:
        if results[policy.__name__][
                'qtable'] is not None:  #don't try to plot Q tables for benchmark strategies

            # get state history and quantile length and qtable for normalization and averaging function
            state_history = results[policy.__name__]['state_history']
            quantile_length = len(results[policy.__name__]['BB_quantiles'])
            qtab = results[policy.__name__]['qtable']

            qtab_bb = weighted_average_and_normalize(qtab, state_history, 0,
                                                     quantile_length)
            # Reverse row order for visualization so the biggest value sits on top.
            qtab_bb = qtab_bb.iloc[::-1]
            # Index by the BB quantiles, reversed in kind so the biggest value comes first.
            qtab_bb.index = np.round(
                np.flip(np.array(results[policy.__name__]['BB_quantiles'])), 5)

            # plot BB heatmap
            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_bb, cmap='Blues')
            plt.title('Bollinger Band % Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_bb.index))],
                             xmin=0,
                             xmax=10,
                             linewidth=10,
                             color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

            # marginalize over SMA
            # TODO - determine if this mean was taken correctly
            qtab_sma = weighted_average_and_normalize(qtab, state_history, 1,
                                                      quantile_length)
            qtab_sma = qtab_sma.iloc[::-1]
            qtab_sma.index = np.round(
                np.flip(np.array(results[policy.__name__]['SMA_quantiles'])),
                5)

            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_sma, cmap='Blues')
            plt.title('SMA Percentage Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_sma.index))],
                             xmin=0,
                             xmax=10,
                             linewidth=10,
                             color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

            # marginalize over MRDR
            # TODO - determine if this mean was taken correctly
            qtab_mrdr = weighted_average_and_normalize(qtab, state_history, 2,
                                                       quantile_length)
            qtab_mrdr = qtab_mrdr.iloc[::-1]
            qtab_mrdr.index = np.round(
                np.flip(np.array(results[policy.__name__]['MRDR_quantiles'])),
                5)

            plt.figure(figsize=(9, 7))
            fig = heatmap(qtab_mrdr, cmap='Blues')
            plt.title('Market Relative Daily Return Q-Table', size=16)
            plt.gca().hlines([i + 1 for i in range(len(qtab_mrdr.index))],
                             xmin=0,
                             xmax=10,
                             linewidth=10,
                             color='white')
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=14, rotation=0)
            plt.gca().tick_params(axis='x', bottom=False, left=False)
            plt.gca().tick_params(axis='y', bottom=False, left=False)
            plt.show(fig)

    # get markov transition models
    for policy in policies:
        plt.figure(figsize=(6, 3))
        plt.title('Transition Matrix For ' + policy.__name__, size=16)
        mkv = results[policy.__name__]['markov']
        fig = heatmap(mkv,
                      annot=True,
                      annot_kws={'size': 14},
                      cmap='Greens',
                      cbar=False)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14, rotation=0)
        plt.gca().set(xlabel='Current Trading Day', ylabel='Last Trading Day')
        plt.gca().tick_params(axis='x', bottom=False, left=False)
        plt.gca().tick_params(axis='y', bottom=False, left=False)
        plt.gca().hlines([1, 2], xmin=0, xmax=10, linewidth=10, color='white')
        plt.show(fig)

    # plot daily portfolio values
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['final_vals'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Portfolio Value ($)", fontsize=20)
    plt.title("Daily Portfolio Values For Different Trading Strategies: " +
              stock.upper(),
              fontsize=25)
    plt.show()

    # plot daily cash values
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['cash'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Cash Held ($)", fontsize=20)
    plt.title("Daily Cash Held For Different Trading Strategies: " +
              stock.upper(),
              fontsize=25)
    plt.show()

    # plot daily shares
    plt.figure(figsize=(14, 8))
    for policy in policies:
        plt.plot(results[policy.__name__]['shares'], label=policy.__name__)
    plt.legend()
    plt.xlabel("Date", fontsize=20)
    plt.ylabel("Shares Held", fontsize=20)
    plt.title("Daily Share Holdings For Different Trading Strategies: " +
              stock_name,
              fontsize=25)
    plt.show()

    # plot daily portfolio values
    for i, policy in enumerate(policies):
        dic = results[policy.__name__]
        if dic['state_history'] is not None:
            print("States History for " + policy.__name__ + "is: ",
                  dic['state_history'])

        del dic['state_history']
        del dic['qtable']
        del dic['markov']
        try:
            del dic['BB_quantiles']
            del dic['SMA_quantiles']
            del dic['MRDR_quantiles']
        except:
            pass
        df = pd.DataFrame(dic)

        plt.figure(figsize=(14, 8))
        plt.plot([], label="BUY", color="orange", marker='o')
        plt.plot([], label="SELL", color="black", marker='o')
        plt.plot([], label="HOLD", color="red", marker='o')
        buy_df = df[df.actions == 'BUY']
        sell_df = df[df.actions == 'SELL']
        hold_df = df[df.actions == 'HOLD']
        plt.plot(results[policy.__name__]['final_vals'], label=policy.__name__)
        plt.scatter(buy_df.index,
                    buy_df['final_vals'],
                    color='orange',
                    marker='^',
                    s=10)
        plt.scatter(sell_df.index,
                    sell_df['final_vals'],
                    color='black',
                    marker='v',
                    s=10)
        plt.scatter(hold_df.index,
                    hold_df['final_vals'],
                    color='red',
                    marker='s',
                    s=10)
        plt.xlabel("Date", fontsize=20)
        plt.ylabel("Portfolio Value ($)", fontsize=20)
        plt.title("Daily Portfolio Values For Trading Strategies of " +
                  policy.__name__ + " for stock : " + stock.upper(),
                  fontsize=25)
        plt.legend()
        plt.show()

    # display percentages
    #TODO: display(res) has no display() function. Fix bug.
    for policy in policies:
        print('For ' + stock_name + ',', policy.__name__,
              'action proportions were:')
        res = results[policy.__name__]['actions'].value_counts()
        res = res / res.sum()
        print(res)
        print('\n')
        print('For ' + stock_name + ',', policy.__name__,
              'average return based on action was:')
        res = returns(results[policy.__name__]['final_vals']).groupby(
            results[policy.__name__]['actions']).mean()
        print(res)
        print('\n')

    # calculate final returns
    for policy in policies:
        print('Final porfolio value under', policy.__name__,
              'strategy for ' + stock_name + ':',
              round(results[policy.__name__]['final_vals'].values[-1], 0))
    print('\n')

    # calculate final percentage of money invested in stock
    for policy in policies:
        print(
            'Final percentage of money invested in stock under',
            policy.__name__, 'strategy for ' + stock_name + ':',
            str(
                round(
                    100 *
                    (1 - (results[policy.__name__]['cash'].values[-1] /
                          results[policy.__name__]['final_vals'].values[-1])),
                    1)) + '%')
    print('\n')

    # calculate returns
    rets = {
        policy: returns(results[policy.__name__]['final_vals'])
        for policy in policies
    }

    # generate risk_free return for sharpe ratio - five-year treasury yield
    rfs = returns(read_stock('^FVX'))

    # find common indices between stock tables and treasury yields
    rfn = set(stock_table.index).intersection(set(rfs.index))

    # now reindex
    rfr = rfs.loc[rfn]
    rfi = rfr.index

    # generate baseline return for information ratio - s&p 500
    bls = returns(read_stock('^GSPC')).values

    # print summary stats for daily returns
    for policy in policies:
        nm = policy.__name__

        # mean daily return
        print('Mean daily return under', nm, 'for', stock_name + ':',
              str(round(np.mean(rets[policy], axis=0), 5)))

        # standard deviation of daily return
        print('Standard deviation of daily return under', nm, 'for',
              stock_name + ':', round(np.std(rets[policy], axis=0), 3))

        # information ratio of daily return
        checkhist(rets[policy].values, bls)
        pr = np.mean(rets[policy].values)
        br = np.mean(bls)
        te = np.std(rets[policy].values - bls)
        ir = round((pr - br) / (te) * np.sqrt(len(bls)), 2)
        print('Information Ratio against S&P 500 under', nm, 'strategy for',
              stock_name + ':', ir)

        # sharpe ratio of daily return
        dat = rets[policy].loc[
            rfi].values  # need to correct dates to line up with risk free return
        checkhist(dat, rfr)
        rp = np.mean(dat)
        br = np.mean(rfr)
        sd = np.std(rfr - dat)
        sr = round((rp - br) / (sd) * np.sqrt(len(rfr)), 2)
        print('Sharpe Ratio against five-year treasury yield under', nm,
              'strategy for', stock_name + ':', sr)
        print(
            'Note: only used dates when five-year treasury yields were available in calculating RFR for Sharpe Ratio'
        )
        print('\n')

    for policy1 in policies:
        p1 = rets[policy1].loc[
            rfi].values  # filter to dates with five-year treasury yields available
        n1 = policy1.__name__

        # independent samples t-test vs. risk-free return
        checkhist(p1, rfr)
        t = ttest_ind(p1, rfr, equal_var=True)
        gr = t[0] > 0
        n2 = 'rfr'
        p = round(t[1], 3) / 2  # make one-sided
        if gr:
            print('T-test for difference of mean returns in', n1, 'and', n2,
                  'finds', n1, '>', n2, 'with p-value', round(p, 3))
        else:
            print('T-test for difference of mean returns in', n2, 'and', n1,
                  'finds', n2, '>', n1, 'with p-value', round(p, 3))

        # levene test vs. baseline S&P 500 return
        l = levene(rets[policy1].values, bls)
        p = round(l[1], 3)
        gr = np.std(rets[policy1].values) > np.std(bls)
        n2 = 'bls'
        if gr:
            print('Levene test for difference of variances (volatility) in',
                  n1, 'and', n2, 'finds p-value of', round(p, 3), 'with', n1,
                  'showing more volatility')
        else:
            print('Levene test for difference of variances (volatility) in',
                  n1, 'and', n2, 'finds p-value of', round(p, 3), 'with', n2,
                  'showing more volatility')
        print('\n')

        for policy2 in policies:
            if policy1 != policy2:  #and hash(policy1) <= hash(policy2) - not necessary
                p1 = rets[
                    policy1].values  # no longer need to filter to dates with five-year treasury yields available
                p2 = rets[policy2].values
                checkhist(p1, p2)
                n2 = policy2.__name__

                # independent samples t-test
                t = ttest_ind(p1, p2, equal_var=True)
                gr = t[0] > 0
                p = round(t[1], 3) / 2  # make one-sided
                if gr:
                    print('T-test for difference of mean returns in', n1,
                          'and', n2, 'finds', n1, '>', n2, 'with p-value',
                          round(p, 3))
                else:
                    print('T-test for difference of mean returns in', n2,
                          'and', n1, 'finds', n2, '>', n1, 'with p-value',
                          round(p, 3))

                # levene test
                l = levene(p1, p2)
                p = round(l[1], 5)
                gr = np.std(p1) > np.std(p2)
                if gr:
                    print(
                        'Levene test for difference of variances (volatility) in',
                        n1, 'and', n2, 'finds p-value of', round(p, 3), 'with',
                        n1, 'showing more volatility')
                else:
                    print(
                        'Levene test for difference of variances (volatility) in',
                        n1, 'and', n2, 'finds p-value of', round(p, 3), 'with',
                        n2, 'showing more volatility')
                print('\n')
            print('\n')

    # TODO: add any additional desired visualizations
    plt.show()
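This example halves the two-sided p-value to get a one-sided test; that shortcut is only valid when the t statistic already points in the hypothesized direction. A toy sketch (random data, not the document's returns) making the fold explicit:

import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(1)
a = rng.normal(0.05, 1.0, 250)  # stand-in for one strategy's daily returns
b = rng.normal(0.00, 1.0, 250)  # stand-in for the benchmark returns

t, p_two = ttest_ind(a, b, equal_var=True)
# One-sided H1: mean(a) > mean(b). Halve only when t > 0; otherwise fold the tail.
p_one = p_two / 2 if t > 0 else 1 - p_two / 2
print(t, p_two, p_one)
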
Beispiel #46
0
 sdf = pd.DataFrame()
 best_algo = "BL"
 for e in error_metrics:
     X = df.loc[df['algorithm'] == best_algo, e].to_numpy()
     sdf = sdf.append(
         {
             'algorithm': best_algo,
             'Error': np.mean(X),
             'metric': e
         },
         ignore_index=True)
     for algo in df['algorithm'].unique():
         if algo == best_algo:
             continue
         Y = df.loc[df['algorithm'] == algo, e].to_numpy()
         p_value = stats.ttest_ind(X, Y).pvalue
         # print(p_value)
         if p_value <= 0.05:
             sdf = sdf.append(
                 {
                     'algorithm': algo,
                     'Error': np.mean(Y),
                     'metric': e
                 },
                 ignore_index=True)
         else:
             sdf = sdf.append(
                 {
                     'algorithm': algo,
                     'Error': np.mean(X),
                     'metric': e
                 },
                 ignore_index=True)
Beispiel #47
0
def main(args):

    # Loading data trought Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqueID,
                       group=args.group,
                       runOrder=args.order,
                       logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. In this case there can be as many groups as possible.
    # Order variable is ignored and t-tests are performed pairwise for each pair of groups.

    if args.pairing == "unpaired":
        logger.info(
            "Unpaired t-test will be performed for all groups pairwise.")
        # Getting the unique group values and all their pairwise combinations
        # so that we can feed them to pairwise unpaired t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. This depends on whether the user has provided an ordering variable.
        # That variable is useless for the unpaired test; it just adds an extra column to the data frame.
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2
        # Saving treatment group name from the arguments.
        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # It has nothing to do with the unpaired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            # We should drop either 1 or 2 columns, depending on whether the second one was fed in.
            if args.order == False:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(
                    args.group, 1).transpose()
            else:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(
                    [args.group, args.order], 1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # This part just produces summary statistics for the output table.
        # This has nothing to do with unpaired t-test. This is just summary for the table.

        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            # We should drop either 1 or 2 columns, depending on whether the second one was fed in.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(
                    args.group, 1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop(
                    [args.group, args.order], 1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating array of means for the current group that will be filled.
            means_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding current mean_value column to the data frame and assigning the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
                i]
            summary_df[means_value_column_name_current] = means_value

        # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):
            # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[data_frame[
                args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[data_frame[
                args.group].isin([groups_subset[1]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            # We should drop either 1 or 2 columns, depending on whether the second one was fed in.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(
                    args.group, 1).transpose()
                data_frame_second_group = data_frame_second_group.drop(
                    args.group, 1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop(
                    [args.group, args.order], 1).transpose()
                data_frame_second_group = data_frame_second_group.drop(
                    [args.group, args.order], 1).transpose()

            # Pulling indexes list from the first one (they are the same)
            indexes_list = data_frame_first_group.index.tolist()

            # Creating p_value, t_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features
            difference_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]
                ttest_ind_args = [series_first, series_second]
                # A single call returns (t, p), so the test is not run twice.
                t_value[j], p_value[j] = ttest_ind(*ttest_ind_args)
                # Equivalent direct call for two groups:
                # p_value[j] = ttest_ind(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean(
                )
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1
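                # NOTE (editor): scipy's ttest_ind defaults to the pooled-variance
                # (Student) test. A Welch variant that drops the equal-variance
                # assumption would be ttest_ind(series_first, series_second,
                # equal_var=False); this script uses the default.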

            # Creating column names for the data frame.
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + groups_subset[
                0] + '_' + groups_subset[1]
            t_value_column_name_current = 't_value_for_diff_' + groups_subset[
                0] + '_' + groups_subset[1]
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[
                0] + '_' + groups_subset[1]
            difference_value_column_name_current = 'diff_of_' + groups_subset[
                0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[
                0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[
                0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[
                0] + '_' + groups_subset[1]

            # Adding the current p_value and flag_value columns to the data frames.
            # The flag data frame is created on the first pass (i == 0); afterwards columns are appended to it.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point data frame exists so only columns are added to the existing data frame.
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[
                neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO groups.
    # Each sample in one group should have exactly one matching pair in the other group.
    # The matching is controlled by args.order variable.

    if args.pairing == "paired":
        logger.info(
            "Paired test will be performed for two groups pairwise based on pairing variable: {0}."
            .format(args.order))

        # Getting the number of unique groups. If it is bigger than 2 return the warning and exit.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(
                u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed."
                .format(number_of_unique_groups))
            exit()

        # This piece of code will be executed only if the number_of_unique_groups is exactly 2 so the group check is passed.
        # Creating pairwise combination of our two groups that we will use in the future.
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting number of features. This will depend on whether the user has provided ordering variable or not.
        # Checking that the required pairing variable has been provided.
        if args.order == False:
            logger.info(
                "The required t-test pairing variable has not been provided: The paired t-test cannot be performed."
            )
            exit()

        # This piece of code will be executed only if args.order has been provided and the check is passed.
        # Defining the number of features: the data frame dimension minus the 2 columns for args.group and args.order.
        number_of_features = data_frame.shape[1] - 2

        # At this point it is confirmed that there are only 2 groups and that the pairing variable args.order has been provided.
        # Now we need to check that pairing is correct i.e. that each pairID corresponds to exactly two samples from different groups.
        # Getting the unique pairs and deleting those that do not have exactly two samples.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # Extracting the number of samples in the final frame.
        number_of_samples = data_frame.shape[0]

        # Performing the cleaning of the original data. We are removing samples that are not paired or do not belong to the two groups.
        # If the dataset has 1, or 3 or more, matches for a pairID, those samples are removed with a warning.
        # If a pairID corresponds to exactly two samples (which is correct) but the groupIDs are NOT different, those values are also removed.
        for i in range(0, number_of_unique_pairid):
            # Extracting the pieces of the data frame that belong to ith unique pairid.
            data_frame_current_pairid = data_frame.loc[data_frame[
                args.order].isin([pairid_values_series_unique[i]])]

            # We transpose here so it will be easier to operate with.
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(
                data_frame_current_pairid.columns.values)
            if data_frame_current_pairid.shape[1] != 2:
                logger.warning(
                    u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis."
                    .format(pairid_values_series_unique[i],
                            data_frame_current_pairid.shape[1],
                            sample_names_current_pairid))

                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(
                    sample_names_current_pairid)
                # Deleting the indexes and in the for loop going to next iteration.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete],
                                inplace=True)

            # This piece is executed if the number is correct i.e. data_frame_current_pairid.shape[1] == 2.
            # Here we are checking if the groupID-s for the given pair are indeed different.

            elif data_frame_current_pairid.transpose()[args.group][
                    0] == data_frame_current_pairid.transpose()[args.group][1]:
                logger.warning(
                    u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis."
                    .format(
                        pairid_values_series_unique[i],
                        data_frame_current_pairid.transpose()[args.group][1],
                        data_frame_current_pairid.transpose()[args.group][0],
                        sample_names_current_pairid))
                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(
                    sample_names_current_pairid)
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete],
                                inplace=True)

        # Checking if the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(
                u"Number of paired samples in the final dataset is exactly 0! Please check the desing file for accuracy! Exiting the program."
            )
            exit()

        # Computing overall summaries (mean and variance).
        # These are purely summary statistics for the output table and are not
        # part of the paired t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        # This is done once, outside the loop, since the result is identical on every iteration.
        data_frame_manipulate_transpose = data_frame.drop(
            [args.group, args.order], 1).transpose()
        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        for j in range(0, number_of_features):
            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # These are purely summary statistics for the output table and are not
        # part of the paired t-test itself.

        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            data_frame_current_group = data_frame_current_group.drop(
                [args.group, args.order], 1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating array of means for the current group that will be filled.
            means_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding current mean_value column to the data frame and assigning the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
                i]
            summary_df[means_value_column_name_current] = means_value

        # Performing paired t-test for the two groups and saving results.
        # Creating p_values and flag_values empty list of length number_of_features.
        # This will be used for the two groups in paired t-test.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features

        # Extracting the pieces of the data frame that belong to the 1st and 2nd group.
        # This is done once, outside the feature loop, since the result is identical on every iteration.
        data_frame_first_group = data_frame.loc[data_frame[
            args.group].isin([group_values_series_unique[0]])]
        data_frame_second_group = data_frame.loc[data_frame[
            args.group].isin([group_values_series_unique[1]])]

        # Sorting both data frames by the pairing variable args.order.
        # This ensures the datasets are aligned by pair when fed to the t-test.
        # (DataFrame.sort() has been removed from pandas; sort_values() is the current API.)
        data_frame_first_group = data_frame_first_group.sort_values(args.order)
        data_frame_second_group = data_frame_second_group.sort_values(args.order)

        # Dropping the group and pairing columns. Only feature columns remain.
        data_frame_first_group = data_frame_first_group.drop(
            [args.group, args.order], 1).transpose()
        data_frame_second_group = data_frame_second_group.drop(
            [args.group, args.order], 1).transpose()

        # Pulling the list of indexes. It is the same for both groups.
        indexes_list = data_frame_first_group.index.tolist()

        # Performing the paired t-test for each feature.
        for j in range(0, number_of_features):
            # Pulling the paired samples out.
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Running the paired t-test for the two given samples.
            # A single call returns (t, p), so the test is not run twice.
            paired_ttest_args = [series_first, series_second]
            t_value[j], p_value[j] = ttest_rel(*paired_ttest_args)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1
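            # NOTE (editor): ttest_rel assumes the two series are aligned
            # element-wise by pair; the sort_values on args.order above is what
            # guarantees that alignment.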

        # The loop over features has to be finished by now. Converting them into the data frame.
        # Creating column names for the data frame.
        p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1]
        t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1]
        difference_value_column_name_current = 'diff_of_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1]
        flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1] + '_0p01'
        flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1] + '_0p05'
        flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[
            0] + '_' + group_values_series_unique[1] + '_0p10'

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal places.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding the name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The significance cutoff on the -log10(p) scale; 2 corresponds to p = 0.01.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
Beispiel #48
0
    sex = list(merged_df.gender.values)
    site = list(merged_df.site.values)
    age = list(merged_df.age.values)

    train_index, test_index = None, None

    # flag_selection starts True and is flipped to False below once a
    # sufficiently balanced split is found.
    flag_selection = True

    while flag_selection:

        splits = StratifiedShuffleSplit(n_splits=1, test_size=args.test_size)

        for train_index, test_index in splits.split(np.zeros(len(site)), site):

            age_test = [float(age[idx]) for idx in test_index]
            age_train = [float(age[idx]) for idx in train_index]

            sex_test = [sex_dict[sex[idx]] for idx in test_index]
            sex_train = [sex_dict[sex[idx]] for idx in train_index]

            t_age, p_age = ttest_ind(age_test, age_train)
            T_sex = chi2(sex_test, sex_train)

            print(p_age, T_sex)
            if p_age > args.p_val_threshold and T_sex < args.t_val_threshold:
                flag_selection = False

            test_df = merged_df.iloc[test_index]
            train_df = merged_df.iloc[train_index]

            train_df.to_csv(train_path, sep='\t', index=False)
            test_df.to_csv(test_path, sep='\t', index=False)
        print("%20s %20s %20s %20s %20s %20s %20s" % (
            begin_date, round(init_svi, 4), round(init_bs, 4),
            round(portfolio_net_svi / init_svi, 4), round(portfolio_net_bs / init_bs, 4),
            round(tradedamt_svi / holdamt_svi, 4), round(tradedamt_bs / holdamt_bs, 4)))
    print('=' * 200)
    print("%20s %20s %20s %20s %20s %20s %20s %20s" % (
        "eval date", "spot", "delta", 'price_svi', 'price_bs', 'portfolio_svi', 'portfolio_bs',
        'transaction'))
    print('svi_pnl', sum(svi_pnl) / len(svi_pnl))
    print('bs_pnl', sum(bs_pnl) / len(bs_pnl))
    results = {}
    results.update({'date': dates})
    results.update({'pnl svi': svi_pnl})
    results.update({'pnl bs': bs_pnl})
    results.update({'option init svi': option_init_svi})
    results.update({'option init bs': option_init_bs})
    results.update({'transaction svi': transaction_svi})
    results.update({'transaction bs': transaction_bs})
    results.update({'holdings svi': holdings_svi})
    results.update({'holdings bs': holdings_bs})

    df = pd.DataFrame(data=results)
    # print(df)
    df.to_csv(os.path.abspath('..') + '/results4/dh_MA_'+barrier_type+'_r='
              +str(rebalancerate) + '_b=' + str(barrier_pct) + 'f2.csv')

    t,p = stats.ttest_ind(svi_pnl,bs_pnl)
    t1,p1 = stats.wilcoxon(svi_pnl,bs_pnl)
    print(barrier_type, ' ',barrier_pct)
    print('t : ',t,p)
    print('wilcoxon : ',t1,p1)
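# Editor's note: ttest_ind above treats the two PnL series as independent,
# while wilcoxon is a paired (signed-rank) test. For day-by-day PnL of two
# strategies over the same dates, the paired t-test counterpart would be
# ttest_rel(svi_pnl, bs_pnl).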
Beispiel #50
0
def generate_tstats_classes(df, dest_dir, params):
    """Computes t-test for each class.

   This function computes a t-test for each class in the dataset.
   The t-test is computed by comparing class level metrics for
   a set of sparse model checkpoints to non-sparse model
   checkppints.

  Args:
    df: input dataframe with class level metrics.
    dest_dir: pathway to output directory.
    params: dataset specific params.
  """

    human_label_lookup = class_level_metrics.HumanLabelLookup()
    label_dict = human_label_lookup.create_library()
    class_names = list(label_dict.values())

    df = df.drop(columns='Unnamed: 0')  # drop() is not in-place; the result must be assigned
    df.reset_index(inplace=True, drop=True)
    df['id'] = df.index

    df_ = pd.wide_to_long(df,
                          stubnames=['precision', 'recall'],
                          i='id',
                          j='class',
                          sep='/',
                          suffix=r'\w+').reset_index()

    data = pd.DataFrame([])

    num_classes = params['num_classes']
    mean_accuracy_dict = params['accuracy']

    long_df_all = df_
    for i in range(num_classes):

        # adding label id ensures unique naming of classes
        c = class_names[i] + '_' + str(i)
        for p in [0.1, 0.3, 0.5, 0.7, 0.9]:

            variant_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == p) &
                (long_df_all['class'] == c))]['recall'].mean()

            baseline_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0) &
                (long_df_all['class'] == c))]['recall'].mean()

            # normalize recall by model accuracy
            baseline_set = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0) &
                (long_df_all['class']
                 == c))]['recall'] - mean_accuracy_dict[0.0]
            variant_set = long_df_all[(
                (long_df_all['fraction_pruned'] == p) &
                (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[p]

            t_stat = ttest_ind(baseline_set, variant_set, equal_var=False)

            # DataFrame.append() has been removed from pandas; pd.concat is the
            # current way to accumulate rows.
            data = pd.concat(
                [
                    data,
                    pd.DataFrame(
                        {
                            'class': c,
                            'pruning_fraction': p,
                            'baseline_mean_recall': baseline_mean_recall,
                            'variant_mean_recall': variant_mean_recall,
                            'pvalue_recall_norm': t_stat[1],
                            'statistic_recall_norm': t_stat[0],
                        },
                        index=[0]),
                ],
                ignore_index=True)

    time_ = str(time.time())
    output_file = 'recall_t_statistic'
    file_name = '_' + time_ + '_' + output_file + '.csv'
    file_path = os.path.join(dest_dir, file_name)
    with tf.gfile.Open(file_path, 'w') as f:
        data.to_csv(f)
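# Editor's note: the normalization above subtracts each model's overall
# accuracy from its per-class recall before the Welch test, so the comparison
# asks whether a class degrades more than the model as a whole, not whether
# raw recall differs between the sparse and non-sparse checkpoints.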
    def two_cal(self, x, norm_res, homo_res, skews, paired=False):
        '''Calculate and return two samples comparison tests results.
        
        Parameters:
        ----------
        x : list of numpy.ndarray
            Variables to test on
        norm_res : dict
            Normality test results
        homo_res : dict
            Homogeneity test rerults
        skews : list
            Skewness values of all x variables
        paired : bool
            False for two independent variables. True Otherwise
            
        Returns:
        -------
        res : dict
            'Statistic': statistic value calculated by the test
            'Pvalue': p-value calculated by the test
            'Test': name of the test used
            'Result': True if failed to reject null hypothesis, False otherwise
            
        Notes:
        -----
        None'''

        res = {}

        if sum(norm_res.values()) == len(x) and all(abs(np.array(skews)) < .5):
            # If all normal
            if paired:  # Paired samples
                res['Statistic'], res['Pvalue'] = ss.ttest_rel(x[0], x[1])
                res['Test'] = 'T-test with paired samples'
            else:  # Independent samples
                if homo_res:  # Variances equal
                    res['Statistic'], res['Pvalue'] = ss.ttest_ind(x[0], x[1])
                    res['Test'] = 'T-test with independent samples'
                else:  # Variances unequal
                    res['Statistic'], res['Pvalue'] = ss.ttest_ind(
                        x[0], x[1], equal_var=False)
                    res['Test'] = 'Welch\'s T-test'

        else:  # If not all normal, use nonparametric tests.
            if paired:  # Paired samples
                res['Statistic'], res['Pvalue'] = ss.wilcoxon(
                    x[0].reshape(-1), x[1].reshape(-1))
                res['Test'] = 'Wilcoxon signed-rank Test with Paired Samples'
            else:  # Independent samples
                # Both sample sizes must be >= 20 for the large-sample branch.
                if min(len(x[0]), len(x[1])) >= 20:
                    res['Statistic'], res['Pvalue'] = ss.mannwhitneyu(x[0], x[1])
                    res['Test'] = 'Mann-Whitney U Test with Independent Samples'
                else:  # Sample size < 20
                    res['Statistic'], res['Pvalue'] = ss.ranksums(x[0], x[1])
                    res['Test'] = 'Wilcoxon rank-sum Test with Independent Samples'

        # Get the result based on a fixed significance level (alpha = .05).
        res['Result'] = res['Pvalue'] >= .05

        return res
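# --- Editor's note: a condensed, hedged sketch of the routing logic two_cal
# implements for two independent samples (all-normal -> t-test or Welch,
# otherwise a rank-based test). The synthetic data and the shapiro-based
# normality check are illustrative assumptions.
import numpy as np
import scipy.stats as ss

rng = np.random.default_rng(1)
a, b = rng.normal(0, 1, 30), rng.normal(0.5, 2, 30)

normal = all(ss.shapiro(s)[1] > .05 for s in (a, b))
equal_var = ss.levene(a, b)[1] > .05

if normal:
    stat, p = ss.ttest_ind(a, b, equal_var=equal_var)
    test = 'T-test' if equal_var else "Welch's T-test"
else:
    stat, p = ss.mannwhitneyu(a, b)
    test = 'Mann-Whitney U Test'
print(test, stat, p, p >= .05)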
Beispiel #52
0
            mean_acc = cat2.loc[cat2[('platform', '')] == 'All',
                                ('{}{}'.format(part, var), 'mean')].values[0]
            std_rej = cat1.loc[cat1[('platform', '')] == 'All',
                               ('{}{}'.format(part, var), 'std')].values[0]
            std_acc = cat2.loc[cat2[('platform', '')] == 'All',
                               ('{}{}'.format(part, var), 'std')].values[0]
            count_rej = cat1.loc[cat1[('platform', '')] == 'All',
                                 ('{}{}'.format(part, var), 'count')].values[0]
            count_acc = cat2.loc[cat2[('platform', '')] == 'All',
                                 ('{}{}'.format(part, var), 'count')].values[0]

            grp1 = data.loc[data['accepted'] == 0,
                            '{}{}'.format(part, var)].dropna()
            grp2 = data.loc[data['accepted'] == 1,
                            '{}{}'.format(part, var)].dropna()
            ttest = stats.ttest_ind(grp1, grp2, equal_var=False)
            pooled_std = np.sqrt(
                ((count_rej - 1) * std_rej**2 +
                 (count_acc - 1) * std_acc**2) / (count_rej + count_acc))
            cohens_d = np.abs((mean_rej - mean_acc) / pooled_std)
            results.loc[len(results)] = [
                part, var, ttest[1], mean_acc - mean_rej, mean_rej, std_rej,
                count_rej, mean_acc, std_acc, count_acc, cohens_d, has_edu
            ]

            bar_i += 1

        fig, ax = plt.subplots(figsize=(width, height))
        nb_bars = len(ind)
        acc = np.arange(0, nb_bars, 2)
        rej = np.arange(1, nb_bars, 2)
Beispiel #53
0
early = df[df['assignment1_submission'] <= '2015-12-31']
late = df[df['assignment1_submission'] > '2015-12-31']
print(early)

# In[ ]:

early.mean()

# In[ ]:

late.mean()

# In[11]:

from scipy import stats

get_ipython().magic('pinfo stats.ttest_ind')

# In[14]:

stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])

# In[ ]:

stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])

# In[ ]:

stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])
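# Editor's note: ttest_ind returns a result object that unpacks to
# (statistic, pvalue), so the calls above can also be written as:
t_stat, p_val = stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])
print(t_stat, p_val)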
Beispiel #54
0
    if "_" in id:
        id = id.split("_")[1]
    id = id[4:]
    return id

correlationDict = {}
for i in correlationfh:
    intA,intB,corr = i.split()
    correlationDict[intA.lower()+"_"+intB.lower()]=float(corr)

inter_corr = []
count = 0
invalid = 0
for i in interologyfh:
    intA,intB = i.split()
    count += 1
    try:
        corr = correlationDict[intB.lower() + "_" + intA.lower()]
        inter_corr.append(corr)
    except KeyError:
        try:
            corr = correlationDict[intA.lower() + "_" + intB.lower()]
            inter_corr.append(corr)
        except KeyError:
            print(intA.lower(),intB.lower())
            invalid+=1
print(len(correlationDict.items()))
random_corr = random.sample(list(correlationDict.values()),1000)
# Averaging over the correlations actually found; pairs with missing keys are excluded.
print("mean correlation from interologs= {}\nmean correlation from random pairs= {}".format(sum(inter_corr)/len(inter_corr),sum(random_corr)/len(random_corr)))
print("p value between two correlations: {}".format(ttest_ind(inter_corr,random_corr).pvalue))
print("number of invalid keys: {} out of {} = {}".format(invalid,count,invalid/count))
def calculate_ttest(hyperpartisan_valid_predictions, joint_valid_predictions):
    _, p_value = ttest_ind(hyperpartisan_valid_predictions,
                           joint_valid_predictions)
    return p_value
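# Editor's note: a hypothetical usage sketch for the helper above; the two
# prediction arrays are invented for illustration.
# p = calculate_ttest([0.81, 0.78, 0.86], [0.64, 0.59, 0.71])
# print('p-value:', p)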
feature_rep_train, feature_rep_test = datasets[0], datasets[1]
fingerprint_rep_train, fingerprint_rep_test = datasets[2], datasets[3]
#image_rep_train, image_rep_test = datasets[4], datasets[5]

######################################################################################
# LogisticRegression with MolecularDescriptors
X = feature_rep_train.iloc[:, 2:].values
Y = feature_rep_train.iloc[:, 1].values

model_scores, null_scores = assess_model(X, Y, [datasets[0], datasets[1]])

print("Null Model: ", np.mean(null_scores), np.std(null_scores))

print("Logistc Model with Molecular Descriptors performance: ",
      np.mean(model_scores), np.std(model_scores))
t_stat, p_value = stats.ttest_ind(model_scores, null_scores)
print(p_value)

X_train, Y_train = X[:750, :], Y[:750]
X_test, Y_test = X[750:, :], Y[750:]  # hold out the remaining rows for testing
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

plot_learning_curve(LogisticRegression(),
                    "LogisticRegression_MolecularDescriptors",
                    X_train,
                    Y_train,
                    cv=10)

model = LogisticRegression()
Beispiel #57
0
def main():
    reddit_counts = sys.argv[1]

    station_fh = gzip.open(reddit_counts, 'rt', encoding='utf-8')
    stations = pd.read_json(station_fh, lines=True)

    stations = stations[stations.subreddit == "canada"]
    stations = stations[(stations['date'].dt.year == 2012) |
                        (stations['date'].dt.year == 2013)].reset_index()

    del stations["index"]
    stations['weekday'] = stations['date'].dt.dayofweek
    weekday = stations[(stations['weekday'] == 5) |
                       (stations['weekday'] == 6)].reset_index()
    weekend = stations[(stations['weekday'] != 5)
                       & (stations['weekday'] != 6)].reset_index()
    del weekday["index"]
    del weekend["index"]

    initial_ttest_p = ttest_ind(weekday['comment_count'],
                                weekend['comment_count'])
    initial_weekday_normality_p = stats.normaltest(weekday['comment_count'])
    initial_weekend_normality_p = stats.normaltest(weekend['comment_count'])
    initial_levene_p = stats.levene(weekday['comment_count'],
                                    weekend['comment_count'])

    # Fix1
    # sqrt
    weekday_sqrt = np.sqrt(weekday["comment_count"])
    weekend_sqrt = np.sqrt(weekend["comment_count"])
    # print(weekday_sqrt)

    transformed_weekday_normality_p = stats.normaltest(weekday_sqrt)
    transformed_weekend_normality_p = stats.normaltest(weekend_sqrt)
    transformed_levene_p = stats.levene(weekday_sqrt, weekend_sqrt)

    # Fix2
    # Logic copied from Prof. Greg Baker
    def week(dt):
        isocal = dt.isocalendar()
        return '%i-%i' % (isocal[0], isocal[1])

    weekday_number = weekday.date.apply(week)
    weekend_number = weekend.date.apply(week)

    weekday["number"] = weekday_number
    weekend["number"] = weekend_number

    grouped_weekday = weekday.groupby(['number'])
    weekly_weekday = grouped_weekday.aggregate('sum')
    grouped_weekend = weekend.groupby(['number'])
    weekly_weekend = grouped_weekend.aggregate('sum')

    weekly_weekday_normality_p = stats.normaltest(
        weekly_weekday['comment_count'])
    weekly_weekend_normality_p = stats.normaltest(
        weekly_weekend['comment_count'])
    weekly_levene_p = stats.levene(weekly_weekday['comment_count'],
                                   weekly_weekend['comment_count'])
    weekly_ttest_p = ttest_ind(weekly_weekday['comment_count'],
                               weekly_weekend['comment_count'])

    # Fix3
    utest_p = mannwhitneyu(weekday['comment_count'], weekend['comment_count'])

    print(
        OUTPUT_TEMPLATE.format(
            initial_ttest_p=initial_ttest_p.pvalue,
            initial_weekday_normality_p=initial_weekday_normality_p.pvalue,
            initial_weekend_normality_p=initial_weekend_normality_p.pvalue,
            initial_levene_p=initial_levene_p.pvalue,
            transformed_weekday_normality_p=(
                transformed_weekday_normality_p.pvalue),
            transformed_weekend_normality_p=(
                transformed_weekend_normality_p.pvalue),
            transformed_levene_p=transformed_levene_p.pvalue,
            weekly_weekday_normality_p=weekly_weekday_normality_p.pvalue,
            weekly_weekend_normality_p=weekly_weekend_normality_p.pvalue,
            weekly_levene_p=weekly_levene_p.pvalue,
            weekly_ttest_p=weekly_ttest_p.pvalue,
            utest_p=utest_p.pvalue,
        ))
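# --- Editor's note: "Fix2" above leans on the central limit theorem: summing
# daily counts within an ISO week yields more nearly normal data, so the
# t-test's assumptions are closer to holding. A minimal sketch of the same
# aggregate-then-test pattern on synthetic data:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

rng = np.random.default_rng(2)
daily = pd.DataFrame({
    'week': np.repeat(np.arange(20), 7),  # 20 weeks of 7 daily counts
    'count': rng.poisson(50, 140),
})
weekly = daily.groupby('week')['count'].sum()
print(ttest_ind(weekly.iloc[:10], weekly.iloc[10:]))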
Beispiel #58
0
def main():
    parser = argparse.ArgumentParser(description='read result from csv')
    parser.add_argument('file_name_0',
                        metavar='file_name_0',
                        type=str,
                        help='input file name')
    parser.add_argument('file_name_1',
                        metavar='file_name_1',
                        type=str,
                        help='input file name')
    parser.add_argument('file_name_2',
                        metavar='file_name_2',
                        type=str,
                        help='input file name')
    args = parser.parse_args()

    result0 = np.loadtxt(args.file_name_0, delimiter=',')
    result1 = np.loadtxt(args.file_name_1, delimiter=',')
    result2 = np.loadtxt(args.file_name_2, delimiter=',')
    plt.rcParams['font.family'] = 'Times New Roman'

    method_name1 = "S. L. P."
    method_name2 = "CADRL"

    fig = plt.figure()
    ax0 = fig.add_subplot(2, 2, 1)
    ax1 = fig.add_subplot(2, 2, 2)
    ax2 = fig.add_subplot(2, 2, 3)
    ax3 = fig.add_subplot(2, 2, 4)
    bp0 = ax0.boxplot((result0[:, 0], result1[:, 0], result2[:, 0]),
                      whis=(0, 100),  # whiskers span the full data range; the old string "range" is no longer accepted
                      showmeans=True,
                      meanline=True)
    ax0.set_xticklabels(['Our method', method_name1, method_name2])
    ax0.set_title('(a) Travel distance[m]', y=-0.35, fontsize=10)
    ax0.set_ylim(0, 50)
    ax0.set_yticks(np.arange(0, ax0.get_ylim()[1] + 10, 10))
    ax0.grid()
    print("travel distance")
    print("P. M. mean: " + str(result0[:, 0].mean()) + "[m]")
    print("P. M. mean: " + str(result0[result0[:, 2] == 0, 0].mean()) + "[m]")
    print("P. M. median: " + str(np.median(result0[:, 0])) + "[m]")
    print("P. M. stddev: " + str(result0[:, 0].std()))
    print(method_name1 + " mean: " + str(result1[:, 0].mean()) + "[m]")
    print(method_name1 + " mean: " +
          str(result1[result1[:, 2] == 0, 0].mean()) + "[m]")
    print(method_name1 + " median: " + str(np.median(result1[:, 0])) + "[m]")
    print(method_name1 + " stddev: " + str(result1[:, 0].std()))
    print(stats.ttest_ind(result0[:, 0], result1[:, 0], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 0],
                        result1[result1[:, 2] == 0, 0],
                        equal_var=False))
    print(method_name2 + " mean: " + str(result2[:, 0].mean()) + "[m]")
    print(method_name2 + " mean: " +
          str(result2[result2[:, 2] == 0, 0].mean()) + "[m]")
    print(method_name2 + " median: " + str(np.median(result2[:, 0])) + "[m]")
    print(method_name2 + " stddev: " + str(result2[:, 0].std()))
    print(stats.ttest_ind(result0[:, 0], result2[:, 0], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 0],
                        result2[result2[:, 2] == 0, 0],
                        equal_var=False))

    bp1 = ax1.boxplot((result0[:, 1], result1[:, 1], result2[:, 1]),
                      whis=(0, 100),
                      showmeans=True,
                      meanline=True)
    ax1.set_xticklabels(['Our method', method_name1, method_name2])
    ax1.set_title('(b) Travel time[s]', y=-0.35, fontsize=10)
    ax1.set_ylim(0, 50)
    ax1.set_yticks(np.arange(0, ax1.get_ylim()[1] + 10, 10))
    ax1.grid()
    print("travel time")
    print("P. M. mean: " + str(result0[:, 1].mean()) + "[s]")
    print("P. M. mean: " + str(result0[result0[:, 2] == 0, 1].mean()) + "[s]")
    print("P. M. median: " + str(np.median(result0[:, 1])) + "[s]")
    print("P. M. stddev: " + str(result0[:, 1].std()))
    print(method_name1 + " mean: " + str(result1[:, 1].mean()) + "[s]")
    print(method_name1 + " mean: " +
          str(result1[result1[:, 2] == 0, 1].mean()) + "[s]")
    print(method_name1 + " median: " + str(np.median(result1[:, 1])) + "[s]")
    print(method_name1 + " stddev: " + str(result1[:, 1].std()))
    print(stats.ttest_ind(result0[:, 1], result1[:, 1], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 1],
                        result1[result1[:, 2] == 0, 1],
                        equal_var=False))
    print(method_name2 + " mean: " + str(result2[:, 1].mean()) + "[s]")
    print(method_name2 + " mean: " +
          str(result2[result2[:, 2] == 0, 1].mean()) + "[s]")
    print(method_name2 + " median: " + str(np.median(result2[:, 1])) + "[s]")
    print(method_name2 + " stddev: " + str(result2[:, 1].std()))
    print(stats.ttest_ind(result0[:, 1], result2[:, 1], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 1],
                        result2[result2[:, 2] == 0, 1],
                        equal_var=False))

    bp2 = ax2.boxplot((result0[:, 2], result1[:, 2], result2[:, 2]),
                      whis=(0, 100),
                      showmeans=True,
                      meanline=True)
    ax2.set_xticklabels(['Our method', method_name1, method_name2])
    ax2.set_title('(c) Number of collisions', y=-0.35, fontsize=10)
    ax2.set_ylim(0, 5)
    ax2.set_yticks(np.arange(0, ax2.get_ylim()[1] + 1, 1))
    ax2.grid()
    print("collision count")
    print("P. M. mean: " + str(result0[:, 2].mean()))
    print("P. M. median: " + str(np.median(result0[:, 2])))
    print("P. M. stddev: " + str(result0[:, 2].std()))
    print("P. M. c. rate: " + str(
        np.where(result0[:, 2] > 0)[0].shape[0] /
        float(result0[:, 2].shape[0])))
    print(method_name1 + " mean: " + str(result1[:, 2].mean()))
    print(method_name1 + " median: " + str(np.median(result1[:, 2])))
    print(method_name1 + " stddev: " + str(result1[:, 2].std()))
    print(method_name1 + " c. rate: " + str(
        np.where(result1[:, 2] > 0)[0].shape[0] /
        float(result1[:, 2].shape[0])))
    print(stats.ttest_ind(result0[:, 2], result1[:, 2], equal_var=False))
    print(method_name2 + " mean: " + str(result2[:, 2].mean()))
    print(method_name2 + " median: " + str(np.median(result2[:, 2])))
    print(method_name2 + " stddev: " + str(result2[:, 2].std()))
    print(method_name2 + " c. rate: " + str(
        np.where(result2[:, 2] > 0)[0].shape[0] /
        float(result2[:, 2].shape[0])))
    print(stats.ttest_ind(result0[:, 2], result2[:, 2], equal_var=False))

    bp3 = ax3.boxplot((result0[:, 3], result1[:, 3], result2[:, 3]),
                      whis=(0, 100),
                      showmeans=True,
                      meanline=True)
    ax3.set_xticklabels(['Our method', method_name1, method_name2])
    ax3.set_title('(d) Minimum distance[m]', y=-0.35, fontsize=10)
    ax3.set_ylim(0, 1.5)
    ax3.set_yticks(np.arange(0, ax3.get_ylim()[1] + 0.5, 0.5))
    ax3.axhline(0.6, c="r")
    ax3.text(3.6, 0.6, "$R_{col}$", size=10, color="red")
    ax3.grid()
    print("min distance")
    print("P. M. mean: " + str(result0[:, 3].mean()) + "[m]")
    print("P. M. mean: " + str(result0[result0[:, 2] == 0, 3].mean()) + "[m]")
    print("P. M. median: " + str(np.median(result0[:, 3])) + "[m]")
    print("P. M. stddev: " + str(result0[:, 3].std()))
    print(method_name1 + " mean: " + str(result1[:, 3].mean()) + "[m]")
    print(method_name1 + " mean: " +
          str(result1[result1[:, 2] == 0, 3].mean()) + "[m]")
    print(method_name1 + " median: " + str(np.median(result1[:, 3])) + "[m]")
    print(method_name1 + " stddev: " + str(result1[:, 3].std()))
    print(stats.ttest_ind(result0[:, 3], result1[:, 3], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 3],
                        result1[result1[:, 2] == 0, 3],
                        equal_var=False))
    print(method_name2 + " mean: " + str(result2[:, 3].mean()) + "[m]")
    print(method_name2 + " mean: " +
          str(result2[result2[:, 2] == 0, 3].mean()) + "[m]")
    print(method_name2 + " median: " + str(np.median(result2[:, 3])) + "[m]")
    print(method_name2 + " stddev: " + str(result2[:, 3].std()))
    print(stats.ttest_ind(result0[:, 3], result2[:, 3], equal_var=False))
    print(
        stats.ttest_ind(result0[result0[:, 2] == 0, 3],
                        result2[result2[:, 2] == 0, 3],
                        equal_var=False))

    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.show()
Beispiel #59
0
data1, data2, data3 = sdpt, sdkt, sdbt
stat, p = f_oneway(data1, data2, data3)
f.write("\nanova small dense: ")
f.write("stat: " + str(stat) + " ")
f.write("p: " + str(p))

data1, data2, data3 = mspt, mskt, msbt
stat, p = f_oneway(data1, data2, data3)
f.write("\nanova medium sparse: ")
f.write("stat: " + str(stat) + " ")
f.write("p: " + str(p))

# NOTE: the label below says "medium dense", but the variables fed in (lspt,
# lskt, lsbt) are the large sparse results, duplicating the block further down.
data1, data2, data3 = lspt, lskt, lsbt
stat, p = f_oneway(data1, data2, data3)
f.write("\nanova medium dense: ")
f.write("stat: " + str(stat) + " ")
f.write("p: " + str(p))

data1, data2 = lspt, lskt
stat, p = ttest_ind(data1, data2)
f.write("\n ttest p and k med dense: ")
f.write("stat: " + str(stat) + " ")
f.write("p: " + str(p))

data1, data2, data3 = lspt, lskt, lsbt
stat, p = f_oneway(data1, data2, data3)
f.write("\nanova large sparse: ")
f.write("stat: " + str(stat) + " ")
f.write("p: " + str(p))

f.close()
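# --- Editor's note: the ad-hoc follow-up t-test above invites multiple-
# comparison problems; a hedged sketch of one common remedy (Bonferroni-
# adjusted pairwise t-tests after a significant ANOVA), on synthetic data:
from itertools import combinations

import numpy as np
from scipy.stats import f_oneway, ttest_ind

rng = np.random.default_rng(3)
groups = {'p': rng.normal(0.0, 1, 20),
          'k': rng.normal(0.5, 1, 20),
          'b': rng.normal(1.0, 1, 20)}

stat, p = f_oneway(*groups.values())
if p < 0.05:
    pairs = list(combinations(groups, 2))
    alpha = 0.05 / len(pairs)  # Bonferroni-adjusted threshold
    for g1, g2 in pairs:
        t, pp = ttest_ind(groups[g1], groups[g2])
        print(g1, g2, round(pp, 4), 'significant' if pp < alpha else 'n.s.')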
Beispiel #60
0
def t_test2(x1, x2, eq_Var=True):
    t, p = stats2.ttest_ind(x1, x2, equal_var=eq_Var)
    # A small p rejects the null hypothesis, i.e. the two samples differ,
    # so the attribute can be used to distinguish them.
    return p, p > alpha
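# Editor's note: a hypothetical usage sketch for t_test2 above. It assumes the
# original module defines `alpha` and imports scipy.stats as `stats2`.
# import scipy.stats as stats2
# alpha = 0.05
# p, indistinguishable = t_test2([1.0, 1.1, 0.9], [1.5, 1.6, 1.4])
# print(p, indistinguishable)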