Example #1
def proc_Dsuff(  xlist, ylist, pltitle = 'DXY', Csuff = 0.0, fname = 'DXY.png' ):
	# Mix of error calcs. Suffix-2 vars use the old error-value calc.
	# New method uses MEM and Sigma, input on the command line.
	ssd = 0.0
	msd = 0.0
	df1 = 0.0
	F   = 0.0
	F2  = 0.0
	PVAL    = 0.0
	PVAL2   = 0.0
	nullsd  = 0.0
	nullsd2 = 0.0
	ssd = calc_ssd(  xlist, ylist, pltitle, fname )
	df1 = calc_df1(  xlist, ylist )
	
	df2 = len( ylist )

	nullsd  = calc_nullsd1( measurement_error_mean, sigma_mem )
	nullsd2 = calc_nullsd3( xlist, ylist, error_value )
	emsd  = nullsd 
	emsd2 = nullsd2 
	if ( df1 > 0 ):
		msd = ssd/df1
		F = msd/emsd 
		PVAL = f.sf ( F, df1, df2, loc=0, scale=1 ) 
	#else:		print 'ERR - DF1 Div by Zero. F error.'
	## Only do calcs if DF1 > 0. Error otherwise.
	if ( df1 > 0 ):
		msd = ssd/df1
		F2 = msd/emsd2 
		PVAL2 = f.sf ( F2, df1, df2, loc=0, scale=1 ) 

	ProcLabel = os.path.splitext( fname )[0] 
	ProcLabel = ProcLabel.replace( 'D', '' ) # Remove D from Label.
	print '{:>10s}'.format( ProcLabel ), 
	print '{:>2d}'.format( Yval ),
	print '{:>4.3f}'.format( Csuff ),
	print '{:>8.3f}'.format( ssd ),
	print '{:>8.2f}'.format( msd ),
	print '{:>11.2f}'.format( emsd ),
	print '{:>16.3f}'.format( F ),
	print '{:>5.2f}'.format( PVAL ),
	print '{:>11.3f}'.format( F2 ),
	print '{:>5.2f}'.format( PVAL2 ),
	print '{:>3d}'.format( df2 ),
	print
	opcsv.writerow( [ ProcLabel, Yval, '{:>4.3f}'.format( Csuff ), 
	'{:>8.3f}'.format( ssd ), 
	'{:>8.2f}'.format( msd ),
	'{:>8.2f}'.format( emsd ),
	'{:>11.3f}'.format( F ), 
	'{:>5.2f}'.format( PVAL ), 
	'{:>11.3f}'.format( F2 ), 
	'{:>5.2f}'.format( PVAL2 ), 
	'{:>3d}'.format( df2 ) ] )
Example #3
def f_one(*args):
    a = list(args)
    for ai in a:
        if not ai:
            return [None, None]
    r = len(a[0])
    m = len(a)
    x_bar = mean(a)
    # t = mean(sum(a, []))
    x_ib = []
    for x in a:
        x_ib.append(mean(x))
    assert len(x_ib) == m

    s_e = 0.0
    s_t = 0.0
    for i in range(m):
        for j in range(r):
            s_e += (a[i][j] - x_ib[i]) ** 2
            s_t += (a[i][j] - x_bar) ** 2
    s_a = s_t - s_e
    va = s_a / (m - 1)
    ve = s_e / (m * r - m)
    fv = va / ve
    p = f.sf(fv, dfn=m - 1, dfd=m * r - m)
    return [fv, p]
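
For reference, a sketch of the decomposition f_one implements, written generically (my notation, not from the original source) for m groups of r observations each:

$$S_T=\sum_{i=1}^{m}\sum_{j=1}^{r}(x_{ij}-\bar{x})^2,\qquad S_e=\sum_{i=1}^{m}\sum_{j=1}^{r}(x_{ij}-\bar{x}_i)^2,\qquad S_A=S_T-S_e$$

$$F=\frac{S_A/(m-1)}{S_e/(mr-m)}\sim F_{m-1,\,mr-m},\qquad p=\Pr\{F_{m-1,\,mr-m}>F\}$$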
Example #4
def test_scipy_f():
    rng = np.random.RandomState(20120407)
    x = rng.normal(size=(100)) * 4
    for m in np.arange(1, 15):
        for n in np.arange(1, 15):
            assert_array_almost_equal(f_sf(x, m, n), f.sf(x, m, n))
            assert_array_almost_equal(f_cdf(x, m, n), f.cdf(x, m, n))
Example #5
 def f_oneway(self, *args):
     m = len(args)
     if m==0:
         return [None,None]
     r = len(args[0])
     if r==0:
         return [None,None]
     
     n = r*m
     mean = self.mean(args)
     
     sa = 0
     for i in args:
         sa = sa + (self.meani(i)-mean)**2
     sa = sa * r
     va = sa/(m-1)
     
     se = 0
     for i in args:
         for j in i:
             se = se + (j-self.meani(i))**2
     ve = se/(n-m)
     f = va/ve
     p = F.sf(f,m-1,n-m)
     return [round(f,6),round(p,6)]
Example #7
def var_homo_test_base(n, var1, var2):
    if var1 < var2:
        var1, var2 = var2, var1
    _F = var1 / var2
    v1, v2 = n - 1, n - 1
    _p = f.sf(_F, v1, v2)
    return _F, _p
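
A minimal usage sketch for the variance-ratio test above, assuming two samples of a common size n and sample variances computed with ddof=1 (the data and seed are illustrative assumptions):

import numpy as np
from scipy.stats import f

rng = np.random.default_rng(0)
a = rng.normal(scale=1.0, size=30)
b = rng.normal(scale=1.5, size=30)
F_stat, p = var_homo_test_base(30, np.var(a, ddof=1), np.var(b, ddof=1))
print(F_stat, p)  # one-sided ratio test; a small p suggests unequal variances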
Example #8
    def plotMap(self, p):
        param = h_map[:,:,p]
        h_min, h_max = vminVmax(param)

        if p == 4:
            c_map = 'jet_r'
        else:
            c_map = 'jet'
            
        if cp.mask_bool:
            # generate p-value heatmap + masked Lorentzian component heatmaps
            dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
            p_val = ff.sf(h_map[:,:,6], dof1, dof2)
            p_val[np.isnan(p_val)] = 1
            param_mask = np.copy(param) 
            param_mask[p_val > cp.mask_val] = np.NaN
            
            # determine percentage of region masked 
            count = np.count_nonzero(np.isnan(param_mask))   
            total_pix = p_val.shape[0]*p_val.shape[1]
            mask_percent = ((np.float(count))/total_pix)*100
            
            cp.ax1_title.set_text(r'%s | $f_{masked}$ = %0.1f%s' % (titles[p], mask_percent, '%'))
            cp.im.set_data(param_mask)
        
        elif not cp.mask_bool:
            cp.ax1_title.set_text(r'%s' % titles[p])
            cp.im.set_data(param)
            
        cp.im.set_clim(h_min, h_max)
        cp.im.set_cmap(c_map)
        
        cp.colorBar()
Example #9
 def f_oneway(self, *args):
    if len(args)==0:
         return[None,None]
    elif len(args)==1:
        return[None,None]
    else:
        meanr = []
        mean = 0.0
        sa = 0.0
        se = 0.0
        for i in args:
            meanr.append((float)(sum(i))/len(i))
            mean+=sum(i)
        mean/=(len(args)*len(args[0]))
        for i in meanr:
            sa+=len(args[0])*(i-mean)**2
        for i in range(len(meanr)):
            for j in range(len(args[0])):
                se+=(args[i][j]-meanr[i])**2
        fa = len(args)-1
        fe = len(args)*len(args[0])-len(args)
        va = sa/fa
        ve = se/fe
        F = va/ve
        P = f.sf(F,fa,fe)    
        return[round(F,6),round(P,6)]
Example #10
def f_classif(X, y):
    # TODO Memory use is still not where it should be (especially the comprehensions)
    groups, mask, counts = np.unique(y,
                                     return_inverse=True,
                                     return_counts=True)
    # SStotal
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SStotal...")
    gmeans_ = np.array(
        [X[mask == g, :].mean(axis=0) for g in range(len(groups))])
    means_ = gmeans_.mean(axis=0)
    sst_ = ((X - means_)**2).sum(axis=0)
    # SSwithin
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SSwithin...")
    grouped_ss = [((X[mask == g, :] - gmeans_[g])**2).sum(axis=0)
                  for g in range(len(groups))]
    #grouped_ss = [np.square(X[mask==g,:] - gmeans_[g]).sum(axis=0) for g in range(len(groups))]
    ssw_ = np.array(grouped_ss).sum(axis=0)
    # SSbetween
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SSbetween...")
    grouped_ss = [(counts[g] * ((gmeans_[g] - means_)**2))
                  for g in range(len(groups))]
    #grouped_ss = [(counts[g] * np.square(gmeans_[g] - means_)) for g in range(len(groups))]
    ssb_ = np.array(grouped_ss).sum(axis=0)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Completing...")
    # DF (degree of freedom)
    k, N = len(groups), X.shape[0]
    DFbetween, DFwithin, DFtotal = k - 1, N - k, N - 1
    # F-score
    MSbetween = ssb_ / DFbetween
    MSwithin = ssw_ / DFwithin
    F = MSbetween / MSwithin
    # p-value
    pval = f.sf(F, DFbetween, DFwithin)
    return F, pval
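
A minimal usage sketch for the per-feature one-way ANOVA above (the shapes, seed, and group layout are illustrative assumptions):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(90, 4))   # 90 samples, 4 features
X[:30, 0] += 1.0               # give feature 0 a real between-group effect
y = np.repeat([0, 1, 2], 30)   # three balanced groups
F, pval = f_classif(X, y)      # one F statistic and one p-value per column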
Example #11
 def var_var(self, alpha):
     f0 = self.S1**2/self.S2**2
     n1, n2 = self.n1, self.n2
     # hypothesis testing2
     H1a = f.ppf(1 - alpha/2., n1 - 1, n2 - 1) < f0 or f.ppf(alpha/2., n1 - 1, n2 - 1) > f0
     H1b = f.ppf(alpha/2., n1 - 1, n2 - 1) < f0
     H1c = f.ppf(1 - alpha/2., n1 - 1, n2 - 1) > f0
     # p-value
     p1a = np.max(np.array([f.sf(f0, n1 - 1, n2 - 1), 1 - f.sf(f0, n1 - 1, n2 - 1)]))
     p1b = f.sf(f0, n1 - 1, n2 - 1)
     p1c = 1 - f.sf(f0, n1 - 1, n2 - 1)
     
     # confidence intervals: the minimum level of significance
     # alpha for which the null hypothesis is rejected 
     c1 = self.S1**2/self.S1**2 * f.ppf(alpha/2., n2 - 1, n1 - 1)     
     c2 = self.S1**2/self.S1**2 * f.ppf(1 - alpha/2., n2 - 1, n1 - 1)     
     return H1a, H1b, H1c, p1a, p1b, p1c, (c1,c2)
Example #13
 def fit(self):
     # use independent variable + subjects
     self.reg_t = LinearRegression(n_jobs=self.njobs).fit(self.X_t, self.y)
     self.preds_t = self.reg_t.predict(self.X_t)
     self.RSS_t = ((self.preds_t - self.y)**2).sum()
     self.signCorr = np.sign(self.reg_t.coef_[-1])
     #only independent variable
     self.reg_m = LinearRegression(n_jobs=self.njobs).fit(self.X_m, self.y)
     self.preds_m = self.reg_m.predict(self.X_m)
     self.RSS_m = ((self.preds_m - self.y)**2).sum()
     self.SS_m = ((self.preds_m - self.preds_t)**2).sum()
     #only subjects
     self.reg_s = LinearRegression(n_jobs=self.njobs).fit(self.X_s, self.y)
     self.preds_s = self.reg_s.predict(self.X_s)
     self.RSS_s = ((self.preds_s - self.y)**2).sum()
     self.SS_s = ((self.preds_s - self.preds_t)**2).sum()
     ## MSE
     self.MSE_s = self.SS_s / self.df_s
     self.MSE_m = self.SS_m / self.df_m
     self.MSE_t = self.RSS_t / self.df_t
     ## F ratio
     self.F_ratio_m = self.MSE_m / self.MSE_t
     self.F_ratio_s = self.MSE_s / self.MSE_t
     ## p value
     self.p_value_m = fdist.sf(self.F_ratio_m, self.df_m, self.df_t)
     self.p_value_s = fdist.sf(self.F_ratio_s, self.df_s, self.df_t)
     ##final DF
     self.df_total = self.df_m + self.df_s + self.df_t
     self.ss_total = ((self.y - self.y.mean())**2).sum()
     self.mse_total = self.ss_total / self.df_total
     self.result = pd.DataFrame(
         {
             'DoF': [self.df_m, self.df_s, self.df_t, self.df_total],
             'SumOfSq': [self.SS_m, self.SS_s, self.RSS_t, self.ss_total],
             'MSE': [self.MSE_m, self.MSE_s, self.MSE_t, self.mse_total],
             'F_value': [self.F_ratio_m, self.F_ratio_s, None, None],
             'p_value': [
                 '<0.0001' if self.p_value_m < 0.0001 else self.p_value_m,
                 self.p_value_s, None, None
             ]
         },
         index=[self.subject, self.independent, 'Residual', 'Total'])
     self.corr = self.signCorr * np.sqrt(self.SS_s /
                                         (self.SS_s + self.RSS_t))
     return self.corr, self.p_value_s
Example #14
def ptl_anovaR(inframe):
    '''
    here you need to change the numbers in the t tuples
    repeated measures one way ANOVA data
    inframe is  a dataframe
    ss is sum of square
    wg is within group
    bg is between group
    ms is means square
    '''
    rows, cols = inframe.shape
    k = cols
    n_sbj = rows

    allser = dftoser(inframe)
    n_t = allser.size

    # Within group
    ss_wg = 0
    for i in range(k):
        ss_wg += ssd(inframe.iloc[:, i])
    df_wg = n_t - k
    t1 = (ss_wg, df_wg)

    # Between groups
    ss_t = ssd(allser)
    ss_bg = ss_t - ss_wg
    df_bg = k - 1  # between-groups degrees of freedom
    ms_bg = ss_bg / df_bg
    t0 = (ss_bg, df_bg, ms_bg)

    # Subjects
    subjects_means = pd.Series([0 for i in range(rows)])
    for i in range(rows):
        sm = inframe.iloc[i, :].mean(skipna=True)
        subjects_means.iloc[i] = sm

    ss_sb = k * ssd(subjects_means)
    df_sb = n_sbj - 1
    t3 = (ss_sb, df_sb)

    ss_er = ss_wg - ss_sb
    df_er = df_wg - df_sb
    ms_er = ss_er / df_er
    t2 = (ss_er, df_er, ms_er)

    df_t = n_t - 1
    t4 = (ss_t, df_t)

    if ms_er == 0:
        F = float("Inf")
    else:
        F = ms_bg / ms_er

    p = f.sf(F, df_bg, df_er, loc=0, scale=1)

    return t0, t1, t2, t3, t4, F, p
Example #15
    def f_oneway(self, *args):

        m = len(args)
        if (m == 0):
            return [None, None]

        for arg in args:
            if (len(arg) == 0):
                return [None, None]

        #cal avg_array,all_avg
        avg_array = []
        for arg in args:
            su = 0.0
            for x in arg:
                su += x
            avg = su / len(arg)
            avg_array.append(avg)

        print avg_array

        all_avg = 0.0
        for a in avg_array:
            all_avg += a
        all_avg = all_avg / len(avg_array)

        print all_avg

        #cal Sa,Se,St
        Sa = 0.0
        for a in avg_array:
            t = (a - all_avg)**2
            Sa += t
        Sa = Sa * len(args[0])

        Se = 0.0
        i = 0
        for arg in args:
            temp = avg_array[i]
            t = 0.0
            for x in arg:
                t += (x - temp)**2
            Se += t
            i = i + 1

        #cal fa,fe
        fa = m - 1
        fe = m * len(args[0]) - m

        #cal Va,Ve
        Va = Sa / fa
        Ve = Se / fe

        #cal Fa
        Fa = round(Va / Ve, 6)
        p = round(f.sf(x=Fa, dfn=fa, dfd=fe), 6)
        return [Fa, p]
Example #17
def test_f_test():
    X, y = build()
    res = get_f_mat(X, y, grouping=[0, 1, 2])
    df1 = [[x[1] for x in y] for y in res]
    df2 = [[x[2] for x in y] for y in res]
    assert np.allclose(df1, 1)
    assert np.allclose(df2, 7997)
    p_mat = [[f.sf(f_score, df1, df2) for (f_score, df1, df2) in row]
             for row in res]
    assert np.allclose(p_mat, 0)
Example #18
 def GRS_test(self,alpha,Sigma):
     """
     This test should be used for time series regressions
     """
     if rank(Sigma,tol=1e-9) < self.N:
         print ("Warning Sigma has deficient rank of ", rank(Sigma))
         val = (self.T - self.N - self.k)*dot(alpha.T,dot(pinv(Sigma),alpha))/(self.N*(1 + dot(self.fm.T,solve(self.vcvfac,self.fm))))
     else:
         val = (self.T - self.N - self.k)*dot(alpha.T,solve(Sigma,alpha))/(self.N*(1 + dot(self.fm.T,solve(self.vcvfac,self.fm))))
     return squeeze(f.sf(val,self.N,self.T-self.N-self.k))
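
For reference, a sketch of the statistic this method assembles, in the usual Gibbons-Ross-Shanken form (notation mine: alpha holds the pricing errors, Sigma the residual covariance, and fm/vcvfac the factor mean vector and covariance):

$$\mathrm{GRS}=\frac{T-N-k}{N}\cdot\frac{\hat{\alpha}'\Sigma^{-1}\hat{\alpha}}{1+\hat{\mu}_f'\hat{\Omega}^{-1}\hat{\mu}_f}\sim F_{N,\,T-N-k}$$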
Example #19
    def __init__(self,
                 t=None,
                 F=None,
                 sd=None,
                 effect=None,
                 df_denom=None,
                 df_num=None,
                 alpha=0.05,
                 **kwds):

        self.effect = effect  # Let it be None for F
        if F is not None:
            self.distribution = 'F'
            self.fvalue = F
            self.statistic = self.fvalue
            self.df_denom = df_denom
            self.df_num = df_num
            self.dist = fdist
            self.dist_args = (df_num, df_denom)
            self.pvalue = fdist.sf(F, df_num, df_denom)
        elif t is not None:
            self.distribution = 't'
            self.tvalue = t
            self.statistic = t  # generic alias
            self.sd = sd
            self.df_denom = df_denom
            self.dist = student_t
            self.dist_args = (df_denom, )
            self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2
        elif 'statistic' in kwds:
            # TODO: currently targeted to normal distribution, and chi2
            self.distribution = kwds['distribution']
            self.statistic = kwds['statistic']
            self.tvalue = value = kwds['statistic']  # keep alias
            # TODO: for results instance we decided to use tvalues also for normal
            self.sd = sd
            self.dist = getattr(stats, self.distribution)
            self.dist_args = kwds.get('dist_args', ())
            if self.distribution == 'chi2':
                self.pvalue = self.dist.sf(self.statistic, df_denom)
                self.df_denom = df_denom
            else:
                "normal"
                self.pvalue = np.full_like(value, np.nan)
                not_nan = ~np.isnan(value)
                self.pvalue[not_nan] = self.dist.sf(np.abs(value[not_nan])) * 2

        # cleanup
        # should we return python scalar?
        self.pvalue = np.squeeze(self.pvalue)

        if self.effect is not None:
            self.c_names = ['c%d' % ii for ii in range(len(self.effect))]
        else:
            self.c_names = None
Example #20
    def evaluate(self, x, y_true):
        """
        Evaluates the performance of the trained model on a global and variable
        level. For global, RSE, R^2 and F-statistic are standard. For variables
        the SE and t-statistic is used.
        :param x: Matrix of predictors
        :param y_true: Vector of true y values
        :return: 
        """
        x = core.enhance_matrix(x)
        y_pred = self.predict(x)
        global_metrics = [['RSE', reg_eval.residual_standard_error],
                          ['R^2', reg_met.r_squared],
                          ['F-statistic', reg_eval.f_statistic], ['p-value']]
        var_metrics = [['SE', reg_eval.standard_error_coefs],
                       ['t-statistic', reg_eval.t_statistic], ['p-value']]

        glob_outcomes = {'Metric': [], 'Value': []}
        for i in global_metrics:
            if len(i) > 1:
                glob_outcomes['Metric'].append(i[0])
                glob_outcomes['Value'].append(i[1](x=x,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   num_predictors=x.n_cols))
            elif i[0] == 'p-value':
                glob_outcomes['Metric'].append(i[0])
                glob_outcomes['Value'].append(
                    f.sf(glob_outcomes['Value'][2],
                         dfn=len(y_pred),
                         dfd=x.n_cols - 1))
            else:
                raise Exception('Single value metric not implemented')

        var_outcomes = {
            'Column': list(range(x.n_cols)),
            'Coefficient': self.coefficients.data
        }
        for i in var_metrics:
            if len(i) > 1:
                var_outcomes[i[0]] = i[1](x=x,
                                          y_true=y_true,
                                          y_pred=y_pred,
                                          coefs=var_outcomes['Coefficient'])
            elif i[0] == 'p-value':
                var_outcomes[i[0]] = [
                    2 * t.sf(abs(float(score)),
                             len(y_pred) - x.n_cols)
                    for score in var_outcomes['t-statistic']
                ]

        print(tabulate(glob_outcomes, headers='keys'))
        print(tabulate(var_outcomes, headers='keys'))

        return glob_outcomes, var_outcomes
Example #21
    def test(self, *args):
        r"""
        Calculates the MANOVA test statistic and p-value.

        Parameters
        ----------
        *args : ndarray
            Variable length input data matrices. All inputs must have the same
            number of dimensions. That is, the shapes must be `(n, p)` and
            `(m, p)`, ... where `n`, `m`, ... are the number of samples and `p` is
            the number of dimensions.

        Returns
        -------
        stat : float
            The computed MANOVA statistic.
        pvalue : float
            The computed MANOVA p-value.

        Examples
        --------
        >>> import numpy as np
        >>> from hyppo.ksample import MANOVA
        >>> x = np.arange(7)
        >>> y = x
        >>> stat, pvalue = MANOVA().test(x, y)
        >>> '%.3f, %.1f' % (stat, pvalue)
        '0.000, 1.0'
        """
        inputs = list(args)
        check_input = _CheckInputs(inputs=inputs, )
        inputs = check_input()

        N = np.sum([i.shape[0] for i in inputs])
        p = inputs[0].shape[1]
        nu_w = N - len(inputs)

        if nu_w < p:
            raise ValueError("Test cannot be run, degree of freedoms is off")

        stat = self.statistic(*inputs)
        nu_b = len(inputs) - 1
        s = np.min([p, nu_b])
        m = (np.abs(p - nu_b) - 1) / 2
        n = (nu_w - p - 1) / 2
        num = 2 * n + s + 1
        denom = 2 * m + s + 1
        pvalue = f.sf(num / denom * stat / (s - stat), s * denom, s * num)
        self.stat = stat
        self.pvalue = pvalue
        self.null_dist = None

        return stat, pvalue
Example #22
def hotel2(X1, X2):
  """ Computes Hotelling t-squared statistic under two assumptions or variance.

  :param X1 pandas DataFrame with samples from first group
  :param X2 pandas DataFrame with samples from second group
  :return None
  """
  # TODO: Verify Hotelling results
  n1, k = X1.shape
  n2, k2 = X2.shape
  assert(k == k2)

  ybar1 = X1.mean().as_matrix()
  s1 = np.cov(X1, rowvar=False)
  ybar2 = X2.mean(axis=0).as_matrix()
  s2 = np.cov(X2, rowvar=False)

  alpha = 0.05
  diffs = (ybar1 - ybar2).reshape(1, k)

  # TODO: Incorporate a test for equal variances

  # If variances assumed equal, then pool
  if True:
    spool = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)
    t2 = diffs\
      .dot(np.linalg.inv(spool * (1.0 / n1 + 1.0 / n2)))\
      .dot(ybar1 - ybar2)\
      .item(0)
    eff = (n1 + n2 - k - 1) * t2 / (k * (n1 + n2 - 2))
    df1 = k
    df2 = n1 + n2 - k - 1
    p_value = f.sf(eff, df1, df2)
    print('If variances are assumed equal between classes')
    if p_value < alpha:
      print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
      print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)

  # If variances not assumed equal, then use modified Hotelling
  if True:
    t2 = diffs\
      .dot(np.linalg.inv(s1 / n1 + s2 / n2))\
      .dot(ybar1 - ybar2)\
      .item(0)
    p_value = chi2.sf(t2, k)
    print('If variances are not assumed equal between classes')
    if p_value < alpha:
      print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
      print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)
Example #23
def p_value(y1, x1, y2, x2, **kwargs):
    F, coeff_total, coeff_1, coeff_2 = f_value(y1, x1, y2, x2, **kwargs)
    if not F:
        return 1, coeff_total, coeff_1, coeff_2
    df1 = 2
    df2 = len(x1) + len(x2) - 4

    # The survival function sf(x) computes the upper tail directly and is
    # more precise than evaluating 1 - cdf(x); this matters when p-values
    # are very close to zero. -f.logsf would be another alternative to get
    # -log(pval) directly instead.
    p_val = f.sf(F[0], df1, df2)
    return p_val, coeff_total, coeff_1, coeff_2
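
A quick numeric sketch of the precision point made in the comment above (the statistic and degrees of freedom are made-up illustrative values):

from scipy.stats import f

big_F = 150.0
print(f.sf(big_F, 2, 100))       # ~7.9e-31: the tail probability survives
print(1 - f.cdf(big_F, 2, 100))  # 0.0: the subtraction cancels to nothing
print(-f.logsf(big_F, 2, 100))   # ~69.3: -log(pval) computed directly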
Example #24
 def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
              df_num=None):
     if F is not None:
         self.fvalue = F
         self.df_denom = df_denom
         self.df_num = df_num
         self.pvalue = fdist.sf(F, df_num, df_denom)
     else:
         self.tvalue = t
         self.sd = sd
         self.effect = effect
         self.df_denom = df_denom
         self.pvalue = student_t.sf(np.abs(t), df_denom)
Example #26
def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png'):
    ssd = 0.0
    msd = 0.0
    df1 = 0.0
    F = 0.0
    PVAL = 0.0
    nullsd = 0.0
    ssd = calc_ssd(xlist, ylist, pltitle, fname)
    df1 = calc_df1(xlist, ylist)

    df2 = len(ylist)

    nullsd = calc_nullsd2(xlist, ylist, error_value)
    emsd = nullsd
    if (df1 > 0):
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)
    #else:		print 'ERR - DF1 Div by Zero. F error.'
    ## Only do calcs if DF1 > 0. Error otherwise.
    '''
	print 'Fname:', os.path.splitext( fname )[0]
	print 'SSD:', ssd
	print 'DF1:', df1
	print 'DF2:', df2
	print 'NULLSD:', nullsd
	print 'MSD:', msd
	print 'EMSD:', emsd
	print 'F:', F 
	print 'PVAL - SF:', PVAL
	print 'Csuff:', Csuff
	'''

    ProcLabel = os.path.splitext(fname)[0]
    ProcLabel = ProcLabel.replace('D', '')  # Remove D from Label.
    print '{:>10s}'.format(ProcLabel),
    print '{:>2d}'.format(Yval),
    print '{:>4.3f}'.format(Csuff),
    print '{:>8.3f}'.format(ssd),
    print '{:>11.3f}'.format(F),
    print '{:>3.2f}'.format(PVAL),
    print '{:>3.2f}'.format(df1)
    print '{:>3d}'.format(df2)
    #print
    opcsv.writerow([
        ProcLabel, Yval, '{:>4.3f}'.format(Csuff), '{:>8.3f}'.format(ssd),
        '{:>11.3f}'.format(F), '{:>3.2f}'.format(PVAL), '{:>3.2f}'.format(df1),
        '{:>3d}'.format(df2)
    ])
Example #27
def ANOVA(n_T, k, SSTr, SSE):
    df1 = k - 1
    df2 = n_T - k

    MSTr = SSTr / (k - 1)
    MSE = SSE / (n_T - k)

    F_statistic = MSTr / MSE
    p_value = f.sf(F_statistic, df1, df2)

    print("ANOVA with n_T {}, k {}, SSTr {}, SSE {}".format(n_T, k, SSTr, SSE))
    print("MSTr : {:.4f}, MSE : {:.4f}".format(MSTr, MSE))
    print(
        "F-statistic : {:.4f}, p-value = P(F_(k-1,n_T-k) >= {:.4f}) = {:.6f}".
        format(F_statistic, F_statistic, p_value))
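
A worked call for the summary-statistics helper above (the numbers are illustrative): three groups (k=3) of ten observations each (n_T=30) with SSTr=12.5 and SSE=81.0 give MSTr=6.25, MSE=3.0, and F ≈ 2.083 on (2, 27) degrees of freedom.

ANOVA(n_T=30, k=3, SSTr=12.5, SSE=81.0)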
Example #28
def robust_cv_test(res_a, res_b):
    # Combined 5x2cv F Test for Comparing Supervised Classification Learning Algorithms
    # https://www.cmpe.boun.edu.tr/~ethem/files/papers/NC110804.PDF
    # res_a and res_b are the results of two classifiers with shape num_folds x 2
    assert res_a.shape == res_b.shape, 'The two arrays should have equal dimensions'
    assert res_a.shape[1] == 2, 'Dimension 1 should be 2 for both arrays'
    num_folds = res_a.shape[0]

    diff = res_a - res_b
    diff_fold = diff.mean(axis=1, keepdims=True)
    var = ((diff - diff_fold)**2).sum(axis=1)
    f_val = (diff**2).sum() / (2 * var.sum())
    p_val = f.sf(f_val, 2 * num_folds, num_folds)

    return p_val
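
A minimal usage sketch, assuming res_a and res_b hold accuracy scores from five iterations of 2-fold cross-validation for two classifiers (the numbers are illustrative):

import numpy as np

res_a = np.array([[0.91, 0.89], [0.90, 0.92], [0.88, 0.90], [0.93, 0.91], [0.89, 0.90]])
res_b = np.array([[0.86, 0.88], [0.87, 0.85], [0.88, 0.86], [0.85, 0.87], [0.86, 0.88]])
p = robust_cv_test(res_a, res_b)  # a small p rejects "both classifiers perform equally"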
Example #29
    def hist(self):
        if cp.spec_hist != 'hist':
            cp.ax2.set_yscale('linear')
            cp.ax2.set_xscale('linear')
            cp.curveSpec.remove()
            cp.curveM2.remove()
            cp.curveM1.remove()
            cp.curveLorentz.remove()
            cp.spec_hist = 'hist'
            cp.ax2.set_ylabel('Count')

        param = h_map[:, :, cp.marker]

        if cp.marker == 4:
            param = (1. / np.exp(param)) / 60.
        elif cp.marker == 5:
            param = (1. / (np.exp(h_map[:, :, 4] + h_map[:, :, 5]) -
                           np.exp(h_map[:, :, 4] - h_map[:, :, 5]))) / 60.

        if not cp.mask_bool:
            cp.ax2_title.set_text('Histogram: %s' % titles[cp.marker])
            pflat = param.flatten()
            h_color = 'black'

        elif cp.mask_bool:
            cp.ax2_title.set_text('Histogram: %s | Masked' % titles[cp.marker])
            h_color = 'red'

            # ---- generate p-value heatmap + masked Lorentzian component heatmaps
            dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
            p_val = ff.sf(h_map[:, :, 6], dof1, dof2)
            p_val[np.isnan(p_val)] = 1
            param_mask = np.copy(param)
            param_mask[p_val > cp.mask_val] = 0.
            pflat = param_mask.flatten()
            pflat = pflat[pflat != 0]

        pNaN = pflat[~np.isnan(pflat)]
        y, x, _ = cp.ax2.hist(pNaN,
                              bins=25,
                              edgecolor='black',
                              alpha=0.75,
                              color=h_color)  # need a set_data
        cp.ax2.set_xlabel('%s' % titles[cp.marker])
        cp.ax2.set_xlim(np.percentile(pNaN, 1), np.percentile(pNaN, 99))
        cp.ax2.set_ylim(0, y.max() * 1.1)
        cp.leg.set_visible(False)
        plt.draw()
Example #30
def proc_Dsuff(xlist,
               ylist,
               pltitle='DXY',
               Csuff=0.0,
               fname='DXY.png',
               Yval=1,
               opdirname='./',
               opcsv='OP',
               optxt='OT'):
    ssd = 0.0
    msd = 0.0
    df1 = 0.0
    F = 0.0
    PVAL = 0.0
    nullsd = 0.0
    #print( 'PDsuff OP Dirname:', opdirname )
    ssd = calc_ssd(xlist, ylist, pltitle, fname, Yval, opdirname)
    df1 = calc_df1(xlist, ylist)

    df2 = len(ylist)

    nullsd = calc_nullsd2(xlist, ylist, error_value)
    emsd = nullsd
    if (df1 > 0):
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)
    ## Only do calcs if DF1 > 0. Error otherwise.
    ## Write output to text file instead of on screen for GUI version.
    ## Spaces added for alignment.
    ProcLabel = os.path.splitext(fname)[0]
    ProcLabel = ProcLabel.replace('D', '')  # Remove D from Label.
    txtmsg = '{:>10s}'.format( ProcLabel ) + \
    ' {:>2d}'.format( Yval ) + \
    ' {:>4.3f}'.format( Csuff ) + \
    ' {:>8.3f}'.format( ssd ) + \
    ' {:>11.3f}'.format( F ) + \
    ' {:>3.2f}'.format( PVAL ) + \
    ' {:>3.2f}'.format( df1 ) + \
    '{:>3d}'.format( df2 ) + '\n'

    optxt.write(txtmsg)
    #print
    opcsv.writerow([
        ProcLabel, Yval, '{:>4.3f}'.format(Csuff), '{:>8.3f}'.format(ssd),
        '{:>11.3f}'.format(F), '{:>3.2f}'.format(PVAL), '{:>3.2f}'.format(df1),
        '{:>3d}'.format(df2)
    ])
Example #31
def proc_Dsuff(  xlist, ylist, pltitle = 'DXY', Csuff = 0.0, fname = 'DXY.png' ):
	ssd = 0.0
	msd = 0.0
	df1 = 0.0
	F   = 0.0
	PVAL = 0.0
	nullsd = 0.0
	ssd = calc_ssd(  xlist, ylist, pltitle, fname )
	df1 = calc_df1(  xlist, ylist )
	
	df2 = len( ylist )

	nullsd = calc_nullsd3( xlist, ylist, error_value )
	emsd = nullsd 
	if ( df1 > 0 ):
		msd = ssd/df1
		F = msd/emsd 
		PVAL = f.sf ( F, df1, df2, loc=0, scale=1 ) 
	#else:		print 'ERR - DF1 Div by Zero. F error.'
	## Only do calcs if DF1 > 0. Error otherwise.


	'''
	print 'Fname:', os.path.splitext( fname )[0]
	print 'SSD:', ssd
	print 'DF1:', df1
	print 'DF2:', df2
	print 'NULLSD:', nullsd
	print 'MSD:', msd
	print 'EMSD:', emsd
	print 'F:', F 
	print 'PVAL - SF:', PVAL
	print 'Csuff:', Csuff
	'''
	
	ProcLabel = os.path.splitext( fname )[0] 
	ProcLabel = ProcLabel.replace( 'D', '' ) # Remove D from Label.
	print '{:>10s}'.format( ProcLabel ), 
	print '{:>2d}'.format( Yval ),
	print '{:>4.3f}'.format( Csuff ),
	print '{:>8.3f}'.format( ssd ),
	print '{:>11.3f}'.format( F ),
	print '{:>3.2f}'.format( PVAL ),
	print '{:>3d}'.format( df2 )
	#print
	opcsv.writerow( [ ProcLabel, Yval, '{:>4.3f}'.format( Csuff ), '{:>8.3f}'.format( ssd ), 
	'{:>11.3f}'.format( F ), '{:>3.2f}'.format( PVAL ), '{:>3d}'.format( df2 ) ] )
Example #32
def histMask(p):
    global mask_val
    global hist0

    param = h_map[:, :, p]

    # generate p-value heatmap + masked Lorentzian component heatmaps
    dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
    p_val = ff.sf(h_map[:, :, 6], dof1, dof2)
    param_mask = np.copy(param)
    param_mask[p_val > mask_val] = 0.
    param1d = np.reshape(param_mask,
                         (param_mask.shape[0] * param_mask.shape[1]))
    pmask = param1d[param1d != 0]
    pmask = pmask[~np.isnan(pmask)]
    title.set_text('Histogram: %s | Masked' % titles[marker])
    hist0 = ax2.hist(pmask, bins=25, edgecolor='black')
Example #33
def cal_fullmodel(data_df, out_col, consist_col, rank, RSS):
    """

    This function calculates rsquared, rsquared_adj, fvalue, f_pvalue, and
    the DoF of the F-test for the full model (the data before the demean
    process).

    """
    TSS = sum(((data_df[out_col] - data_df[out_col].mean())**2).values)[0]
    rsquared = 1 - RSS / TSS
    rsquared_adj = 1 - (len(data_df) - 1) / (len(data_df) - len(consist_col) -
                                             rank) * (1 - rsquared)
    fvalue = (TSS - RSS) * (len(data_df) - len(consist_col) -
                            rank) / (RSS * (rank + len(consist_col) - 1))
    f_pvalue = f.sf(fvalue, (rank + len(consist_col) - 1),
                    (len(data_df) - len(consist_col) - rank))
    f_df = [(rank + len(consist_col) - 1),
            (len(data_df) - len(consist_col) - rank)]
    return rsquared, rsquared_adj, fvalue, f_pvalue, f_df
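
Written generically (a sketch in my notation, with n observations and q model degrees of freedom beyond the intercept; here q = rank + len(consist_col) - 1), the overall F-test computed above is:

$$F=\frac{(\mathrm{TSS}-\mathrm{RSS})/q}{\mathrm{RSS}/(n-q-1)},\qquad p=\Pr\{F_{q,\,n-q-1}>F\}$$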
Example #34
 def calAnova(self, x, y):
     """
     Input
     x: variable, 1D array
     y: actual labels, 1D array
     return
     DataFrame
     """
     m = len(y)
     yValues = np.unique(y)
     n = len(yValues)
     xbar = x.mean()
     ximean = []                         # list of per-class means
     xicount = []                        # list of per-class sample counts
     # degrees of freedom
     dfList = [n-1, m-n, m-1]            # (between, within, total)
     # sums of squared deviations
     ## within groups
     SSList = []
     SSw = 0
     for value in yValues:
         xi = x[y==value]
         xicount.append(len(xi))         # sample count per class
         xmean = xi.mean()
         ximean.append(xmean)            # mean per class
         SSw += np.power((xi-xmean), 2).sum()
     ## between groups
     SSb = np.dot(np.power((ximean-xbar), 2), xicount)
     ## total
     SSt = SSw + SSb
     SSList = [SSb, SSw, SSt]
     # mean squares
     MSList = [SSb/dfList[0], SSw/dfList[1]]
     # F value and p-value
     Fvalue = MSList[0]/MSList[1]
     pValue = f.sf(Fvalue, dfList[0], dfList[1])
     # assemble the result DataFrame
     df = pd.DataFrame(index=['Between', 'Within', 'Total'], columns=['df', 'SS', 'MS', 'F', 'p'])
     df.iloc[:,0] = dfList
     df.iloc[:,1] = SSList
     df.iloc[:2,2] = MSList
     df.iloc[0,3] = Fvalue
     df.iloc[0,4] = pValue
     return df
Example #35
    def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
                 df_num=None, alpha=0.05, **kwds):

        self.effect = effect  # Let it be None for F
        if F is not None:
            self.distribution = 'F'
            self.fvalue = F
            self.statistic = self.fvalue
            self.df_denom = df_denom
            self.df_num = df_num
            self.dist = fdist
            self.dist_args = (df_num, df_denom)
            self.pvalue = fdist.sf(F, df_num, df_denom)
        elif t is not None:
            self.distribution = 't'
            self.tvalue = t
            self.statistic = t  # generic alias
            self.sd = sd
            self.df_denom = df_denom
            self.dist = student_t
            self.dist_args = (df_denom,)
            self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2
        elif 'statistic' in kwds:
            # TODO: currently targeted to normal distribution, and chi2
            self.distribution = kwds['distribution']
            self.statistic = kwds['statistic']
            self.tvalue = value = kwds['statistic']  # keep alias
            # TODO: for results instance we decided to use tvalues also for normal
            self.sd = sd
            self.dist = getattr(stats, self.distribution)
            self.dist_args = kwds.get('dist_args', ())
            if self.distribution == 'chi2':
                self.pvalue = self.dist.sf(self.statistic, df_denom)
                self.df_denom = df_denom
            else:
                "normal"
                self.pvalue = np.full_like(value, np.nan)
                not_nan = ~np.isnan(value)
                self.pvalue[not_nan] = self.dist.sf(np.abs(value[not_nan])) * 2

        # cleanup
        # should we return python scalar?
        self.pvalue = np.squeeze(self.pvalue)
Example #36
def cal_fullmodel(data_df, out_col, consist_col, category_col, rank, RSS,
                  originRSS):
    """

    This function calculates rsquared, rsquared_adj, fvalue, f_pvalue, and
    the DoF of the F-test for the full model (the data before the demean
    process).

    """
    k0 = 0
    if ('const' in consist_col):
        k0 = 1

    if k0 == 0 and category_col == []:
        TSS = sum(((data_df[out_col])**2).values)[0]
    else:
        TSS = sum(((data_df[out_col] - data_df[out_col].mean())**2).values)[0]

    RSS = float("{:.6f}".format(RSS))
    TSS = float("{:.6f}".format(TSS))
    originRSS = float("{:.6f}".format(originRSS))

    rsquared = 1 - RSS / TSS
    #rsquared_adj = 1 - (len(data_df) - 1) / (len(data_df) - len(consist_col) - rank) * (1 - rsquared)
    if category_col != []:
        rsquared_adj = 1 - len(data_df) / (len(data_df) - len(consist_col) -
                                           rank + k0) * (1 - rsquared)
    else:
        rsquared_adj = 1 - (len(data_df) - k0) / (
            len(data_df) - len(consist_col)) * (1 - rsquared)

    df_full_model = rank + len(consist_col) - k0

    if df_full_model > 0:
        fvalue = (TSS - originRSS) * (len(data_df) - len(consist_col) -
                                      rank) / (RSS * df_full_model)
    else:
        fvalue = 0
    f_pvalue = f.sf(fvalue, (rank + len(consist_col) - k0),
                    (len(data_df) - len(consist_col) - rank + k0))
    f_df = [(rank + len(consist_col) - k0),
            (len(data_df) - len(consist_col) - rank + k0)]

    return rsquared, rsquared_adj, fvalue, f_pvalue, f_df
Example #37
def granger_causality(df, max_lag, pval=0.05, autocausation=True):
    '''
    Calculates the causality between time series pairs in `df` using the 
    Granger Causality technique. 

    Arguments:
        df: A pandas dataframe of shape (N x p)
        max_lag: Number of past lagged inputs to include in the linear model (int)
        pval: Type 1 error for rejecting null hypothesis of no causality. 

    Returns:
        A binary causality matrix.
    '''
    # Create numpy input and output arrays for VAR modelling from dataframe
    X, Y = _create_dataset_vector_output(df, max_lag)

    assert len(X) == len(Y)

    # Infer data dimensions
    N, p = Y.shape

    # Calculate RSS for full model
    RSS_full = np.expand_dims(_calc_RSS(X, Y), axis=1)

    # Calculate RSS for reduced models
    RSS = np.zeros((p, p))
    for i in range(p):
        RSS[:, i] = _calc_RSS(
            np.delete(X, slice(i * max_lag, (i + 1) * max_lag), axis=1), Y)

    # Calculate f-stats
    f_stats = ((RSS - RSS_full) / max_lag) / (RSS_full / (N - p * max_lag))

    # Binarize values based on significance
    causalities = f.sf(f_stats, max_lag, N - p * max_lag) <= pval

    if not autocausation:
        for i in range(len(causalities)):
            causalities[i, i] = False

    return causalities.astype(float)
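
A minimal usage sketch, assuming the helpers _create_dataset_vector_output and _calc_RSS are defined as in the original project (the data and dimensions are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(500, 3)), columns=['x1', 'x2', 'x3'])
C = granger_causality(df, max_lag=2, pval=0.05, autocausation=False)
# C[i, j] == 1.0 flags that lags of series j improve the prediction of series i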
Example #38
    def test(self, x, y):
        r"""
        Calculates the Hotelling :math:`T^2` test statistic and p-value.

        Parameters
        ----------
        x,y : ndarray
            Input data matrices. ``x`` and ``y`` must have the same number of
            dimensions. That is, the shapes must be ``(n, p)`` and ``(m, p)`` where
            `n` and `m` are the numbers of samples and `p` is the number of
            dimensions.

        Returns
        -------
        stat : float
            The computed Hotelling :math:`T^2` statistic.
        pvalue : float
            The computed Hotelling :math:`T^2` p-value.

        Examples
        --------
        >>> import numpy as np
        >>> from hyppo.ksample import Hotelling
        >>> x = np.arange(7)
        >>> y = x
        >>> stat, pvalue = Hotelling().test(x, y)
        >>> '%.3f, %.1f' % (stat, pvalue)
        '0.000, 1.0'
        """
        check_input = _CheckInputs(inputs=[x, y], )
        x, y = check_input()

        stat = self.statistic(x, y)
        nx, p = x.shape
        ny = y.shape[0]
        pvalue = f.sf(stat, p, nx + ny - p - 1)
        self.stat = stat
        self.pvalue = pvalue
        self.null_dist = None

        return stat, pvalue
Example #39
def plotMask(p):
    global mask_val
    global cbar1
    global im
    cbar1.remove()
    im.remove()

    param = h_map[:, :, p]
    pflat = np.reshape(param, (param.shape[0] * param.shape[1]))
    pNaN = pflat[~np.isnan(pflat)]
    h_min = np.percentile(pNaN, 1)
    h_max = np.percentile(pNaN, 99)

    # generate p-value heatmap + masked Lorentzian component heatmaps
    dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
    p_val = ff.sf(h_map[:, :, 6], dof1, dof2)
    param_mask = np.copy(param)
    param_mask[p_val > mask_val] = np.NaN

    # determine percentage of region masked
    count = np.count_nonzero(np.isnan(param_mask))
    total_pix = p_val.shape[0] * p_val.shape[1]
    mask_percent = ((np.float(count)) / total_pix) * 100

    if p == 4:
        c_map = 'jet_r'
    else:
        c_map = 'jet'
    im = ax1.imshow(param_mask,
                    cmap=c_map,
                    interpolation='nearest',
                    vmin=h_min,
                    vmax=h_max,
                    picker=True)
    ax1.set_title(r'%s | $f_{masked}$ = %0.1f%s' %
                  (titles[p], mask_percent, '%'),
                  y=1.01,
                  fontsize=17)
    colorBar()
Example #40
def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
    """Given the appropriate sum of squares for the numerator and the mean sum
    of squares for the denominator (with respective degrees of freedom) this
    will return the relevant statistics of an F-test.

    Arguments:
        numer_ss - Sum of squares for the numerator.
        numer_df - Degrees of freedom for the numerator.
        denom_ms - Mean sum of squares for the denominator.
        denom_df - Degrees of freedom for the denominator.

    Returns:
        A tuple of three values: Element 0 contains the mean sum of squares for
        the numerator, element 1 contains the F statistic calculated, and 
        element 2 contains the associated p-value for the generated
        F-statistic.
    """
    numer_ms = numer_ss / numer_df
    denom_ms = denom_ss / denom_df
    f_val = numer_ms / denom_ms
    p_val = f.sf(f_val, numer_df, denom_df)
    return f_val, p_val
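
A worked call (illustrative numbers): a between-groups sum of squares of 12.5 on 2 df tested against a within-groups sum of squares of 81.0 on 27 df.

f_val, p_val = _calc_stats(numer_ss=12.5, numer_df=2, denom_ss=81.0, denom_df=27)
# f_val = (12.5 / 2) / (81.0 / 27) ≈ 2.083; p_val = f.sf(f_val, 2, 27)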
Example #41
def f_test(chi1, chi2, ndata, par1, par2, alpha=0.02, verbose=True):
    '''Do the F-test. Function from Sepideh. Returns 0 if the model with fewer
    params is better, 1 if the null hypothesis is rejected and the extra
    params give a real gain.'''
    # chi1_red = chi1 / (ndata - par1)
    chi2_red = chi2 / (ndata - par2)

    F = ((chi1 - chi2) / (par2 - par1)) / chi2_red  # Fstatistic
    # p-value threshold = 0.005
    alpha = 0.005  # 0.01 #Or whatever you want your alpha to be.

    p_value = f.sf(F, par2 - par1, ndata - par2, loc=0, scale=1)
    print("p_value", p_value)
    print("log(p_value)", np.log10(p_value))
    if p_value < alpha:
        better_model = 1
        print("Null hypothesis rejected. Model 1 with mre params is better.")
        print("Probability = ", (1.0 - p_value) * 100.0, '%')
    else:
        print("Null hypothesis cannot be rejected. \
        Seems model 0 with less params is good enough.")
        better_model = 0
    return better_model
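
A worked call (illustrative numbers), comparing a 10-parameter fit with chi^2 = 105.2 against a 12-parameter fit with chi^2 = 98.1 on 200 data points:

better = f_test(chi1=105.2, chi2=98.1, ndata=200, par1=10, par2=12)
# F = ((105.2 - 98.1) / 2) / (98.1 / 188) ≈ 6.80 on (2, 188) degrees of freedom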
Example #42
 def f_oneway(self, *args):
     if args==None or len(args)==0 or len(args[0])==0:
         return[None,None]
     r = len(args)
     c = len(args[0])
     meanr=[]
     mean,sa,se=0.0,0.0,0.0
     for i in args:
         meanr.append(sum(i)/float(len(i)))
         mean+=sum(i)
     mean/=(r*c)
     for i in range(r):
         sa+=c*(meanr[i]-mean)**2
         for j in range(c):
             se+=(args[i][j]-meanr[i])**2
     fa=r-1
     fe=r*c-r
     va=sa/fa
     ve=se/fe
     F=va/ve
     P=f.sf(F,fa,fe)
     return[round(F,6),round(P,6)]
Example #43
## Depict P(F(1, n) > F), i.e. color the surface defined by x values larger
## than F below the F(1, n) density. (fval, n, x, y and salary are assumed
## defined earlier in the tutorial this fragment comes from.)
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f
fvalues = np.linspace(10, 25, 100)

plt.plot(fvalues, f.pdf(fvalues, 1, 30), 'b-', label="F(1, 30)")

upper_fval_fvalues = fvalues[fvalues > fval]
plt.fill_between(upper_fval_fvalues, 0, f.pdf(upper_fval_fvalues, 1, 30), alpha=.8)

# pdf(x, df1, df2): Probability density function at x of the given RV.
plt.legend()

## P(F(1, n) > F) is the p-value, compute it

# Survival function (1 - `cdf`)
pval = f.sf(fval, 1, n - 2)



## With statmodels
from statsmodels.formula.api import ols
model = ols('S ~ X', salary)
results = model.fit()
print(results.summary())

## sklearn
import sklearn.feature_selection
#sklearn.feature_selection.f_regression??
sklearn.feature_selection.f_regression(x.reshape((n, 1)), y)

"""
Example #44
    def granger_causality(self):
        """Returns the f-stats and p-values from the Granger Causality Test.

        If the data consists of columns x1, x2, x3, then we perform the
        following regressions:

        x1 ~ L(x2, x3)
        x1 ~ L(x1, x3)
        x1 ~ L(x1, x2)

        The f-stats of these results are placed in the 'x1' column of the
        returned DataFrame.  We then repeat for x2, x3.

        Returns
        -------
        Dict, where 'f-stat' returns the DataFrame containing the f-stats,
        and 'p-value' returns the DataFrame containing the corresponding
        p-values of the f-stats.
        """
        from pandas.stats.api import ols
        from scipy.stats import f

        d = {}
        for col in self._columns:
            d[col] = {}
            for i in xrange(1, 1 + self._p):
                lagged_data = self._lagged_data[i].filter(self._columns - [col])

                for key, value in lagged_data.iteritems():
                    d[col][_make_param_name(i, key)] = value

        f_stat_dict = {}
        p_value_dict = {}

        for col, y in self._data.iteritems():
            ssr_full = (self.resid[col] ** 2).sum()

            f_stats = []
            p_values = []

            for col2 in self._columns:
                result = ols(y=y, x=d[col2])

                resid = result.resid
                ssr_reduced = (resid ** 2).sum()

                M = self._p
                N = self._nobs
                K = self._k * self._p + 1
                f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
                f_stats.append(f_stat)

                p_value = f.sf(f_stat, M, N - K)
                p_values.append(p_value)

            f_stat_dict[col] = Series(f_stats, self._columns)
            p_value_dict[col] = Series(p_values, self._columns)

        f_stat_mat = DataFrame(f_stat_dict)
        p_value_mat = DataFrame(p_value_dict)

        return {
            'f-stat': f_stat_mat,
            'p-value': p_value_mat,
        }
Example #45
def typeII(response, ancova, recarray):
    """
    Produce an ANCOVA table
    from a given ANCOVA formula
    with type II sums of squares.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response
    """

    Y = recarray[response]
    X = ancova.formula.design(recarray, return_float=True)
    model = OLS(Y, X)
    results = model.fit()
    SSE_F = np.sum(results.resid**2)
    df_F = results.df_resid

    names = []
    sss = []
    fs = []
    dfs = []
    pvals = []

    for name, expr_factors in zip(ancova.contrast_names,
                                  ancova.sequence()):
        expr, factors = expr_factors
        F = ancova.all_but_above(expr, factors)
        C = ancova.contrasts[name]
        XF, contrast_matrices = F.formula.design(recarray, contrasts={'C':C})
        modelF = OLS(Y, XF)
        resultsF = modelF.fit()

        SSEF = np.sum(resultsF.resid**2)
        dfF = resultsF.df_resid
        ftest = resultsF.f_test(contrast_matrices['C'])

        SSER = SSEF + ftest.fvalue * ftest.df_num * (SSEF / dfF)
        dfR = dfF + ftest.df_num

        sss.append(SSER - SSEF)
        dfs.append(ftest.df_num)
        fs.append(((SSER - SSEF) / (dfR - dfF)) / (SSE_F / df_F))
        pvals.append(f_dbn.sf(fs[-1], dfR-dfF, df_F))
        names.append(name)

    # Add in the "residual row"

    sss.append(SSE_F)
    dfs.append(df_F)
    pvals.append(np.nan)
    fs.append(np.nan)
    names.append('Residuals')

    result = np.array(names, np.dtype([('contrast','S%d' % max([len(n) for n in names]))]))
    result = ML.rec_append_fields(result, ['SS', 'df', 'MS', 'F', 'p_value'], [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
    return result
Example #46
def typeI(response, ancova, recarray):
    """
    Produce an ANCOVA table
    from a given ANCOVA formula
    with type I sums of squares
    where the order is based on the order of terms
    in the contrast_names of ancova.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response
    """

    Y = recarray[response]
    X = ancova.formula.design(recarray, return_float=True)
    model = OLS(Y, X)
    results = model.fit()
    SSE_F = np.sum(results.resid**2)
    df_F = results.df_resid

    model = OLS(Y, ancova.formulae[0].design(recarray, return_float=True))
    results = model.fit()
    SSE_old = np.sum(results.resid**2)
    df_old = results.df_resid

    names = []
    sss = []
    fs = []
    dfs = []
    pvals = []

    names.append(ancova.contrast_names[0])
    fs.append(((np.sum(Y**2) - SSE_old) / (Y.shape[0] - df_old)) / (SSE_F / df_F))
    sss.append((np.sum(Y**2) - SSE_old))
    dfs.append(Y.shape[0] - df_old)
    pvals.append(f_dbn.sf(fs[-1], Y.shape[0]-df_old, df_F))

    for d in range(1,len(ancova.formulae)):
        terms = []
        for f in ancova.formulae[:(d+1)]:
            terms += list(f.terms)

        # JT: this is not numerically efficient
        # could be done by updating some factorization of the full X

        X = Formula(terms).design(recarray, return_float=True)
        model = OLS(Y, X)
        results = model.fit()
        SSE_new = np.sum(results.resid**2)
        df_new = results.df_resid

        sss.append(SSE_old - SSE_new)
        dfs.append(df_old - df_new)
        fs.append(((SSE_old-SSE_new) / (df_old - df_new)) / (SSE_F / df_F))
        pvals.append(f_dbn.sf(fs[-1], df_old-df_new, df_new))
        names.append(ancova.contrast_names[d])
        SSE_old = SSE_new
        df_old = df_new

    # Add in the "residual row"

    sss.append(SSE_new)
    dfs.append(df_new)
    pvals.append(np.nan)
    fs.append(np.nan)
    names.append('Residuals')

    result = np.array(names, np.dtype([('contrast','S%d' % max([len(n) for n in names]))]))
    result = ML.rec_append_fields(result, ['SS', 'df', 'MS', 'F', 'p_value'], [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
    return result
Example #47
def lomb(time, signal, error, f1, df, numf, nharm=8, psdmin=6., detrend_order=0,freq_zoom=10.,tone_control=1.,return_model=True,lambda0=1.,lambda0_range=[-8,6]):
    """
    C version of lomb_scargle:
    Simultaneous fit of a sum of sinusoids by weighted, linear least squares.
          model(t) = Sum_k Ck*t^k + Sum_i Sum_j Aij sin(2*pi*j*fi*(t-t0)+phij), i=[1,nfreq], j=[1,nharm]
           [t0 defined such that ph11=0]

    Inputs:
        time: time vector
        signal: data vector
        error: data uncertainty vector
        df: frequency step
        numf: number of frequencies to consider

        detrend_order: order of polynomial detrending (Ck orthogonal polynomial terms above;
            0 floating mean; <0 no detrending)

        psdmin: refine periodogram values with larger psd using multi-harmonic fit
        nharm: number of harmonics to use in refinement
        lambda0: typical value for regularization parameter (expert parameter)
        lambda0_range: allowable range for log10 of regularization parameter

    Output:
        psd: power spectrum on frequency grid: f1,f1+df,...,f1+numf*df
        out_dict: dictionary describing various parameters of the multiharmonic fit at
            the best-fit frequency
    """
    numt = len(time)

    freq_zoom = round(freq_zoom/2.)*2.

    dord = detrend_order
    if (detrend_order<0):
        dord=0

    if (tone_control<0):
        tone_control=0.

    # polynomial terms
    coef = empty(dord+1,dtype='float64')
    norm = empty(dord+1,dtype='float64')

    wth0 = (1./error).astype('float64')
    s0 = dot(wth0,wth0)
    wth0 /= sqrt(s0)

    cn = (signal*wth0).astype('float64')
    coef[0] = dot(cn,wth0); cn0 = coef[0]; norm[0] = 1.
    cn -= coef[0]*wth0
    vcn = 1.

    # sines and cosines for later
    tt = 2*pi*time.astype('float64')
    sinx,cosx = sin(tt*f1)*wth0,cos(tt*f1)*wth0
    sinx_step,cosx_step = sin(tt*df),cos(tt*df)
    sinx_back,cosx_back = -sin(tt*df/2.),cos(tt*df/2)
    sinx_smallstep,cosx_smallstep = sin(tt*df/freq_zoom),cos(tt*df/freq_zoom)

    npar=2*nharm
    hat_matr = empty((npar,numt),dtype='float64')
    hat0 = empty((npar,dord+1),dtype='float64')
    hat_hat = empty((npar,npar),dtype='float64')
    soln = empty(npar,dtype='float64')
    psd = zeros(numf,dtype='float64')

    # detrend the data and create the orthogonal detrending basis
    if (dord>0):
        wth = empty((dord+1,numt),dtype='float64')
        wth[0,:] = wth0
    else:
        wth = wth0

    for i in range(detrend_order):
        f = wth[i,:]*tt/(2*pi)
        for j in range(i+1):
            f -= dot(f,wth[j,:])*wth[j,:]
        norm[i+1] = sqrt(dot(f,f)); f /= norm[i+1]
        coef[i+1] = dot(cn,f)
        cn -= coef[i+1]*f
        wth[i+1,:] = f
        vcn += (f/wth0)**2


    chi0 = dot(cn,cn)
    varcn = chi0/(numt-1-dord)
    psdmin *= 2*varcn

    Tr = array(0.,dtype='float64')
    ifreq = array(0,dtype='int32')
    lambda0 = array(lambda0/s0,dtype='float64')
    lambda0_range = 10**array(lambda0_range,dtype='float64')/s0

    vars=['numt','numf','nharm','detrend_order','psd','cn','wth','sinx','cosx','sinx_step','cosx_step','sinx_back','cosx_back','sinx_smallstep','cosx_smallstep','hat_matr','hat_hat','hat0','soln','chi0','freq_zoom','psdmin','tone_control','lambda0','lambda0_range','Tr','ifreq']
    weave.inline(lomb_code, vars, support_code = eigs_code + lomb_scargle_support,force=0)

    hat_hat /= s0
    ii = arange(nharm,dtype='int32')
    soln[0:nharm] /= (1.+ii)**2; soln[nharm:] /= (1.+ii)**2
    if (detrend_order>=0):
        hat_matr0 = outer(hat0[:,0],wth0)
    for i in range(detrend_order):
        hat_matr0 += outer(hat0[:,i+1],wth[i+1,:])


    modl = dot(hat_matr.T,soln); modl0 = dot(hat_matr0.T,soln)
    coef0 = dot(soln,hat0)
    coef -= coef0
    if (detrend_order>=0):
        hat_matr -= hat_matr0

    out_dict={}
    out_dict['chi0'] = chi0*s0
    if (return_model):
        if (dord>0):
            out_dict['trend'] = dot(coef,wth)/wth0
        else:
            out_dict['trend'] = coef[0] + 0*wth0
        out_dict['model'] = modl/wth0 + out_dict['trend']

    j = psd.argmax()
    freq = f1+df*j + (ifreq/freq_zoom - 1/2.)*df
    tt = (time*freq) % 1. ; s =tt.argsort()
    out_dict['freq'] = freq
    out_dict['s0'] = s0
    out_dict['chi2'] = (chi0 - psd[j])*s0
    out_dict['psd'] = psd[j]*0.5/varcn
    out_dict['lambda0'] = lambda0*s0
    out_dict['gcv_weight'] = (1-3./numt)/Tr
    out_dict['trace'] = Tr
    out_dict['nu0'] = numt - npar
    npars = (1-Tr)*numt/2
    out_dict['nu'] = numt-npars
    out_dict['npars'] = npars

    A0, B0 = soln[0:nharm],soln[nharm:]
    hat_hat /= outer( hstack(((1.+ii)**2,(1.+ii)**2)),hstack(((1.+ii)**2,(1.+ii)**2)) )
    err2 = diag(hat_hat)
    vA0, vB0 = err2[0:nharm], err2[nharm:]
    covA0B0 = hat_hat[(ii,nharm+ii)]

    if (return_model):
        vmodl = vcn/s0 + dot( (hat_matr/wth0).T, dot(hat_hat, hat_matr/wth0) )
        vmodl0 = vcn/s0 + dot( (hat_matr0/wth0).T, dot(hat_hat, hat_matr0/wth0) )
        out_dict['model_error'] = sqrt(diag(vmodl))
        out_dict['trend_error'] = sqrt(diag(vmodl0))

    amp = sqrt(A0**2+B0**2)
    damp = sqrt( A0**2*vA0 + B0**2*vB0 + 2.*A0*B0*covA0B0 )/amp
    phase = arctan2( B0,A0 )
    rel_phase = phase - phase[0]*(1.+ii)
    rel_phase = arctan2( sin(rel_phase),cos(rel_phase) )
    dphase = 0.*rel_phase
    for i in range(nharm-1):
        j=i+1
        v = array([-A0[0]*(1.+j)/amp[0]**2,B0[0]*(1.+j)/amp[0]**2,A0[j]/amp[j]**2,-B0[j]/amp[j]**2])
        jj=array([0,nharm,j,j+nharm])
        m = hat_hat[ix_(jj,jj)]
        dphase[j] = sqrt( dot(dot(v,m),v) )

    out_dict['amplitude'] = amp
    out_dict['amplitude_error'] = damp
    out_dict['rel_phase'] = rel_phase
    out_dict['rel_phase_error'] = dphase
    out_dict['time0'] = -phase[0]/(2*pi*freq)

    ncp = norm.cumprod()
    out_dict['trend_coef'] = coef/ncp
    out_dict['cn0'] = out_dict['trend_coef'][0] - cn0
    out_dict['trend_coef_error'] = sqrt( ( 1./s0 + diag(dot(hat0.T,dot(hat_hat,hat0))) )/ncp**2 )
    out_dict['cn0_error'] = out_dict['trend_coef_error'][0]

    prob = fdist.sf( 0.5*(numt-1.-dord)*(1.-out_dict['chi2']/out_dict['chi0']), 2,numt-1-dord )
    out_dict['signif'] = lprob2sigma(log(prob))

    return 0.5*psd/varcn,out_dict