def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png'):
    """Compute two F-tests for one data set and report them on stdout and CSV.

    Two error estimates are used side by side:

    * the primary values (``F``, ``PVAL``) use the null SD returned by
      ``calc_nullsd1(measurement_error_mean, sigma_mem)`` (the original
      comment says MEM and Sigma are input on the command line);
    * the suffix-2 values (``F2``, ``PVAL2``) use the older error-value
      calculation ``calc_nullsd3(xlist, ylist, error_value)``.

    ``Yval``, ``opcsv``, ``measurement_error_mean``, ``sigma_mem`` and
    ``error_value`` are read from the enclosing (module) scope.

    Parameters:
        xlist, ylist: data passed through to ``calc_ssd`` / ``calc_df1``.
        pltitle: title forwarded to ``calc_ssd``.
        Csuff: value echoed in the report (formatted with three decimals).
        fname: file name forwarded to ``calc_ssd``; its stem, with every
            'D' removed, is used as the row label.

    Returns:
        None. One row is printed and one row is written via ``opcsv``.
    """
    ssd = calc_ssd(xlist, ylist, pltitle, fname)
    df1 = calc_df1(xlist, ylist)
    df2 = len(ylist)
    emsd = calc_nullsd1(measurement_error_mean, sigma_mem)
    emsd2 = calc_nullsd3(xlist, ylist, error_value)

    # Defaults that are reported when df1 <= 0 (no F-test possible).
    msd = 0.0
    F = 0.0
    F2 = 0.0
    PVAL = 0.0
    PVAL2 = 0.0
    if df1 > 0:
        # The original ran two identical `if df1 > 0` blocks and computed
        # msd twice; both tests share the same numerator mean square.
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)
        F2 = msd / emsd2
        PVAL2 = f.sf(F2, df1, df2, loc=0, scale=1)

    # Remove D from Label.
    ProcLabel = os.path.splitext(fname)[0].replace('D', '')

    fields = [
        '{:>10s}'.format(ProcLabel),
        '{:>2d}'.format(Yval),
        '{:>4.3f}'.format(Csuff),
        '{:>8.3f}'.format(ssd),
        '{:>8.2f}'.format(msd),
        '{:>11.2f}'.format(emsd),
        '{:>16.3f}'.format(F),
        '{:>5.2f}'.format(PVAL),
        '{:>11.3f}'.format(F2),
        '{:>5.2f}'.format(PVAL2),
        '{:>3d}'.format(df2),
    ]
    # A single parenthesised argument prints identically on Python 2 and 3;
    # the result matches the old chain of `print x,` statements (fields
    # separated by one space, newline at the end).
    print(' '.join(fields))

    # NOTE: the CSV deliberately keeps its own (narrower) widths for emsd and F.
    opcsv.writerow([
        ProcLabel,
        Yval,
        '{:>4.3f}'.format(Csuff),
        '{:>8.3f}'.format(ssd),
        '{:>8.2f}'.format(msd),
        '{:>8.2f}'.format(emsd),
        '{:>11.3f}'.format(F),
        '{:>5.2f}'.format(PVAL),
        '{:>11.3f}'.format(F2),
        '{:>5.2f}'.format(PVAL2),
        '{:>3d}'.format(df2),
    ])
def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png'):
    """Run the new-style and old-style F-tests and emit one report row.

    The primary statistics (``F``, ``PVAL``) are based on
    ``calc_nullsd1(measurement_error_mean, sigma_mem)``; the suffix-2
    statistics (``F2``, ``PVAL2``) are based on the old error-value
    calculation ``calc_nullsd3(xlist, ylist, error_value)``.

    Names read from the enclosing (module) scope: ``Yval``, ``opcsv``,
    ``measurement_error_mean``, ``sigma_mem``, ``error_value``.

    Parameters:
        xlist, ylist: data forwarded to ``calc_ssd`` and ``calc_df1``.
        pltitle: title forwarded to ``calc_ssd``.
        Csuff: value echoed in the output row.
        fname: forwarded to ``calc_ssd``; its stem without any 'D'
            characters becomes the row label.

    Returns:
        None. Output goes to stdout and to ``opcsv.writerow``.
    """
    ssd = calc_ssd(xlist, ylist, pltitle, fname)
    df1 = calc_df1(xlist, ylist)
    df2 = len(ylist)
    emsd = calc_nullsd1(measurement_error_mean, sigma_mem)
    emsd2 = calc_nullsd3(xlist, ylist, error_value)

    # Reported unchanged when df1 <= 0, i.e. when no F ratio can be formed.
    msd = F = F2 = PVAL = PVAL2 = 0.0
    if df1 > 0:
        # One guarded block instead of two identical ones; msd is shared.
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)
        F2 = msd / emsd2
        PVAL2 = f.sf(F2, df1, df2, loc=0, scale=1)

    ProcLabel = os.path.splitext(fname)[0]
    ProcLabel = ProcLabel.replace('D', '')  # Remove D from Label.

    csuff_txt = '{:>4.3f}'.format(Csuff)
    ssd_txt = '{:>8.3f}'.format(ssd)
    msd_txt = '{:>8.2f}'.format(msd)
    pval_txt = '{:>5.2f}'.format(PVAL)
    f2_txt = '{:>11.3f}'.format(F2)
    pval2_txt = '{:>5.2f}'.format(PVAL2)
    df2_txt = '{:>3d}'.format(df2)

    # print(<one string>) behaves the same on Python 2 and Python 3 and
    # reproduces the output of the former `print x,` chain exactly.
    print(' '.join([
        '{:>10s}'.format(ProcLabel),
        '{:>2d}'.format(Yval),
        csuff_txt,
        ssd_txt,
        msd_txt,
        '{:>11.2f}'.format(emsd),
        '{:>16.3f}'.format(F),
        pval_txt,
        f2_txt,
        pval2_txt,
        df2_txt,
    ]))

    # The CSV row uses narrower widths for emsd and F than the screen row.
    opcsv.writerow([
        ProcLabel,
        Yval,
        csuff_txt,
        ssd_txt,
        msd_txt,
        '{:>8.2f}'.format(emsd),
        '{:>11.3f}'.format(F),
        pval_txt,
        f2_txt,
        pval2_txt,
        df2_txt,
    ])
def f_one(*args):
    """One-way ANOVA F-test across the groups given as positional arguments.

    Parameters:
        *args: one sequence of numbers per group. Groups may have
            different lengths.

    Returns:
        ``[F, p]`` where ``p = f.sf(F, k - 1, N - k)`` for ``k`` groups and
        ``N`` observations in total, or ``[None, None]`` when the test
        cannot be formed (fewer than two groups, an empty group, or no
        within-group degrees of freedom).

    Raises:
        ZeroDivisionError: if the within-group sum of squares is exactly
            zero and the values are plain Python numbers.
    """
    a = list(args)
    m = len(a)
    # len() rather than truthiness so that array-like groups work as well.
    if m < 2 or any(len(ai) == 0 for ai in a):
        return [None, None]
    n_total = sum(len(ai) for ai in a)
    dfn = m - 1
    dfd = n_total - m
    if dfd <= 0:
        return [None, None]

    # Grand mean over every observation; equals mean(a) for equal-size
    # groups but stays correct when the group sizes differ.
    x_bar = mean([x for ai in a for x in ai])
    x_ib = [mean(ai) for ai in a]

    s_e = 0.0  # within-group (error) sum of squares
    s_t = 0.0  # total sum of squares
    for ai, group_mean in zip(a, x_ib):
        for x in ai:
            s_e += (x - group_mean) ** 2
            s_t += (x - x_bar) ** 2
    s_a = s_t - s_e  # between-group sum of squares

    va = s_a / dfn
    ve = s_e / dfd
    fv = va / ve
    p = f.sf(fv, dfn=dfn, dfd=dfd)
    return [fv, p]
def test_scipy_f():
    """Check f_sf / f_cdf against scipy's F distribution.

    A fixed-seed sample of 100 normal draws (scaled by 4, so it contains
    negative values too) is evaluated for every pair of degrees of freedom
    in 1..14.
    """
    sampler = np.random.RandomState(20120407)
    x = sampler.normal(size=(100)) * 4
    dof_values = np.arange(1, 15)
    for m in dof_values:
        for n in dof_values:
            expected_sf = f.sf(x, m, n)
            expected_cdf = f.cdf(x, m, n)
            assert_array_almost_equal(f_sf(x, m, n), expected_sf)
            assert_array_almost_equal(f_cdf(x, m, n), expected_cdf)
def f_oneway(self, *args):
    """One-way ANOVA for equally sized groups.

    Uses ``self.mean(args)`` for the grand mean and ``self.meani(group)``
    for each group mean. The group size is taken from the first group,
    so all groups are assumed to have that length.

    Parameters:
        *args: one sequence of numbers per group.

    Returns:
        ``[F, p]`` rounded to six decimals, or ``[None, None]`` when the
        test cannot be formed (fewer than two groups, or fewer than two
        observations per group, which leaves no degrees of freedom).
    """
    m = len(args)
    if m < 2:
        # m == 1 previously divided by (m - 1) == 0.
        return [None, None]
    r = len(args[0])
    if r < 2:
        # r == 1 previously divided by (n - m) == 0.
        return [None, None]
    n = r * m

    grand_mean = self.mean(args)
    # Computed once per group; the original called self.meani() again for
    # every single element inside the error-sum loop.
    group_means = [self.meani(group) for group in args]

    sa = 0
    for group_mean in group_means:
        sa = sa + (group_mean - grand_mean) ** 2
    sa = sa * r
    va = sa / (m - 1)

    se = 0
    for group, group_mean in zip(args, group_means):
        for value in group:
            se = se + (value - group_mean) ** 2
    ve = se / (n - m)

    f_stat = va / ve
    p = F.sf(f_stat, m - 1, n - m)
    return [round(f_stat, 6), round(p, 6)]
def f_one(*args):
    """Compute the one-way ANOVA F statistic and its p-value.

    Parameters:
        *args: one sequence of numbers per group; group lengths may differ.

    Returns:
        ``[F, p]`` with ``p = f.sf(F, k - 1, N - k)`` (``k`` groups, ``N``
        observations overall). ``[None, None]`` is returned when there are
        fewer than two groups, when any group is empty, or when
        ``N - k <= 0``.

    Raises:
        ZeroDivisionError: if every group is constant (zero within-group
            sum of squares) and the values are plain Python numbers.
    """
    groups = list(args)
    k = len(groups)
    if k < 2:
        return [None, None]
    for group in groups:
        # len() keeps this usable for array-like groups, unlike `not group`.
        if len(group) == 0:
            return [None, None]

    total = sum(len(group) for group in groups)
    dof_between = k - 1
    dof_within = total - k
    if dof_within <= 0:
        return [None, None]

    # Pool all values for the grand mean so unequal group sizes are
    # weighted correctly (identical to mean(groups) for equal sizes).
    pooled = [value for group in groups for value in group]
    x_bar = mean(pooled)
    group_means = [mean(group) for group in groups]

    s_e = 0.0  # error (within-group) sum of squares
    s_t = 0.0  # total sum of squares
    for group, centre in zip(groups, group_means):
        for value in group:
            s_e += (value - centre)**2
            s_t += (value - x_bar)**2
    s_a = s_t - s_e  # between-group sum of squares

    fv = (s_a / dof_between) / (s_e / dof_within)
    p = f.sf(fv, dfn=dof_between, dfd=dof_within)
    return [fv, p]
def var_homo_test_base(n, var1, var2):
    """F-test for equality of two variances estimated from samples of size n.

    The larger variance is always placed in the numerator, so the returned
    ratio is >= 1 for ordinary inputs. Both degrees of freedom are n - 1.

    Returns:
        (F, p) where p is the upper-tail probability ``f.sf(F, n-1, n-1)``.
    """
    if var1 < var2:
        larger, smaller = var2, var1
    else:
        larger, smaller = var1, var2
    ratio = larger / smaller
    dof = n - 1
    return ratio, f.sf(ratio, dof, dof)
def plotMap(self, p):
    """Show parameter layer ``p`` of ``h_map`` in the main image, optionally masked.

    Reads the module-level ``h_map``, ``titles``, ``cp`` and ``ff``. When
    ``cp.mask_bool`` is set, pixels whose p-value (computed with ``ff.sf``
    from layer 6 of ``h_map``) exceeds ``cp.mask_val`` are replaced by NaN
    and the masked percentage is shown in the title.

    Parameters:
        p: index of the layer of ``h_map`` to display. Layer 4 is drawn
            with the reversed colour map.
    """
    param = h_map[:, :, p]
    h_min, h_max = vminVmax(param)
    c_map = 'jet_r' if p == 4 else 'jet'

    if cp.mask_bool:
        # generate p-value heatmap + masked Lorentzian component heatmaps
        dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
        p_val = ff.sf(h_map[:, :, 6], dof1, dof2)
        p_val[np.isnan(p_val)] = 1  # undefined p-values are always masked
        param_mask = np.copy(param)
        # np.nan / float(): the aliases np.NaN and np.float no longer
        # exist in current NumPy releases.
        param_mask[p_val > cp.mask_val] = np.nan

        # determine percentage of region masked (NaNs already present in
        # the parameter layer are counted as well)
        count = np.count_nonzero(np.isnan(param_mask))
        total_pix = p_val.shape[0] * p_val.shape[1]
        mask_percent = (float(count) / total_pix) * 100

        cp.ax1_title.set_text(r'%s | $f_{masked}$ = %0.1f%s' % (titles[p], mask_percent, '%'))
        cp.im.set_data(param_mask)
    else:
        cp.ax1_title.set_text(r'%s' % titles[p])
        cp.im.set_data(param)

    cp.im.set_clim(h_min, h_max)
    cp.im.set_cmap(c_map)
    cp.colorBar()
def f_oneway(self, *args):
    """One-way ANOVA F-test; groups may have different sizes.

    Parameters:
        *args: one sequence of numbers per group.

    Returns:
        ``[F, P]`` rounded to six decimals, where
        ``P = f.sf(F, k - 1, N - k)`` for ``k`` groups and ``N`` values in
        total. ``[None, None]`` is returned for fewer than two groups, for
        an empty group, or when ``N - k <= 0``.

    Raises:
        ZeroDivisionError: if the within-group sum of squares is zero.
    """
    k = len(args)
    if k < 2:
        return [None, None]
    sizes = [len(group) for group in args]
    total = sum(sizes)
    if 0 in sizes or total <= k:
        return [None, None]

    meanr = [float(sum(group)) / len(group) for group in args]
    # Grand mean of all observations. The original divided by
    # len(args) * len(args[0]), which is only right for equal group sizes.
    mean = 0.0
    for group in args:
        mean += sum(group)
    mean /= total

    sa = 0.0  # between-group sum of squares, weighted by group size
    for size, group_mean in zip(sizes, meanr):
        sa += size * (group_mean - mean)**2

    se = 0.0  # within-group sum of squares over every observation
    for group, group_mean in zip(args, meanr):
        for value in group:
            se += (value - group_mean)**2

    fa = k - 1
    fe = total - k
    va = sa / fa
    ve = se / fe
    F = va / ve
    P = f.sf(F, fa, fe)
    return [round(F, 6), round(P, 6)]
def f_classif(X, y):
    """Per-feature one-way ANOVA F-test of the columns of X grouped by y.

    Parameters:
        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like of group labels, one per row of X.

    Returns:
        (F, pval): two arrays with one entry per feature, where
        ``pval = f.sf(F, k - 1, N - k)`` for ``k`` groups and ``N`` rows.

    Progress messages with a timestamp are printed for each stage.
    """
    # TODO: memory usage is still not where it should be (especially the
    # comprehensions).
    X = np.asarray(X)
    groups, mask, counts = np.unique(y, return_inverse=True, return_counts=True)
    n_groups = len(groups)

    # Group means and grand mean. The total sum of squares itself is not
    # needed for the F score, so it is no longer computed.
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SStotal...")
    gmeans_ = np.array(
        [X[mask == g, :].mean(axis=0) for g in range(n_groups)])
    # The grand mean must be taken over all rows. Averaging the group
    # means (as before) is only correct when every group has the same size.
    means_ = X.mean(axis=0)

    # SSwithin
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SSwithin...")
    ssw_ = np.zeros(X.shape[1])
    for g in range(n_groups):
        # Accumulate group by group instead of stacking all partial sums.
        ssw_ = ssw_ + ((X[mask == g, :] - gmeans_[g])**2).sum(axis=0)

    # SSbetween
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "SSbetween...")
    ssb_ = (counts[:, np.newaxis] * (gmeans_ - means_)**2).sum(axis=0)

    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "Completing...")
    # DF (degree of freedom)
    k, N = n_groups, X.shape[0]
    DFbetween, DFwithin = k - 1, N - k

    # F-score
    MSbetween = ssb_ / DFbetween
    MSwithin = ssw_ / DFwithin
    F = MSbetween / MSwithin

    # p-value
    pval = f.sf(F, DFbetween, DFwithin)
    return F, pval
def var_var(self, alpha):
    """F-test comparing two variances from ``self.S1``/``self.S2``.

    Reads the sample standard deviations ``self.S1``, ``self.S2`` and the
    sample sizes ``self.n1``, ``self.n2``. The test statistic is
    ``f0 = S1**2 / S2**2`` with ``(n1 - 1, n2 - 1)`` degrees of freedom.

    Parameters:
        alpha: significance level.

    Returns:
        ``(H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2))``: three rejection
        flags, three p-values, and the confidence interval for the
        variance ratio.
    """
    n1, n2 = self.n1, self.n2
    f0 = self.S1**2 / self.S2**2
    lower_crit = f.ppf(alpha / 2., n1 - 1, n2 - 1)
    upper_crit = f.ppf(1 - alpha / 2., n1 - 1, n2 - 1)

    # hypothesis testing
    H1a = upper_crit < f0 or lower_crit > f0
    # NOTE(review): the one-sided flags below compare against the
    # alpha/2 quantiles and look unusual for one-sided tests; left
    # unchanged, please confirm the intended rejection regions.
    H1b = lower_crit < f0
    H1c = upper_crit > f0

    # p-value
    upper_tail = f.sf(f0, n1 - 1, n2 - 1)
    # NOTE(review): a two-sided p-value is normally 2 * min(tail, 1 - tail);
    # the max() below is kept as written -- confirm the intent.
    p1a = np.max(np.array([upper_tail, 1 - upper_tail]))
    p1b = upper_tail
    p1c = 1 - upper_tail

    # confidence interval for the ratio of the two variances. The
    # original scaled by S1**2 / S1**2 (always 1) instead of the observed
    # ratio S1**2 / S2**2.
    c1 = f0 * f.ppf(alpha / 2., n2 - 1, n1 - 1)
    c2 = f0 * f.ppf(1 - alpha / 2., n2 - 1, n1 - 1)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
def var_var(self, alpha):
    """Compare two variances with an F-test.

    Uses ``self.S1`` and ``self.S2`` (sample standard deviations) and
    ``self.n1`` and ``self.n2`` (sample sizes). The statistic is
    ``f0 = S1**2 / S2**2`` on ``(n1 - 1, n2 - 1)`` degrees of freedom.

    Parameters:
        alpha: significance level.

    Returns:
        A 7-tuple ``(H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2))`` holding the
        rejection flags, the p-values and the confidence interval for the
        variance ratio.
    """
    f0 = self.S1 ** 2 / self.S2 ** 2
    dfn = self.n1 - 1
    dfd = self.n2 - 1
    half = alpha / 2.0

    # hypothesis testing
    q_low = f.ppf(half, dfn, dfd)
    q_high = f.ppf(1 - half, dfn, dfd)
    H1a = q_high < f0 or q_low > f0
    # NOTE(review): H1b / H1c use the alpha/2 quantiles, which is unusual
    # for one-sided alternatives. Kept as written -- confirm.
    H1b = q_low < f0
    H1c = q_high > f0

    # p-value
    sf0 = f.sf(f0, dfn, dfd)
    # NOTE(review): max() yields a value >= 0.5; a two-sided p-value is
    # usually 2 * min(sf0, 1 - sf0). Kept as written -- confirm.
    p1a = np.max(np.array([sf0, 1 - sf0]))
    p1b = sf0
    p1c = 1 - sf0

    # confidence interval for the variance ratio: the observed ratio f0
    # times F quantiles with swapped degrees of freedom. The previous
    # factor S1**2 / S1**2 was identically 1.
    c1 = f0 * f.ppf(half, dfd, dfn)
    c2 = f0 * f.ppf(1 - half, dfd, dfn)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)
def fit(self):
    """Fit three linear models and build the ANOVA-style result table.

    Models (all regress ``self.y``):
      * ``reg_t`` on ``self.X_t`` (independent variable + subjects),
      * ``reg_m`` on ``self.X_m`` (independent variable only),
      * ``reg_s`` on ``self.X_s`` (subjects only).

    The degrees of freedom ``self.df_m``, ``self.df_s`` and ``self.df_t``
    must already be set. All intermediate quantities are stored on the
    instance; the table is stored in ``self.result``.

    Returns:
        (self.corr, self.p_value_s)
    """
    def squared_distance(a, b):
        # Sum of squared element-wise differences.
        return ((a - b)**2).sum()

    # use independent variable + subjects
    self.reg_t = LinearRegression(n_jobs=self.njobs).fit(self.X_t, self.y)
    self.preds_t = self.reg_t.predict(self.X_t)
    self.RSS_t = squared_distance(self.preds_t, self.y)
    # sign of the last coefficient of the full model
    self.signCorr = np.sign(self.reg_t.coef_[-1])

    # only independent variable
    self.reg_m = LinearRegression(n_jobs=self.njobs).fit(self.X_m, self.y)
    self.preds_m = self.reg_m.predict(self.X_m)
    self.RSS_m = squared_distance(self.preds_m, self.y)
    self.SS_m = squared_distance(self.preds_m, self.preds_t)

    # only subjects
    self.reg_s = LinearRegression(n_jobs=self.njobs).fit(self.X_s, self.y)
    self.preds_s = self.reg_s.predict(self.X_s)
    self.RSS_s = squared_distance(self.preds_s, self.y)
    self.SS_s = squared_distance(self.preds_s, self.preds_t)

    # MSE
    self.MSE_s = self.SS_s / self.df_s
    self.MSE_m = self.SS_m / self.df_m
    self.MSE_t = self.RSS_t / self.df_t

    # F ratio
    self.F_ratio_m = self.MSE_m / self.MSE_t
    self.F_ratio_s = self.MSE_s / self.MSE_t

    # p value
    self.p_value_m = fdist.sf(self.F_ratio_m, self.df_m, self.df_t)
    self.p_value_s = fdist.sf(self.F_ratio_s, self.df_s, self.df_t)

    # final DF
    self.df_total = self.df_m + self.df_s + self.df_t
    self.ss_total = squared_distance(self.y, self.y.mean())
    self.mse_total = self.ss_total / self.df_total

    # Only the first row's p-value is abbreviated when very small.
    if self.p_value_m < 0.0001:
        shown_p_m = '<0.0001'
    else:
        shown_p_m = self.p_value_m

    row_labels = [self.subject, self.independent, 'Residual', 'Total']
    table = {
        'DoF': [self.df_m, self.df_s, self.df_t, self.df_total],
        'SumOfSq': [self.SS_m, self.SS_s, self.RSS_t, self.ss_total],
        'MSE': [self.MSE_m, self.MSE_s, self.MSE_t, self.mse_total],
        'F_value': [self.F_ratio_m, self.F_ratio_s, None, None],
        'p_value': [shown_p_m, self.p_value_s, None, None],
    }
    self.result = pd.DataFrame(table, index=row_labels)

    explained_share = self.SS_s / (self.SS_s + self.RSS_t)
    self.corr = self.signCorr * np.sqrt(explained_share)
    return self.corr, self.p_value_s
def ptl_anovaR(inframe):
    '''
    Repeated measures one way ANOVA.

    NOTE (translated from the original note): the numbers in the ``t*``
    tuples need to be changed here.

    inframe is a dataframe with one row per subject and one column per
    condition (group).

    Abbreviations: ss is sum of squares, wg is within group, bg is between
    group, ms is mean square, sb is subjects, er is error.

    Returns ``(t0, t1, t2, t3, t4, F, p)`` where
      t0 = (ss_bg, df_bg, ms_bg)  between groups
      t1 = (ss_wg, df_wg)         within groups
      t2 = (ss_er, df_er, ms_er)  error
      t3 = (ss_sb, df_sb)         subjects
      t4 = (ss_t, df_t)           total
    and ``p = f.sf(F, df_bg, df_er)``. F is infinite when ms_er is zero.
    '''
    rows, cols = inframe.shape
    k = cols
    n_sbj = rows
    # presumably all observations of the frame as one Series -- see dftoser
    allser = dftoser(inframe)
    n_t = allser.size

    # Within group
    ss_wg = 0
    for i in range(k):
        ss_wg += ssd(inframe.iloc[:, i])
    df_wg = n_t - k
    t1 = (ss_wg, df_wg)

    # Between groups
    ss_t = ssd(allser)
    ss_bg = ss_t - ss_wg
    # k groups give k - 1 degrees of freedom. The original reused the
    # within-group value (n_t - k), which distorted ms_bg, F and p.
    df_bg = k - 1
    ms_bg = ss_bg / df_bg
    t0 = (ss_bg, df_bg, ms_bg)

    # Subjects
    # Built directly from the row means so the Series is float from the
    # start (the original filled an integer Series element by element).
    subjects_means = pd.Series(
        [inframe.iloc[i, :].mean(skipna=True) for i in range(rows)])
    ss_sb = k * ssd(subjects_means)
    df_sb = n_sbj - 1
    t3 = (ss_sb, df_sb)

    # Error
    ss_er = ss_wg - ss_sb
    df_er = df_wg - df_sb
    ms_er = ss_er / df_er
    t2 = (ss_er, df_er, ms_er)

    # Total
    df_t = n_t - 1
    t4 = (ss_t, df_t)

    if ms_er == 0:
        F = float("Inf")
    else:
        F = ms_bg / ms_er
    p = f.sf(F, df_bg, df_er, loc=0, scale=1)
    return t0, t1, t2, t3, t4, F, p
def f_oneway(self, *args):
    """One-way ANOVA F-test; groups may have different sizes.

    The first parameter was previously misspelled ``slef``; it is only
    ever bound positionally, so callers are unaffected.

    Parameters:
        *args: one sequence of numbers per group.

    Returns:
        ``[Fa, p]``, both rounded to six decimals; ``p`` is computed from
        the already rounded ``Fa`` as ``f.sf(Fa, k - 1, N - k)``.
        ``[None, None]`` is returned for fewer than two groups, for an
        empty group, or when ``N - k <= 0``.

    Raises:
        ZeroDivisionError: if the within-group sum of squares is zero.
    """
    m = len(args)
    if m < 2:
        # A single group used to fail with a division by zero.
        return [None, None]
    for arg in args:
        if len(arg) == 0:
            return [None, None]
    sizes = [len(arg) for arg in args]
    total = sum(sizes)
    if total <= m:
        return [None, None]

    # group averages and grand average (leftover debug prints removed)
    avg_array = [sum(arg, 0.0) / len(arg) for arg in args]
    # Mean of all observations, not the mean of the group means, so that
    # unequal group sizes are weighted correctly.
    all_avg = sum(sum(arg, 0.0) for arg in args) / total

    # between-group (Sa) and within-group (Se) sums of squares
    Sa = 0.0
    for size, avg in zip(sizes, avg_array):
        Sa += size * (avg - all_avg)**2
    Se = 0.0
    for arg, avg in zip(args, avg_array):
        for x in arg:
            Se += (x - avg)**2

    # degrees of freedom
    fa = m - 1
    fe = total - m

    # mean squares
    Va = Sa / fa
    Ve = Se / fe

    Fa = round(Va / Ve, 6)
    p = round(f.sf(x=Fa, dfn=fa, dfd=fe), 6)
    return [Fa, p]
def f_oneway(self, *args):
    """Compute a one-way ANOVA F statistic and p-value.

    The first parameter used to be spelled ``slef``; it is always bound
    positionally, so the rename does not affect callers. Groups may have
    different lengths.

    Parameters:
        *args: one sequence of numbers per group.

    Returns:
        ``[Fa, p]`` rounded to six decimals, with ``p`` derived from the
        rounded ``Fa`` via ``f.sf(Fa, k - 1, N - k)``; or ``[None, None]``
        when fewer than two groups are given, a group is empty, or no
        within-group degrees of freedom remain.

    Raises:
        ZeroDivisionError: if the within-group sum of squares is zero.
    """
    m = len(args)
    if m < 2 or any(len(arg) == 0 for arg in args):
        # m == 1 previously ended in a division by zero.
        return [None, None]
    n_obs = sum(len(arg) for arg in args)
    fa = m - 1      # between-group degrees of freedom
    fe = n_obs - m  # within-group degrees of freedom
    if fe <= 0:
        return [None, None]

    # Per-group sums and means. The two debug print statements that used
    # to follow were Python 2-only and polluted stdout.
    group_sums = [sum(arg, 0.0) for arg in args]
    avg_array = [s / len(arg) for s, arg in zip(group_sums, args)]
    # Grand mean over all values (correct for unequal group sizes too).
    all_avg = sum(group_sums) / n_obs

    Sa = 0.0
    Se = 0.0
    for arg, avg in zip(args, avg_array):
        Sa += len(arg) * (avg - all_avg)**2
        for x in arg:
            Se += (x - avg)**2

    Va = Sa / fa
    Ve = Se / fe
    Fa = round(Va / Ve, 6)
    p = round(f.sf(x=Fa, dfn=fa, dfd=fe), 6)
    return [Fa, p]
def test_f_test():
    """get_f_mat must report dof (1, 7997) and p-values of ~0 everywhere.

    Each cell of the result is unpacked as (f_score, df1, df2).
    """
    X, y = build()
    res = get_f_mat(X, y, grouping=[0, 1, 2])

    # Distinct loop names avoid shadowing X/y and the dof lists.
    numer_dofs = [[cell[1] for cell in row] for row in res]
    denom_dofs = [[cell[2] for cell in row] for row in res]
    assert np.allclose(numer_dofs, 1)
    assert np.allclose(denom_dofs, 7997)

    p_mat = []
    for row in res:
        p_mat.append([f.sf(score, dfn, dfd) for (score, dfn, dfd) in row])
    assert np.allclose(p_mat, 0)
def GRS_test(self, alpha, Sigma):
    """
    This test should be used for time series regressions.

    Computes ``(T - N - k) * alpha' Sigma^-1 alpha / (N * (1 + fm' vcvfac^-1 fm))``
    from ``self.T``, ``self.N``, ``self.k``, ``self.fm`` and ``self.vcvfac``
    and returns its upper-tail probability under an F distribution with
    ``(N, T - N - k)`` degrees of freedom.

    If ``Sigma`` is rank deficient (tolerance 1e-9) a warning is printed
    and the pseudo-inverse is used instead of a linear solve.
    """
    resid_dof = self.T - self.N - self.k
    if rank(Sigma, tol=1e-9) < self.N:
        print ("Warning Sigma has deficient rank of ", rank(Sigma))
        weighted_alpha = dot(pinv(Sigma), alpha)
    else:
        weighted_alpha = solve(Sigma, alpha)
    quad_form = dot(alpha.T, weighted_alpha)
    factor_term = 1 + dot(self.fm.T, solve(self.vcvfac, self.fm))
    val = resid_dof * quad_form / (self.N * factor_term)
    return squeeze(f.sf(val, self.N, resid_dof))
def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
             df_num=None, alpha=0.05, **kwds):
    """Store a test statistic and derive its p-value.

    Exactly one of three inputs selects the distribution:

    * ``F``: F test with ``(df_num, df_denom)`` degrees of freedom;
    * ``t``: two-sided t test with ``df_denom`` degrees of freedom;
    * ``statistic`` (keyword) together with ``distribution`` (keyword):
      a distribution looked up by name in ``stats``; 'chi2' uses
      ``df_denom``, anything else is treated as a two-sided normal test
      whose NaN statistics keep a NaN p-value.

    ``alpha`` is accepted but not used here. ``c_names`` holds one
    generated name per entry of ``effect`` or is None.
    """
    self.effect = effect  # Let it be None for F

    if F is not None:
        self.distribution = 'F'
        self.fvalue = F
        self.statistic = self.fvalue
        self.df_denom = df_denom
        self.df_num = df_num
        self.dist = fdist
        self.dist_args = (df_num, df_denom)
        self.pvalue = fdist.sf(F, df_num, df_denom)

    elif t is not None:
        self.distribution = 't'
        self.tvalue = t
        self.statistic = t  # generic alias
        self.sd = sd
        self.df_denom = df_denom
        self.dist = student_t
        self.dist_args = (df_denom, )
        two_sided = self.dist.sf(np.abs(t), df_denom) * 2
        self.pvalue = two_sided

    elif 'statistic' in kwds:
        # TODO: currently targeted to normal distribution, and chi2
        self.distribution = kwds['distribution']
        self.statistic = kwds['statistic']
        stat = kwds['statistic']
        self.tvalue = stat  # keep alias
        # TODO: for results instance we decided to use tvalues also for normal
        self.sd = sd
        self.dist = getattr(stats, self.distribution)
        self.dist_args = kwds.get('dist_args', ())
        if self.distribution == 'chi2':
            self.pvalue = self.dist.sf(self.statistic, df_denom)
            self.df_denom = df_denom
        else:
            # normal
            self.pvalue = np.full_like(stat, np.nan)
            finite = ~np.isnan(stat)
            self.pvalue[finite] = self.dist.sf(np.abs(stat[finite])) * 2

    # cleanup
    # should we return python scalar?
    self.pvalue = np.squeeze(self.pvalue)

    if self.effect is None:
        self.c_names = None
    else:
        self.c_names = ['c%d' % ii for ii in range(len(self.effect))]
def evaluate(self, x, y_true):
    """
    Evaluates the performance of the trained model on a global and variable level.
    For global, RSE, R^2 and F-statistic are standard. For variables the SE and t-statistic is used.

    Both result tables are printed with ``tabulate`` and returned.

    :param x: Matrix of predictors
    :param y_true: Vector of true y values
    :return: tuple ``(glob_outcomes, var_outcomes)`` of dicts
    """
    x = core.enhance_matrix(x)
    y_pred = self.predict(x)
    n_obs = len(y_pred)

    # Entries with a callable are computed directly; a bare 'p-value'
    # entry is derived from the statistic listed before it.
    global_metrics = [['RSE', reg_eval.residual_standard_error],
                      ['R^2', reg_met.r_squared],
                      ['F-statistic', reg_eval.f_statistic],
                      ['p-value']]
    var_metrics = [['SE', reg_eval.standard_error_coefs],
                   ['t-statistic', reg_eval.t_statistic],
                   ['p-value']]

    glob_outcomes = {'Metric': [], 'Value': []}
    for i in global_metrics:
        if len(i) > 1:
            glob_outcomes['Metric'].append(i[0])
            glob_outcomes['Value'].append(i[1](x=x,
                                               y_true=y_true,
                                               y_pred=y_pred,
                                               num_predictors=x.n_cols))
        elif i[0] == 'p-value':
            # Look the F statistic up by name instead of by position.
            f_stat = glob_outcomes['Value'][
                glob_outcomes['Metric'].index('F-statistic')]
            # Numerator dof = number of slopes, denominator dof = residual
            # dof (the same n - n_cols used for the t-tests below). The
            # original passed dfn=n_obs and dfd=n_cols - 1.
            # NOTE(review): assumes the enhanced matrix holds one intercept
            # column plus the predictors -- confirm against enhance_matrix.
            glob_outcomes['Metric'].append(i[0])
            glob_outcomes['Value'].append(
                f.sf(f_stat, dfn=x.n_cols - 1, dfd=n_obs - x.n_cols))
        else:
            raise Exception('Single value metric not implemented')

    var_outcomes = {
        'Column': list(range(x.n_cols)),
        'Coefficient': self.coefficients.data
    }
    for i in var_metrics:
        if len(i) > 1:
            var_outcomes[i[0]] = i[1](x=x,
                                      y_true=y_true,
                                      y_pred=y_pred,
                                      coefs=var_outcomes['Coefficient'])
        elif i[0] == 'p-value':
            # two-sided t-test per coefficient
            var_outcomes[i[0]] = [
                2 * t.sf(abs(float(score)), n_obs - x.n_cols)
                for score in var_outcomes['t-statistic']
            ]

    print(tabulate(glob_outcomes, headers='keys'))
    print(tabulate(var_outcomes, headers='keys'))
    return glob_outcomes, var_outcomes
def test(self, *args):
    r"""
    Calculates the MANOVA test statistic and p-value.

    Parameters
    ----------
    *args : ndarray
        Variable length input data matrices. All inputs must have the same
        number of dimensions. That is, the shapes must be `(n, p)` and
        `(m, p)`, ... where `n`, `m`, ... are the number of samples and `p`
        is the number of dimensions.

    Returns
    -------
    stat : float
        The computed MANOVA statistic.
    pvalue : float
        The computed MANOVA p-value.

    Raises
    ------
    ValueError
        If the within-group degrees of freedom (total samples minus number
        of groups) are smaller than `p`.

    Examples
    --------
    >>> import numpy as np
    >>> from hyppo.ksample import MANOVA
    >>> x = np.arange(7)
    >>> y = x
    >>> stat, pvalue = MANOVA().test(x, y)
    >>> '%.3f, %.1f' % (stat, pvalue)
    '0.000, 1.0'
    """
    inputs = _CheckInputs(inputs=list(args), )()
    n_groups = len(inputs)

    N = np.sum([i.shape[0] for i in inputs])
    p = inputs[0].shape[1]
    nu_w = N - n_groups  # within-group degrees of freedom

    if nu_w < p:
        raise ValueError("Test cannot be run, degree of freedoms is off")

    stat = self.statistic(*inputs)

    nu_b = n_groups - 1  # between-group degrees of freedom
    s = np.min([p, nu_b])
    m = (np.abs(p - nu_b) - 1) / 2
    n = (nu_w - p - 1) / 2
    num = 2 * n + s + 1
    denom = 2 * m + s + 1

    # F approximation of the statistic and its upper-tail probability
    f_approx = num / denom * stat / (s - stat)
    pvalue = f.sf(f_approx, s * denom, s * num)

    self.stat = stat
    self.pvalue = pvalue
    self.null_dist = None

    return stat, pvalue
def hotel2(X1, X2):
    """
    Computes the Hotelling t-squared statistic under two assumptions of
    variance (pooled covariance, and unequal covariances) and prints the
    decision at the 5% level together with the statistic and p-value.

    :param X1 pandas DataFrame with samples from first group
    :param X2 pandas DataFrame with samples from second group
    :return None
    """
    # TODO: Verify Hotelling results
    n1, k = X1.shape
    n2, k2 = X2.shape
    assert(k == k2)

    # `.values` replaces DataFrame/Series.as_matrix(), which no longer
    # exists in pandas >= 1.0.
    ybar1 = X1.mean().values
    s1 = np.cov(X1, rowvar=False)
    ybar2 = X2.mean(axis=0).values
    s2 = np.cov(X2, rowvar=False)

    alpha = 0.05
    mean_diff = ybar1 - ybar2
    diffs = mean_diff.reshape(1, k)

    # TODO: Incorporate a test for equal variances
    # If variances assumed equal, then pool
    spool = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)
    t2 = diffs\
        .dot(np.linalg.inv(spool * (1.0 / n1 + 1.0 / n2)))\
        .dot(mean_diff)\
        .item(0)
    # Convert T^2 to an F statistic with (k, n1 + n2 - k - 1) dof.
    eff = (n1 + n2 - k - 1) * t2 / (k * (n1 + n2 - 2))
    df1 = k
    df2 = n1 + n2 - k - 1
    p_value = f.sf(eff, df1, df2)
    print('If variances are assumed equal between classes')
    if p_value < alpha:
        print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
        print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)

    # If variances not assumed equal, then use modified Hotelling
    # (chi-square reference distribution with k dof)
    t2 = diffs\
        .dot(np.linalg.inv(s1 / n1 + s2 / n2))\
        .dot(mean_diff)\
        .item(0)
    p_value = chi2.sf(t2, k)
    print('If variances are not assumed equal between classes')
    if p_value < alpha:
        print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
    else:
        print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
    print(t2, p_value)
def p_value(y1, x1, y2, x2, **kwargs):
    """Return the p-value for the F statistic produced by ``f_value``.

    All arguments are forwarded to ``f_value``, which yields
    ``(F, coeff_total, coeff_1, coeff_2)``. A falsy ``F`` gives a p-value
    of 1; otherwise ``F[0]`` is tested with ``2`` and
    ``len(x1) + len(x2) - 4`` degrees of freedom.

    Returns:
        (p_val, coeff_total, coeff_1, coeff_2)
    """
    F, coeff_total, coeff_1, coeff_2 = f_value(y1, x1, y2, x2, **kwargs)
    if F:
        numer_dof = 2
        denom_dof = len(x1) + len(x2) - 4
        # The survival function is more precise than computing 1 - cdf,
        # which matters for p-values very close to zero. f.logsf would be
        # an alternative that yields log(p) directly.
        probability = f.sf(F[0], numer_dof, denom_dof)
    else:
        probability = 1
    return probability, coeff_total, coeff_1, coeff_2
def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
             df_num=None):
    """Store either an F-test or a t-test result with its p-value.

    When ``F`` is given, ``fvalue``, ``df_denom``, ``df_num`` and the
    upper-tail F p-value are stored. Otherwise ``tvalue``, ``sd``,
    ``effect``, ``df_denom`` and a t p-value are stored.
    """
    if F is None:
        self.tvalue = t
        self.sd = sd
        self.effect = effect
        self.df_denom = df_denom
        # NOTE(review): this is the one-sided tail probability of |t|;
        # a two-sided p-value would be twice this value -- confirm which
        # one callers expect.
        self.pvalue = student_t.sf(np.abs(t), df_denom)
        return

    self.fvalue = F
    self.df_denom = df_denom
    self.df_num = df_num
    self.pvalue = fdist.sf(F, df_num, df_denom)
def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png'):
    """Compute the F-test for one data set and report it on stdout and CSV.

    The error mean square comes from
    ``calc_nullsd2(xlist, ylist, error_value)``. ``Yval``, ``opcsv`` and
    ``error_value`` are read from the enclosing (module) scope.

    Parameters:
        xlist, ylist: data forwarded to ``calc_ssd`` and ``calc_df1``.
        pltitle: title forwarded to ``calc_ssd``.
        Csuff: value echoed in the output row.
        fname: forwarded to ``calc_ssd``; its stem without any 'D'
            characters becomes the row label.

    Returns:
        None. One row is printed and one row is written via ``opcsv``.
    """
    ssd = calc_ssd(xlist, ylist, pltitle, fname)
    df1 = calc_df1(xlist, ylist)
    df2 = len(ylist)
    emsd = calc_nullsd2(xlist, ylist, error_value)

    # Reported as zero when df1 <= 0 (no F ratio can be formed).
    F = 0.0
    PVAL = 0.0
    if df1 > 0:
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)

    # Remove D from Label.
    ProcLabel = os.path.splitext(fname)[0].replace('D', '')

    csuff_txt = '{:>4.3f}'.format(Csuff)
    ssd_txt = '{:>8.3f}'.format(ssd)
    f_txt = '{:>11.3f}'.format(F)
    pval_txt = '{:>3.2f}'.format(PVAL)
    df1_txt = '{:>3.2f}'.format(df1)
    df2_txt = '{:>3d}'.format(df2)

    # The whole row goes on ONE line. In the original, the df1 print was
    # missing its trailing comma, so df2 ended up on a line of its own.
    # print(<one string>) works the same on Python 2 and Python 3.
    print(' '.join([
        '{:>10s}'.format(ProcLabel),
        '{:>2d}'.format(Yval),
        csuff_txt,
        ssd_txt,
        f_txt,
        pval_txt,
        df1_txt,
        df2_txt,
    ]))

    opcsv.writerow([
        ProcLabel, Yval, csuff_txt, ssd_txt, f_txt, pval_txt, df1_txt,
        df2_txt
    ])
def ANOVA(n_T, k, SSTr, SSE):
    """Print a one-way ANOVA summary from precomputed sums of squares.

    Parameters:
        n_T: total number of observations.
        k: number of treatments (groups).
        SSTr: treatment sum of squares.
        SSE: error sum of squares.

    Prints the inputs, both mean squares, the F statistic and
    ``P(F_(k-1, n_T-k) >= F)``. Returns None.
    """
    df1, df2 = k - 1, n_T - k
    MSTr = SSTr / df1
    MSE = SSE / df2
    F_statistic = MSTr / MSE
    p_value = f.sf(F_statistic, df1, df2)

    header = "ANOVA with n_T {}, k {}, SSTr {}, SSE {}".format(
        n_T, k, SSTr, SSE)
    mean_squares = "MSTr : {:.4f}, MSE : {:.4f}".format(MSTr, MSE)
    verdict = (
        "F-statistic : {:.4f}, p-value = P(F_(k-1,n_T-k) >= {:.4f}) = {:.6f}"
        .format(F_statistic, F_statistic, p_value))
    print(header)
    print(mean_squares)
    print(verdict)
def robust_cv_test(res_a, res_b): # Combined 5x2cv F Test for Comparing SupervisedClassification Learning Algorithms # https://www.cmpe.boun.edu.tr/~ethem/files/papers/NC110804.PDF # res_a and res_b are the results of two classifiers with shape num_folds x 2 assert res_a.shape == res_b.shape, 'The two arrays should have equal dimensions' assert res_a.shape[1] == 2, 'Dimension 1 should be 2 for both arrays' num_folds = res_a.shape[0] diff = res_a - res_b diff_fold = diff.mean(axis=1, keepdims=True) var = ((diff - diff_fold)**2).sum(axis=1) f_val = (diff**2).sum() / (2 * var.sum()) p_val = f.sf(f_val, 2 * num_folds, num_folds) return p_val
def hist(self):
    """Draw a histogram of the currently selected parameter on ``cp.ax2``.

    Reads the module-level ``cp``, ``h_map``, ``titles``, ``ff`` and
    ``plt``. If the lower panel is not already in histogram mode, the
    spectrum curves are removed and the axes are reset to linear scale.
    Markers 4 and 5 are transformed before plotting. With
    ``cp.mask_bool`` set, pixels whose p-value exceeds ``cp.mask_val``
    are excluded.
    """
    if cp.spec_hist != 'hist':
        # leaving spectrum mode: linear axes, drop the fitted curves
        cp.ax2.set_yscale('linear')
        cp.ax2.set_xscale('linear')
        for curve in (cp.curveSpec, cp.curveM2, cp.curveM1, cp.curveLorentz):
            curve.remove()
        cp.spec_hist = 'hist'
    cp.ax2.set_ylabel('Count')

    param = h_map[:, :, cp.marker]
    if cp.marker == 4:
        param = (1. / np.exp(param)) / 60.
    elif cp.marker == 5:
        upper = np.exp(h_map[:, :, 4] + h_map[:, :, 5])
        lower = np.exp(h_map[:, :, 4] - h_map[:, :, 5])
        param = (1. / (upper - lower)) / 60.

    if cp.mask_bool:
        cp.ax2_title.set_text('Histogram: %s | Masked' % titles[cp.marker])
        h_color = 'red'
        # ---- generate p-value heatmap + masked Lorentzian component heatmaps
        dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
        p_val = ff.sf(h_map[:, :, 6], dof1, dof2)
        p_val[np.isnan(p_val)] = 1
        param_mask = np.copy(param)
        param_mask[p_val > cp.mask_val] = 0.
        # masked pixels were set to 0 above and are dropped here
        pflat = param_mask.flatten()
        pflat = pflat[pflat != 0]
    else:
        cp.ax2_title.set_text('Histogram: %s' % titles[cp.marker])
        pflat = param.flatten()
        h_color = 'black'

    pNaN = pflat[~np.isnan(pflat)]
    # need a set_data
    y, x, _ = cp.ax2.hist(pNaN,
                          bins=25,
                          edgecolor='black',
                          alpha=0.75,
                          color=h_color)
    cp.ax2.set_xlabel('%s' % titles[cp.marker])
    cp.ax2.set_xlim(np.percentile(pNaN, 1), np.percentile(pNaN, 99))
    cp.ax2.set_ylim(0, y.max() * 1.1)
    cp.leg.set_visible(False)
    plt.draw()
def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png',
               Yval=1, opdirname='./', opcsv='OP', optxt='OT'):
    """Compute the F-test for one data set and write it to text and CSV.

    The error mean square comes from
    ``calc_nullsd2(xlist, ylist, error_value)``; ``error_value`` is read
    from the enclosing (module) scope. Output goes to files rather than
    the screen (GUI version).

    Parameters:
        xlist, ylist: data forwarded to ``calc_ssd`` and ``calc_df1``.
        pltitle, fname, Yval, opdirname: forwarded to ``calc_ssd``;
            ``fname``'s stem without any 'D' characters is the row label
            and ``Yval`` is echoed in the row.
        Csuff: value echoed in the row.
        opcsv: object with a ``writerow`` method.
        optxt: object with a ``write`` method.

    Returns:
        None.
    """
    ssd = calc_ssd(xlist, ylist, pltitle, fname, Yval, opdirname)
    df1 = calc_df1(xlist, ylist)
    df2 = len(ylist)
    emsd = calc_nullsd2(xlist, ylist, error_value)

    # Only do calcs if DF1 > 0; otherwise zeros are reported.
    F = 0.0
    PVAL = 0.0
    if df1 > 0:
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)

    # Remove D from Label.
    ProcLabel = os.path.splitext(fname)[0].replace('D', '')

    csuff_txt = '{:>4.3f}'.format(Csuff)
    ssd_txt = '{:>8.3f}'.format(ssd)
    f_txt = '{:>11.3f}'.format(F)
    pval_txt = '{:>3.2f}'.format(PVAL)
    df1_txt = '{:>3.2f}'.format(df1)
    df2_txt = '{:>3d}'.format(df2)

    # Text row: fields separated by one space for alignment, except that
    # df2 follows df1 directly (as in the established output format).
    spaced = [
        '{:>10s}'.format(ProcLabel),
        '{:>2d}'.format(Yval),
        csuff_txt,
        ssd_txt,
        f_txt,
        pval_txt,
        df1_txt,
    ]
    optxt.write(' '.join(spaced) + df2_txt + '\n')

    opcsv.writerow([
        ProcLabel, Yval, csuff_txt, ssd_txt, f_txt, pval_txt, df1_txt,
        df2_txt
    ])
def proc_Dsuff(xlist, ylist, pltitle='DXY', Csuff=0.0, fname='DXY.png'):
    """Compute the F-test for one data set and report it on stdout and CSV.

    The error mean square comes from
    ``calc_nullsd3(xlist, ylist, error_value)``. ``Yval``, ``opcsv`` and
    ``error_value`` are read from the enclosing (module) scope.

    Parameters:
        xlist, ylist: data forwarded to ``calc_ssd`` and ``calc_df1``.
        pltitle: title forwarded to ``calc_ssd``.
        Csuff: value echoed in the output row.
        fname: forwarded to ``calc_ssd``; its stem without any 'D'
            characters becomes the row label.

    Returns:
        None. One row is printed and one row is written via ``opcsv``.
    """
    ssd = calc_ssd(xlist, ylist, pltitle, fname)
    df1 = calc_df1(xlist, ylist)
    df2 = len(ylist)
    emsd = calc_nullsd3(xlist, ylist, error_value)

    # Reported as zero when df1 <= 0 (no F ratio can be formed).
    F = 0.0
    PVAL = 0.0
    if df1 > 0:
        msd = ssd / df1
        F = msd / emsd
        PVAL = f.sf(F, df1, df2, loc=0, scale=1)

    # Remove D from Label.
    ProcLabel = os.path.splitext(fname)[0].replace('D', '')

    csuff_txt = '{:>4.3f}'.format(Csuff)
    ssd_txt = '{:>8.3f}'.format(ssd)
    f_txt = '{:>11.3f}'.format(F)
    pval_txt = '{:>3.2f}'.format(PVAL)
    df2_txt = '{:>3d}'.format(df2)

    # print(<one string>) is valid on Python 2 and Python 3 and gives the
    # same output as the former chain of `print x,` statements.
    print(' '.join([
        '{:>10s}'.format(ProcLabel),
        '{:>2d}'.format(Yval),
        csuff_txt,
        ssd_txt,
        f_txt,
        pval_txt,
        df2_txt,
    ]))

    opcsv.writerow(
        [ProcLabel, Yval, csuff_txt, ssd_txt, f_txt, pval_txt, df2_txt])
def histMask(p):
    """Draw the masked histogram of parameter layer ``p`` on ``ax2``.

    Reads the module-level ``h_map``, ``mask_val``, ``titles``,
    ``marker``, ``title``, ``ax2`` and ``ff``; stores the result of
    ``ax2.hist`` in the global ``hist0``. Pixels whose p-value (from
    layer 6 of ``h_map``) exceeds ``mask_val`` are excluded, as are NaN
    and exactly-zero values.

    NOTE(review): the title uses ``titles[marker]`` rather than
    ``titles[p]`` -- confirm that the two always agree.
    """
    global mask_val
    global hist0

    layer = h_map[:, :, p]

    # generate p-value heatmap + masked Lorentzian component heatmaps
    dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
    p_val = ff.sf(h_map[:, :, 6], dof1, dof2)

    masked = np.copy(layer)
    masked[p_val > mask_val] = 0.  # zero marks a masked pixel

    n_pixels = masked.shape[0] * masked.shape[1]
    flat = np.reshape(masked, (n_pixels))
    # drop the masked (zero) pixels, then any NaNs
    kept = flat[(flat != 0) & ~np.isnan(flat)]

    title.set_text('Histogram: %s | Masked' % titles[marker])
    hist0 = ax2.hist(kept, bins=25, edgecolor='black')
def cal_fullmodel(data_df, out_col, consist_col, rank, RSS):
    """
    Calculate rsquared, rsquared_adj, fvalue, f_pvalue, and the DoF of the
    F-test for the full model (data before the demean process).

    Parameters:
        data_df: DataFrame holding the outcome column(s).
        out_col: list of outcome column labels; the total sum of squares
            is taken from the first one.
        consist_col: list of regressor names; only its length is used.
        rank: integer added to the model dof and subtracted from the
            residual dof.
        RSS: residual sum of squares.

    Returns:
        (rsquared, rsquared_adj, fvalue, f_pvalue, f_df) where
        f_df = [rank + len(consist_col) - 1,
                len(data_df) - len(consist_col) - rank].
    """
    n_obs = len(data_df)
    n_consist = len(consist_col)
    model_dof = rank + n_consist - 1
    resid_dof = n_obs - n_consist - rank

    outcome = data_df[out_col]
    centred_sq = ((outcome - outcome.mean())**2).values
    TSS = sum(centred_sq)[0]  # row-wise accumulation, first outcome column

    rsquared = 1 - RSS / TSS
    rsquared_adj = 1 - (n_obs - 1) / resid_dof * (1 - rsquared)
    fvalue = (TSS - RSS) * resid_dof / (RSS * model_dof)
    f_pvalue = f.sf(fvalue, model_dof, resid_dof)
    f_df = [model_dof, resid_dof]
    return rsquared, rsquared_adj, fvalue, f_pvalue, f_df
def calAnova(self, x, y):
    """
    One-way ANOVA of a variable against class labels.

    Input
        x: variable, 1-D array
        y: actual labels, 1-D array
    return
        DataFrame with the rows between-group / within-group / total and
        the columns degrees of freedom / sum of squared deviations /
        mean square / F value / P value (row and column labels are in
        Chinese).
    """
    m = len(y)
    yValues = np.unique(y)
    n = len(yValues)
    xbar = x.mean()

    # degrees of freedom (between, within, total)
    dfList = [n - 1, m - n, m - 1]

    # per-class samples, counts and means
    classes = [x[y == value] for value in yValues]
    xicount = [len(xi) for xi in classes]   # number of samples per class
    ximean = [xi.mean() for xi in classes]  # mean of each class

    # sums of squared deviations
    # within groups
    SSw = sum(
        np.power((xi - xmean), 2).sum() for xi, xmean in zip(classes, ximean))
    # between groups
    SSb = np.dot(np.power((ximean - xbar), 2), xicount)
    # total
    SSt = SSw + SSb
    SSList = [SSb, SSw, SSt]

    # mean squares
    MSList = [SSb / dfList[0], SSw / dfList[1]]

    # F value and P value
    Fvalue = MSList[0] / MSList[1]
    pValue = f.sf(Fvalue, dfList[0], dfList[1])

    # assemble the result table
    df = pd.DataFrame(index=['组间', '组内', '合计'],
                      columns=['自由度', '离差平方和', '均方', 'F值', 'P值'])
    df.iloc[:, 0] = dfList
    df.iloc[:, 1] = SSList
    df.iloc[:2, 2] = MSList
    df.iloc[0, 3] = Fvalue
    df.iloc[0, 4] = pValue
    return df
def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
             df_num=None, alpha=0.05, **kwds):
    """Store a test statistic and derive its p-value.

    Exactly one of three inputs selects the distribution:

    * ``F``: F test with ``(df_num, df_denom)`` degrees of freedom;
    * ``t``: two-sided t test with ``df_denom`` degrees of freedom;
    * ``statistic`` (keyword) together with ``distribution`` (keyword):
      a distribution looked up by name in ``stats``; 'chi2' uses
      ``df_denom``, anything else is treated as a two-sided normal test
      whose NaN statistics keep a NaN p-value.

    ``alpha`` is accepted but not used here.
    """
    self.effect = effect  # Let it be None for F
    if F is not None:
        self.distribution = 'F'
        self.fvalue = F
        self.statistic = self.fvalue
        self.df_denom = df_denom
        self.df_num = df_num
        self.dist = fdist
        self.dist_args = (df_num, df_denom)
        self.pvalue = fdist.sf(F, df_num, df_denom)
    elif t is not None:
        self.distribution = 't'
        self.tvalue = t
        self.statistic = t  # generic alias
        self.sd = sd
        self.df_denom = df_denom
        self.dist = student_t
        self.dist_args = (df_denom,)
        self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2
    elif 'statistic' in kwds:
        # TODO: currently targeted to normal distribution, and chi2
        self.distribution = kwds['distribution']
        self.statistic = kwds['statistic']
        self.tvalue = value = kwds['statistic']  # keep alias
        # TODO: for results instance we decided to use tvalues also for normal
        self.sd = sd
        self.dist = getattr(stats, self.distribution)
        self.dist_args = kwds.get('dist_args', ())
        # Compare by value: `is 'chi2'` tested object identity, which is
        # not guaranteed for equal strings (e.g. names built at runtime)
        # and sent such chi2 requests down the normal branch.
        if self.distribution == 'chi2':
            self.pvalue = self.dist.sf(self.statistic, df_denom)
            self.df_denom = df_denom
        else:
            # normal
            self.pvalue = np.full_like(value, np.nan)
            not_nan = ~np.isnan(value)
            self.pvalue[not_nan] = self.dist.sf(np.abs(value[not_nan])) * 2

    # cleanup
    # should we return python scalar?
    self.pvalue = np.squeeze(self.pvalue)
def cal_fullmodel(data_df, out_col, consist_col, category_col, rank, RSS,
                  originRSS):
    """
    Calculate rsquared, rsquared_adj, fvalue, f_pvalue, and the DoF of the
    F-test for the full model (data before the demean process).

    The total sum of squares is uncentred only when there is neither a
    'const' regressor nor any category column. RSS, TSS and originRSS are
    rounded to six decimals before use.

    Returns:
        (rsquared, rsquared_adj, fvalue, f_pvalue, f_df) where
        f_df = [rank + len(consist_col) - k0,
                len(data_df) - len(consist_col) - rank + k0]
        and k0 is 1 if 'const' is among consist_col, else 0.
    """
    k0 = 1 if ('const' in consist_col) else 0
    n_obs = len(data_df)
    n_consist = len(consist_col)

    outcome = data_df[out_col]
    if k0 == 0 and category_col == []:
        squared = (outcome**2).values
    else:
        squared = ((outcome - outcome.mean())**2).values
    TSS = sum(squared)[0]

    # round to six decimals
    RSS = float("{:.6f}".format(RSS))
    TSS = float("{:.6f}".format(TSS))
    originRSS = float("{:.6f}".format(originRSS))

    rsquared = 1 - RSS / TSS
    unexplained = 1 - rsquared
    if category_col != []:
        rsquared_adj = 1 - n_obs / (n_obs - n_consist - rank + k0) * unexplained
    else:
        rsquared_adj = 1 - (n_obs - k0) / (n_obs - n_consist) * unexplained

    df_full_model = rank + n_consist - k0
    resid_dof = n_obs - n_consist - rank + k0
    if df_full_model > 0:
        # NOTE(review): the multiplier here omits the `+ k0` that the
        # residual dof passed to f.sf includes, and the numerator uses
        # originRSS while the denominator uses RSS -- confirm both.
        fvalue = (TSS - originRSS) * (n_obs - n_consist - rank) / (
            RSS * df_full_model)
    else:
        fvalue = 0
    f_pvalue = f.sf(fvalue, df_full_model, resid_dof)
    f_df = [df_full_model, resid_dof]
    return rsquared, rsquared_adj, fvalue, f_pvalue, f_df
def granger_causality(df, max_lag, pval=0.05, autocausation=True):
    '''
    Calculates the causality between time series pairs in `df` using the
    Granger Causality technique.

    For every series i, a reduced model is fitted with the block of
    `max_lag` lag columns belonging to that series removed, and its
    residual sum of squares is compared with the full model by an F-test.

    Arguments:
        df: A pandas dataframe of shape (N x p)
        max_lag: Number of past lagged inputs to include in the linear
            model (int)
        pval: Type 1 error for rejecting null hypothesis of no causality.
        autocausation: if False, the diagonal of the result is forced
            to 0.

    Returns:
        A binary causality matrix (float array of 0.0 / 1.0, shape p x p).
        Column i holds the results for removing the lags of series i.
    '''
    # Create numpy input and output arrays for VAR modelling from dataframe
    X, Y = _create_dataset_vector_output(df, max_lag)
    assert len(X) == len(Y)

    # Infer data dimensions
    N, p = Y.shape
    resid_dof = N - p * max_lag

    # RSS of the full model, as a column so it broadcasts across columns
    RSS_full = np.expand_dims(_calc_RSS(X, Y), axis=1)

    # RSS of each reduced model (lags of series i dropped)
    RSS = np.zeros((p, p))
    for i in range(p):
        lag_block = slice(i * max_lag, (i + 1) * max_lag)
        reduced_X = np.delete(X, lag_block, axis=1)
        RSS[:, i] = _calc_RSS(reduced_X, Y)

    # Calculate f-stats
    f_stats = ((RSS - RSS_full) / max_lag) / (RSS_full / resid_dof)

    # Binarize values based on significance
    causalities = f.sf(f_stats, max_lag, resid_dof) <= pval

    if not autocausation:
        np.fill_diagonal(causalities, False)

    return causalities.astype(float)
def test(self, x, y):
    r"""
    Compute the Hotelling :math:`T^2` test statistic and its p-value.

    The statistic is also stored on the instance as ``self.stat``, the
    p-value as ``self.pvalue``, and ``self.null_dist`` is reset to ``None``.

    Parameters
    ----------
    x,y : ndarray
        Input data matrices. ``x`` and ``y`` must have the same number of
        dimensions. That is, the shapes must be ``(n, p)`` and ``(m, p)``
        where `n` and `m` are the numbers of samples and `p` is the number
        of dimensions.

    Returns
    -------
    stat : float
        The computed Hotelling :math:`T^2` statistic.
    pvalue : float
        The computed Hotelling :math:`T^2` p-value.

    Examples
    --------
    >>> import numpy as np
    >>> from hyppo.ksample import Hotelling
    >>> x = np.arange(7)
    >>> y = x
    >>> stat, pvalue = Hotelling().test(x, y)
    >>> '%.3f, %.1f' % (stat, pvalue)
    '0.000, 1.0'
    """
    # Run the inputs through the shared checker before using them.
    x, y = _CheckInputs(inputs=[x, y])()

    stat = self.statistic(x, y)

    # The p-value comes from an F distribution with (p, n + m - p - 1)
    # degrees of freedom.
    n_x, n_dims = x.shape
    n_y = y.shape[0]
    pvalue = f.sf(stat, n_dims, n_x + n_y - n_dims - 1)

    self.stat, self.pvalue, self.null_dist = stat, pvalue, None
    return stat, pvalue
def plotMask(p):
    """
    Redraw the heatmap of parameter layer ``p`` with non-significant pixels
    masked out.

    Reads the module-level ``h_map``, ``mask_val``, ``ax1`` and ``titles``,
    removes the current image and colorbar (``im``, ``cbar1``), rebinds the
    global ``im`` to the new image and finally calls ``colorBar()``.

    Parameters
    ----------
    p : int
        Index along the last axis of ``h_map`` of the parameter to display.
    """
    global mask_val
    global cbar1
    global im

    # Drop the previous image and colorbar before drawing new ones.
    cbar1.remove()
    im.remove()

    param = h_map[:, :, p]
    pflat = np.reshape(param, (param.shape[0] * param.shape[1]))
    pNaN = pflat[~np.isnan(pflat)]

    # Colour limits: 1st and 99th percentile of the non-NaN values.
    h_min = np.percentile(pNaN, 1)
    h_max = np.percentile(pNaN, 99)

    # generate p-value heatmap + masked Lorentzian component heatmaps
    # (layer 6 of h_map is fed to the F survival function, so it presumably
    # holds the F statistics -- TODO confirm)
    dof1, dof2 = 3, 6  # degrees of freedom for model M1, M2
    p_val = ff.sf(h_map[:, :, 6], dof1, dof2)

    # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
    param_mask = np.copy(param)
    param_mask[p_val > mask_val] = np.nan

    # determine percentage of region masked
    # (builtin float instead of np.float, which NumPy 1.24 removed)
    count = np.count_nonzero(np.isnan(param_mask))
    total_pix = p_val.shape[0] * p_val.shape[1]
    mask_percent = (float(count) / total_pix) * 100

    # Parameter 4 is drawn with the reversed colormap.
    if p == 4:
        c_map = 'jet_r'
    else:
        c_map = 'jet'

    im = ax1.imshow(param_mask, cmap=c_map, interpolation='nearest',
                    vmin=h_min, vmax=h_max, picker=True)
    ax1.set_title(r'%s | $f_{masked}$ = %0.1f%s' % (titles[p], mask_percent, '%'),
                  y=1.01, fontsize=17)
    colorBar()
def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
    """Return the F statistic and p-value of an F-test.

    Both sums of squares are divided by their degrees of freedom to obtain
    mean squares; the F statistic is the ratio of the two mean squares.

    Arguments:
        numer_ss - Sum of squares for the numerator.
        numer_df - Degrees of freedom for the numerator.
        denom_ss - Sum of squares for the denominator.
        denom_df - Degrees of freedom for the denominator.

    Returns:
        A tuple of two values: element 0 contains the F statistic and
        element 1 contains the associated p-value (upper tail of the F
        distribution with numer_df and denom_df degrees of freedom).
    """
    f_val = (numer_ss / numer_df) / (denom_ss / denom_df)
    return f_val, f.sf(f_val, numer_df, denom_df)
def f_test(chi1, chi2, ndata, par1, par2, alpha=0.005, verbose=True):
    '''Do the f-test between two nested fits. Function from Sepideh.

    Arguments:
        chi1: chi-square of the fit with fewer parameters.
        chi2: chi-square of the fit with more parameters.
        ndata: number of data points.
        par1: number of parameters of the smaller model.
        par2: number of parameters of the larger model.
        alpha: significance level the p-value is compared against. The
            previous code overwrote this argument with 0.005 inside the
            body; the argument is now honoured and its default is 0.005, so
            calls that do not pass `alpha` give the same result as before.
        verbose: when False nothing is printed (previously ignored).

    Returns 0 if less params is better, 1 if null-hypothesis rejected and
    more params give gain.
    '''
    extra_dof = par2 - par1
    resid_dof = ndata - par2

    chi2_red = chi2 / resid_dof
    F = ((chi1 - chi2) / extra_dof) / chi2_red  # F statistic
    p_value = f.sf(F, extra_dof, resid_dof, loc=0, scale=1)

    better_model = 1 if p_value < alpha else 0

    if verbose:
        print("p_value", p_value)
        print("log(p_value)", np.log10(p_value))
        if better_model:
            print("Null hypothesis rejected. Model 1 with mre params is better.")
            print("Probability = ", (1.0 - p_value) * 100.0, '%')
        else:
            # Implicit concatenation instead of a backslash continuation,
            # which embedded the source indentation in the message.
            print("Null hypothesis cannot be rejected. "
                  "Seems model 0 with less params is good enough.")

    return better_model
def f_oneway(self, *args):
    """One-way ANOVA F-test across the given groups.

    Each positional argument is one group (a sequence of numbers). Groups
    may have different sizes: the grand mean, the between-group sum of
    squares and the degrees of freedom use each group's own length. For
    equal-sized groups the result is the same as before.

    Returns ``[F, P]``, both rounded to 6 decimals, where ``P`` is the upper
    tail probability of the F distribution with (groups - 1,
    observations - groups) degrees of freedom.

    Returns ``[None, None]`` when the statistic cannot be computed: no
    groups, an empty group, a single group, no residual degrees of freedom,
    or zero within-group variation.
    """
    if not args or any(len(group) == 0 for group in args):
        return [None, None]

    sizes = [len(group) for group in args]
    n_groups = len(args)
    n_total = sum(sizes)

    group_sums = [sum(group) for group in args]
    group_means = [total / float(size) for total, size in zip(group_sums, sizes)]
    grand_mean = sum(group_sums) / float(n_total)

    # Between-group and within-group sums of squares.
    ss_between = sum(size * (mean - grand_mean) ** 2
                     for size, mean in zip(sizes, group_means))
    ss_within = sum((value - mean) ** 2
                    for group, mean in zip(args, group_means)
                    for value in group)

    df_between = n_groups - 1
    df_within = n_total - n_groups
    # Any of these would make one of the divisions below divide by zero.
    if df_between == 0 or df_within == 0 or ss_within == 0:
        return [None, None]

    F = (ss_between / df_between) / (ss_within / df_within)
    P = f.sf(F, df_between, df_within)
    return [round(F, 6), round(P, 6)]
## Depict P(F(1, n) > F) ie. folor the surface defined by x values larger than F beloww the F(1, n) from scipy.stats import f fvalues = np.linspace(10, 25, 100) plt.plot(fvalues, f.pdf(fvalues, 1, 30), 'b-', label="F(1, 30)") upper_fval_fvalues = fvalues[fvalues > fval] plt.fill_between(upper_fval_fvalues, 0, f.pdf(upper_fval_fvalues, 1, 30), alpha=.8) # pdf(x, df1, df2): Probability density function at x of the given RV. plt.legend() ## P(F(1, n) > F) is the p-value, compute it # Survival function (1 - `cdf`) pval = f.sf(fval, 1, n - 2) ## With statmodels from statsmodels.formula.api import ols model = ols('S ~ X', salary) results = model.fit() print(results.summary()) ## sklearn import sklearn.feature_selection #sklearn.feature_selection.f_regression?? sklearn.feature_selection.f_regression(x.reshape((n, 1)), y) """
def granger_causality(self):
    """Returns the f-stats and p-values from the Granger Causality Test.

    For every column, a reduced regression is fitted on the lags of all the
    *other* columns, and its residual sum of squares is compared with that
    of the full model (``self.resid``). E.g. with columns x1, x2, x3 the
    regressions for x1 are:

    x1 ~ L(x2, x3)
    x1 ~ L(x1, x3)
    x1 ~ L(x1, x2)

    The f-stats of these results are placed in the 'x1' column of the
    returned DataFrame. The same is repeated for x2, x3.

    Returns
    -------
    Dict, where 'f-stat' returns the DataFrame containing the f-stats,
    and 'p-value' returns the DataFrame containing the corresponding
    p-values of the f-stats.
    """
    from pandas.stats.api import ols
    from scipy.stats import f

    # Regressors of each reduced model, keyed by the column left out:
    # every lag (1..p) of every remaining column.
    reduced_regressors = {}
    for excluded in self._columns:
        kept_columns = self._columns - [excluded]
        regressors = {}
        for lag in xrange(1, 1 + self._p):
            lagged = self._lagged_data[lag].filter(kept_columns)
            for name, series in lagged.iteritems():
                regressors[_make_param_name(lag, name)] = series
        reduced_regressors[excluded] = regressors

    # Degrees of freedom of the F-test: M restrictions, N - K residual.
    M = self._p
    N = self._nobs
    K = self._k * self._p + 1

    f_stat_dict = {}
    p_value_dict = {}
    for target, y in self._data.iteritems():
        ssr_full = (self.resid[target] ** 2).sum()

        f_stats = []
        p_values = []
        for excluded in self._columns:
            reduced_fit = ols(y=y, x=reduced_regressors[excluded])
            ssr_reduced = (reduced_fit.resid ** 2).sum()

            f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
            f_stats.append(f_stat)
            p_values.append(f.sf(f_stat, M, N - K))

        f_stat_dict[target] = Series(f_stats, self._columns)
        p_value_dict[target] = Series(p_values, self._columns)

    return {
        'f-stat': DataFrame(f_stat_dict),
        'p-value': DataFrame(p_value_dict),
    }
def typeII(response, ancova, recarray):
    """
    Build an ANCOVA table with type II sums of squares for the given
    ANCOVA formula.

    Every term is tested inside the sub-model returned by
    ``ancova.all_but_above``; the F ratio of each row uses the residual
    mean square of the full model as its denominator.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response

    Returns
    -------
    Record array with fields 'contrast', 'SS', 'df', 'MS', 'F' and
    'p_value': one row per contrast plus a final 'Residuals' row whose
    F and p-value are NaN.
    """
    Y = recarray[response]

    # Full model: provides the error mean square shared by all F ratios.
    full_fit = OLS(Y, ancova.formula.design(recarray, return_float=True)).fit()
    SSE_F = np.sum(full_fit.resid**2)
    df_F = full_fit.df_resid

    names, sss, fs, dfs, pvals = [], [], [], [], []

    for name, (expr, factors) in zip(ancova.contrast_names, ancova.sequence()):
        sub_ancova = ancova.all_but_above(expr, factors)
        contrast = ancova.contrasts[name]
        XF, contrast_matrices = sub_ancova.formula.design(recarray,
                                                          contrasts={'C':contrast})
        sub_fit = OLS(Y, XF).fit()
        SSEF = np.sum(sub_fit.resid**2)
        dfF = sub_fit.df_resid
        ftest = sub_fit.f_test(contrast_matrices['C'])

        # Reduced-model SSE and df are rebuilt from the contrast's F-test
        # rather than from a separate fit.
        SSER = SSEF + ftest.fvalue * ftest.df_num * (SSEF / dfF)
        dfR = dfF + ftest.df_num

        ss_term = SSER - SSEF
        df_term = dfR - dfF
        f_ratio = (ss_term / df_term) / (SSE_F / df_F)

        names.append(name)
        sss.append(ss_term)
        dfs.append(ftest.df_num)
        fs.append(f_ratio)
        pvals.append(f_dbn.sf(f_ratio, df_term, df_F))

    # Add in the "residual row"
    names.append('Residuals')
    sss.append(SSE_F)
    dfs.append(df_F)
    fs.append(np.nan)
    pvals.append(np.nan)

    name_width = max(len(n) for n in names)
    result = np.array(names, np.dtype([('contrast', 'S%d' % name_width)]))
    mean_squares = np.array(sss) / np.array(dfs)
    return ML.rec_append_fields(result,
                                ['SS', 'df', 'MS', 'F', 'p_value'],
                                [sss, dfs, mean_squares, fs, pvals])
def typeI(response, ancova, recarray):
    """
    Produce an ANCOVA table from a given ANCOVA formula with type I
    (sequential) sums of squares, where the order is based on the order of
    terms in the contrast_names of ancova.

    Every F ratio divides the term's mean square by the residual mean
    square of the full model, so every p-value uses the full model's
    residual degrees of freedom as denominator degrees of freedom.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response

    Returns
    -------
    Record array with fields 'contrast', 'SS', 'df', 'MS', 'F' and
    'p_value': one row per formula plus a final 'Residuals' row whose
    F and p-value are NaN.
    """
    Y = recarray[response]

    # Full model: provides the error mean square shared by all F ratios.
    X = ancova.formula.design(recarray, return_float=True)
    results = OLS(Y, X).fit()
    SSE_F = np.sum(results.resid**2)
    df_F = results.df_resid

    # Model containing only the first formula.
    results = OLS(Y, ancova.formulae[0].design(recarray, return_float=True)).fit()
    SSE_old = np.sum(results.resid**2)
    df_old = results.df_resid

    # With a single formula the loop below never runs; seed the "latest fit"
    # so the residual row no longer raises NameError in that case.
    SSE_new, df_new = SSE_old, df_old

    names = []
    sss = []
    fs = []
    dfs = []
    pvals = []

    # First row: the first formula against the uncorrected total np.sum(Y**2).
    names.append(ancova.contrast_names[0])
    fs.append(((np.sum(Y**2) - SSE_old) / (Y.shape[0] - df_old)) / (SSE_F / df_F))
    sss.append((np.sum(Y**2) - SSE_old))
    dfs.append(Y.shape[0] - df_old)
    pvals.append(f_dbn.sf(fs[-1], Y.shape[0]-df_old, df_F))

    for d in range(1, len(ancova.formulae)):
        terms = []
        for formula in ancova.formulae[:(d+1)]:
            terms += list(formula.terms)

        # JT: this is not numerically efficient
        # could be done by updating some factorization of the full X

        X = Formula(terms).design(recarray, return_float=True)
        results = OLS(Y, X).fit()
        SSE_new = np.sum(results.resid**2)
        df_new = results.df_resid

        sss.append(SSE_old - SSE_new)
        dfs.append(df_old - df_new)
        fs.append(((SSE_old-SSE_new) / (df_old - df_new)) / (SSE_F / df_F))
        # The denominator of the F ratio is SSE_F / df_F, so the reference
        # distribution has df_F denominator degrees of freedom (previously
        # df_new, which only matches for the last term).
        pvals.append(f_dbn.sf(fs[-1], df_old-df_new, df_F))
        names.append(ancova.contrast_names[d])
        SSE_old = SSE_new
        df_old = df_new

    # Add in the "residual row"
    sss.append(SSE_new)
    dfs.append(df_new)
    pvals.append(np.nan)
    fs.append(np.nan)
    names.append('Residuals')

    result = np.array(names, np.dtype([('contrast','S%d' % max([len(n) for n in names]))]))
    result = ML.rec_append_fields(result, ['SS', 'df', 'MS', 'F', 'p_value'], [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
    return result
def lomb(time, signal, error, f1, df, numf, nharm=8, psdmin=6., detrend_order=0,freq_zoom=10.,tone_control=1.,return_model=True,lambda0=1.,lambda0_range=[-8,6]):
    """
    C version of lomb_scargle:
    Simultaneous fit of a sum of sinusoids by weighted, linear least squares.
        model(t) = Sum_k Ck*t^k + Sum_i Sum_j Aij sin(2*pi*j*fi*(t-t0)+phij),
                   i=[1,nfreq], j=[1,nharm]
        [t0 defined such that ph11=0]

    The heavy lifting is done by the C snippet `lomb_code` run through
    `weave.inline`; this wrapper prepares the weighted, detrended inputs and
    turns the arrays the C code works on into the returned quantities.

    Inputs:
        time: time vector
        signal: data vector
        error: data uncertainty vector
        f1: first frequency of the grid
        df: frequency step
        numf: number of frequencies to consider
        detrend_order: order of polynomial detrending (Ck orthogonal
             polynomial terms above; 0 floating mean; <0 no detrending)
        psdmin: refine periodogram values with larger psd using
             multi-harmonic fit
        nharm: number of harmonics to use in refinement
        freq_zoom: rounded to the nearest even number; df/freq_zoom is the
             fine frequency step handed to the C code
        tone_control: negative values are replaced by 0; otherwise passed
             to the C code unchanged (its meaning is defined there)
        return_model: when True, out_dict also gets 'trend', 'model',
             'model_error' and 'trend_error'
        lambda0: typical value for regularization parameter (expert parameter)
        lambda0_range: allowable range for log10 of regularization parameter

    Output:
        psd: power spectrum (scaled by 0.5/varcn) on the numf-point
             frequency grid starting at f1 with step df
        out_dict: dictionary describing various parameters of the
             multiharmonic fit at the best-fit frequency
    """
    numt = len(time)

    # Force freq_zoom to an even number.
    freq_zoom = round(freq_zoom/2.)*2.

    # dord is the detrend order clamped at 0; it sizes the trend arrays.
    dord = detrend_order
    if (detrend_order<0):
        dord=0

    if (tone_control<0):
        tone_control=0.

    # polynomial terms
    coef = empty(dord+1,dtype='float64')
    norm = empty(dord+1,dtype='float64')

    # Inverse-error weights scaled to unit length; s0 = sum(1/error**2).
    wth0 = (1./error).astype('float64')
    s0 = dot(wth0,wth0)
    wth0 /= sqrt(s0)

    # Weighted signal with its projection onto wth0 removed.
    cn = (signal*wth0).astype('float64')
    coef[0] = dot(cn,wth0); cn0 = coef[0]; norm[0] = 1.
    cn -= coef[0]*wth0
    vcn = 1.

    # sin's and cosin's for later
    # (all of them are listed in `vars` and so handed to the C code)
    tt = 2*pi*time.astype('float64')
    sinx,cosx = sin(tt*f1)*wth0,cos(tt*f1)*wth0
    sinx_step,cosx_step = sin(tt*df),cos(tt*df)
    sinx_back,cosx_back = -sin(tt*df/2.),cos(tt*df/2)
    sinx_smallstep,cosx_smallstep = sin(tt*df/freq_zoom),cos(tt*df/freq_zoom)

    # Work arrays created uninitialised here and read after weave.inline,
    # so presumably filled in place by the C code -- TODO confirm.
    npar=2*nharm
    hat_matr = empty((npar,numt),dtype='float64')
    hat0 = empty((npar,dord+1),dtype='float64')
    hat_hat = empty((npar,npar),dtype='float64')
    soln = empty(npar,dtype='float64')
    psd = zeros(numf,dtype='float64')

    # detrend the data and create the orthogonal detrending basis
    if (dord>0):
        wth = empty((dord+1,numt),dtype='float64')
        wth[0,:] = wth0
    else:
        wth = wth0

    # Each new basis vector is the previous one times t, made orthogonal to
    # all earlier vectors and normalised; its component is removed from cn.
    for i in range(detrend_order):
        f = wth[i,:]*tt/(2*pi)
        for j in range(i+1):
            f -= dot(f,wth[j,:])*wth[j,:]
        norm[i+1] = sqrt(dot(f,f)); f /= norm[i+1]
        coef[i+1] = dot(cn,f)
        cn -= coef[i+1]*f
        wth[i+1,:] = f
        vcn += (f/wth0)**2

    chi0 = dot(cn,cn)
    varcn = chi0/(numt-1-dord)
    psdmin *= 2*varcn

    # 0-d arrays, presumably so the C code can write results back into them
    # (Tr and ifreq are read below) -- TODO confirm.
    Tr = array(0.,dtype='float64')
    ifreq = array(0,dtype='int32')
    lambda0 = array(lambda0/s0,dtype='float64')
    # Rebinds the local name only; the list default argument is not mutated.
    lambda0_range = 10**array(lambda0_range,dtype='float64')/s0

    # Names of the local variables exposed to the C code.
    vars=['numt','numf','nharm','detrend_order','psd','cn','wth','sinx','cosx','sinx_step','cosx_step','sinx_back','cosx_back','sinx_smallstep','cosx_smallstep','hat_matr','hat_hat','hat0','soln','chi0','freq_zoom','psdmin','tone_control','lambda0','lambda0_range','Tr','ifreq']

    weave.inline(lomb_code, vars, support_code = eigs_code + lomb_scargle_support,force=0)

    hat_hat /= s0
    ii = arange(nharm,dtype='int32')
    # Divide both halves of the solution by (harmonic number)**2.
    soln[0:nharm] /= (1.+ii)**2; soln[nharm:] /= (1.+ii)**2

    if (detrend_order>=0):
        hat_matr0 = outer(hat0[:,0],wth0)
    for i in range(detrend_order):
        hat_matr0 += outer(hat0[:,i+1],wth[i+1,:])

    # NOTE(review): hat_matr0 is only assigned when detrend_order >= 0, yet it
    # is read unconditionally on the next line (and again for 'trend_error'),
    # so detrend_order < 0 looks like it raises NameError -- confirm.
    # modl0 is not used anywhere below.
    modl = dot(hat_matr.T,soln); modl0 = dot(hat_matr0.T,soln)

    coef0 = dot(soln,hat0)
    coef -= coef0
    if (detrend_order>=0):
        hat_matr -= hat_matr0

    out_dict={}
    out_dict['chi0'] = chi0*s0
    if (return_model):
        if (dord>0):
            out_dict['trend'] = dot(coef,wth)/wth0
        else:
            out_dict['trend'] = coef[0] + 0*wth0
        out_dict['model'] = modl/wth0 + out_dict['trend']

    # Best frequency: grid point with the largest psd, shifted by the
    # offset encoded in ifreq.
    j = psd.argmax()
    freq = f1+df*j + (ifreq/freq_zoom - 1/2.)*df
    # tt now holds the phase-folded times; `s` is not used below.
    tt = (time*freq) % 1. ; s =tt.argsort()
    out_dict['freq'] = freq
    out_dict['s0'] = s0
    out_dict['chi2'] = (chi0 - psd[j])*s0
    out_dict['psd'] = psd[j]*0.5/varcn
    out_dict['lambda0'] = lambda0*s0
    out_dict['gcv_weight'] = (1-3./numt)/Tr
    out_dict['trace'] = Tr
    out_dict['nu0'] = numt - npar
    npars = (1-Tr)*numt/2
    out_dict['nu'] = numt-npars
    out_dict['npars'] = npars

    # First half of soln -> A0, second half -> B0; hat_hat receives the same
    # (harmonic number)**2 scaling on both axes before its diagonal and the
    # (ii, nharm+ii) entries are used as variances and covariances.
    A0, B0 = soln[0:nharm],soln[nharm:]
    hat_hat /= outer( hstack(((1.+ii)**2,(1.+ii)**2)),hstack(((1.+ii)**2,(1.+ii)**2)) )
    err2 = diag(hat_hat)
    vA0, vB0 = err2[0:nharm], err2[nharm:]
    covA0B0 = hat_hat[(ii,nharm+ii)]

    if (return_model):
        vmodl = vcn/s0 + dot( (hat_matr/wth0).T, dot(hat_hat, hat_matr/wth0) )
        vmodl0 = vcn/s0 + dot( (hat_matr0/wth0).T, dot(hat_hat, hat_matr0/wth0) )
        out_dict['model_error'] = sqrt(diag(vmodl))
        out_dict['trend_error'] = sqrt(diag(vmodl0))

    # Per-harmonic amplitude and phase; phases are taken relative to the
    # first harmonic and wrapped through arctan2(sin, cos).
    amp = sqrt(A0**2+B0**2)
    damp = sqrt( A0**2*vA0 + B0**2*vB0 + 2.*A0*B0*covA0B0 )/amp
    phase = arctan2( B0,A0 )
    rel_phase = phase - phase[0]*(1.+ii)
    rel_phase = arctan2( sin(rel_phase),cos(rel_phase) )
    dphase = 0.*rel_phase
    # Relative-phase error of harmonic j from the 4x4 block of hat_hat
    # indexed by jj; dphase[0] stays 0.
    for i in range(nharm-1):
        j=i+1
        v = array([-A0[0]*(1.+j)/amp[0]**2,B0[0]*(1.+j)/amp[0]**2,A0[j]/amp[j]**2,-B0[j]/amp[j]**2])
        jj=array([0,nharm,j,j+nharm])
        m = hat_hat[ix_(jj,jj)]
        dphase[j] = sqrt( dot(dot(v,m),v) )

    out_dict['amplitude'] = amp
    out_dict['amplitude_error'] = damp
    out_dict['rel_phase'] = rel_phase
    out_dict['rel_phase_error'] = dphase
    out_dict['time0'] = -phase[0]/(2*pi*freq)

    # Undo the normalisation of the trend basis vectors.
    ncp = norm.cumprod()
    out_dict['trend_coef'] = coef/ncp
    out_dict['cn0'] = out_dict['trend_coef'][0] - cn0
    out_dict['trend_coef_error'] = sqrt( ( 1./s0 + diag(dot(hat0.T,dot(hat_hat,hat0))) )/ncp**2 )
    out_dict['cn0_error'] = out_dict['trend_coef_error'][0]

    # Significance: F-distribution tail probability of the chi-square
    # improvement, with (2, numt-1-dord) degrees of freedom, converted by
    # lprob2sigma.
    prob = fdist.sf( 0.5*(numt-1.-dord)*(1.-out_dict['chi2']/out_dict['chi0']), 2,numt-1-dord )
    out_dict['signif'] = lprob2sigma(log(prob))

    return 0.5*psd/varcn,out_dict