def f_test_two_p_variance(var1, var2, n1, n2, alpha_level):
    '''
    Perform an F-test for comparing variances between two samples.
    Inputs:
        var1: the variance of population 1.
        var2: the variance of population 2.
        n1: sample size of population 1.
        n2: sample size of population 2.
        alpha_level: the alpha threshold for rejecting the null hypothesis
                     (e.g. 0.1, 0.05, or 0.01).
    Outputs:
        F_ratio: the ratio of variances between the two populations.
        p_value: two-sided p-value.
        (lower, upper): confidence interval for the variance ratio at the
                        given alpha level.
    '''
    # F-test statistic
    F_ratio = var1 / var2
    # Two-sided p-value (the interest is whether the two variances are equal)
    p_value = min(f.cdf(F_ratio, n1 - 1, n2 - 1),
                  1 - f.cdf(F_ratio, n1 - 1, n2 - 1))
    p_value = p_value * 2
    # e.g. a 95% C.I. if alpha_level == 0.05
    upper = 1 / f.ppf(alpha_level / 2, n1 - 1, n2 - 1) * F_ratio
    lower = 1 / f.ppf(1 - (alpha_level / 2), n1 - 1, n2 - 1) * F_ratio
    # Outputs
    return round(F_ratio, 4), round(p_value, 4), (round(lower, 4), round(upper, 4))
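# A minimal usage sketch for f_test_two_p_variance above. It assumes
# `from scipy.stats import f` is in scope (the function relies on it);
# the sample values are invented for illustration.
import numpy as np
from scipy.stats import f

sample1 = [4.2, 5.1, 3.8, 6.0, 5.5, 4.9]
sample2 = [3.9, 4.0, 4.1, 4.3, 3.7, 4.2]
F_ratio, p_value, ci = f_test_two_p_variance(
    np.var(sample1, ddof=1), np.var(sample2, ddof=1),
    len(sample1), len(sample2), alpha_level=0.05)
print(F_ratio, p_value, ci)  # F ratio, two-sided p-value, 95% C.I.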
def mc_wrapper(model_func, data, origin, weights, iterations, stat_cutoff):  # GET IT?! LOL
    # the actual Monte Carlo function for searching the error space
    rng = default_rng()
    n_pop, n_params = origin.shape
    random_hops = rng.standard_normal((iterations, n_pop, n_params)) * weights
    landscape_RSS = np.ones(iterations)
    model_landscape = np.zeros((iterations, n_pop, n_params))
    origin_RSS = np.linalg.norm(data - model_func(origin))
    model_DOF = len(data) - np.prod((n_pop, n_params))
    new_hop = origin
    for idx, hop in enumerate(random_hops):
        hop_RSS = np.linalg.norm(data - model_func(new_hop))
        landscape_RSS[idx] = hop_RSS
        model_landscape[idx] = new_hop
        if ftest.cdf(hop_RSS / origin_RSS, model_DOF, model_DOF) > stat_cutoff:
            new_hop = abs(origin + hop)   # prevent negative values, particularly in amp
        else:
            new_hop = abs(new_hop + hop)  # prevent negative values, particularly in amp
    landscape_statistics = ftest.cdf(landscape_RSS / origin_RSS, model_DOF, model_DOF)
    return model_landscape, landscape_statistics
def f_test(data1, data2, tail="both", ratio=1):
    """
    F-distribution test
    :param data1: sample values 1
    :param data2: sample values 2
    :param tail: tail type
    :param ratio: hypothesized ratio of the two variances under H0
    :return:
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'
    n1 = len(data1)
    n2 = len(data2)
    sample1_var = variance(data1)
    sample2_var = variance(data2)
    f_val = sample1_var / sample2_var / ratio
    df1 = n1 - 1
    df2 = n2 - 1
    if tail == "both":
        p = 2 * min(1 - f.cdf(f_val, df1, df2), f.cdf(f_val, df1, df2))
    elif tail == "left":
        p = f.cdf(f_val, df1, df2)
    else:
        p = 1 - f.cdf(f_val, df1, df2)
    return f_val, df1, df2, p
def f_test_var(data1, data2):
    """
    F Test to test hypothesis if two samples have different variances.

    H0: samples have same variances (p-value close to one).

    Parameters
    ----------
    data1: n,1 - dim array with data
    data2: n,1 - dim array with data

    Returns
    -------
    p-value of F test

    Notes
    -----
    See 3rd Edition of Numerical Recipes, chapter 14.2.2, p. 730.
    """
    var1, var2 = np.var(data1, ddof=1), np.var(data2, ddof=1)  # compute variance
    df1, df2 = len(data1) - 1, len(data2) - 1  # compute degrees of freedom
    if var1 > var2:
        prob = 2. * f.cdf(var1 / var2, df1, df2)
    else:
        prob = 2. * f.cdf(var2 / var1, df2, df1)
    if prob > 1.:
        return 2. - prob
    else:
        return prob
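# Quick sanity check of f_test_var above (assumes `import numpy as np` and
# `from scipy.stats import f`, as the function requires). Under H0 of equal
# variances the returned p-value should typically be large.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, size=50)
b = rng.normal(0.0, 1.0, size=60)
print(f_test_var(a, b))  # usually well above 0.05 here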
def perform_f_test(data1, data2, alpha, alternative):
    statistics = np.var(data1, ddof=1) / np.var(data2, ddof=1)
    df1 = len(data1) - 1
    df2 = len(data2) - 1
    if alternative == "≠":
        print(
            f"< 0, {round(f.ppf(alpha / 2, df1, df2), 3)} > "
            f"< {round(f.ppf(1 - alpha / 2, df1, df2), 3)} , ∞)"
        )
    elif alternative == "<":
        print(f"< 0, {round(f.ppf(alpha, df1, df2), 3)} >")
    elif alternative == ">":
        print(f"< {round(f.ppf(1 - alpha, df1, df2), 3)} , ∞)")
    else:
        print("Incorrect alternative")
        return
    print(f"Test statistics: {round(statistics, 4)}")
    median = f.ppf(0.5, df1, df2)
    if alternative == "≠":
        # double the smaller tail, split at the distribution's median
        if statistics < median:
            pvalue = f.cdf(statistics, df1, df2) * 2
        else:
            pvalue = (1 - f.cdf(statistics, df1, df2)) * 2
    elif alternative == "<":
        pvalue = f.cdf(statistics, df1, df2)
    else:
        pvalue = 1 - f.cdf(statistics, df1, df2)
    if pvalue < alpha:
        print("H0 rejected")
    else:
        print("H0 NOT rejected")
    print(f"p-value: {round(pvalue, 4)}")
    return pvalue
def anova_twoway(data):
    """Two-way ANOVA for a 2x2 design."""
    r, s = 2, 2
    data = np.array(data)
    group_szs = np.tile(np.size(data, axis=1), (np.size(data, axis=0), 1))
    n = sum(group_szs)  # total number of observations
    # Compute means
    group_means = np.mean(data, axis=1)
    group_mean = group_means.dot(group_szs) / n
    group_i_means = np.array([mean(group_means[:2]), mean(group_means[2:])])
    group_j_means = np.array([(group_means[0] + group_means[2]) / 2,
                              (group_means[1] + group_means[3]) / 2])
    # Effects at each level of factors i and j
    group_i_effect = group_i_means - group_mean
    group_j_effect = group_j_means - group_mean
    # Interaction effect between i and j
    group_ij_effect = (group_means.reshape(2, 2) - np.tile(group_mean, (2, 2))) \
        - np.tile(group_i_effect, (2, 1)).T - np.tile(group_j_effect, (2, 1))
    # Total variation
    sst = np.sum((data - group_mean)**2)
    # Variation due to the first factor
    ss_method = ((group_i_means - group_mean)**2).dot(
        [np.sum(group_szs[:2]), np.sum(group_szs[2:])])
    # Variation due to the second factor
    ss_reward = ((group_j_means - group_mean)**2).dot([
        np.sum([group_szs[0], group_szs[2]]),
        np.sum([group_szs[1], group_szs[3]])
    ])
    # Variation due to the interaction of the two factors
    ss_mr = (group_ij_effect.reshape(1, 4)**2).dot(group_szs)
    # Variation due to other (residual) factors
    ss_error = np.sum((data - group_means.reshape(-1, 1))**2)
    # Mean square of the residual error
    ms_error = ss_error / (n - r * s)
    # MS, F and p values for the first factor
    ms_method = ss_method / (r - 1)
    f_ms_method = ms_method / ms_error
    p_ms_method = 1 - f.cdf(f_ms_method, r - 1, n - r * s)
    # MS, F and p values for the second factor
    ms_reward = ss_reward / (r - 1)
    f_ms_reward = ms_reward / ms_error
    p_ms_reward = 1 - f.cdf(f_ms_reward, r - 1, n - r * s)
    # MS, F and p values for the interaction
    ms_mr = ss_mr / (r - 1)
    f_ms_mr = ms_mr / ms_error
    p_ms_mr = 1 - f.cdf(f_ms_mr, r - 1, n - r * s)
    # Assemble the rows of the output table
    method = [r - 1, ss_method, ms_method, f_ms_method, p_ms_method]
    reward = [r - 1, ss_reward, ms_reward, f_ms_reward, p_ms_reward]
    mr = [r - 1, ss_mr, ms_mr, f_ms_mr, p_ms_mr]
    residuals = [n - r * s, ss_error, ms_error, None, None]
    return np.array([method, reward, mr, residuals]).astype(np.float32)
def app_time(x, dfn, dfd, a, b):
    mean = 0.0
    # F-distribution for duration, truncated from a to b
    dist = np.divide(f.pdf(x, dfn, dfd),
                     (f.cdf(b, dfn, dfd) - f.cdf(a, dfn, dfd)))
    dist = np.divide(dist, np.sum(dist))  # normalization
    for item in zip(x, dist):
        mean = mean + (item[0] * item[1])  # expectation of duration
    return dist, mean
def dof(res1, v1, res2, v2):
    # Calculate chi**2 sums.
    Ea_1 = np.sum(res1**2, axis=0)
    Ea_2 = np.sum(res2**2, axis=0)
    Fobs = (Ea_1 / v1) / (Ea_2 / v2)
    P = 1 - (f.cdf(Fobs, v1, v2) - f.cdf(1 / Fobs, v1, v2))
    return P
def two_way(data, f1_name, f2_name):
    """Run two-way analysis of variance in a factor-by-factor design.

    * Identify main effects for each factor.
    * Identify interaction between factors.
    * Print a table with an SPSS-style output.

    Parameters
    ----------
    data: ndarray
        | Each row represents a 1st factor level.
        | Each column represents a 2nd factor level.
        | Each layer (depth dimension) is an observation.
    """
    # Sums of squares
    factor_1_effect, factor_2_effect, within_error = factor_sumofsq(data)
    total_sumofsq = np.sum((data.ravel() - data.mean())**2)
    interaction_sumofsq = (total_sumofsq - factor_1_effect
                           - factor_2_effect - within_error)
    # Degrees of freedom
    factor_1_df, factor_2_df = data.shape[1] - 1, data.shape[2] - 1
    error_df = (data.shape[0] - 1) * (data.shape[1] * data.shape[2])
    interaction_df = factor_1_df * factor_2_df
    # total_df = factor_1_df + factor_2_df + error_df + interaction_df
    # Mean squares
    within_mean_ssq = within_error / error_df
    f1_mean_ssq, f2_mean_ssq = (factor_1_effect / factor_1_df,
                                factor_2_effect / factor_2_df)
    interaction_ssq = interaction_sumofsq / interaction_df
    # F values
    F1, F2 = f1_mean_ssq / within_mean_ssq, f2_mean_ssq / within_mean_ssq
    F_interaction = interaction_ssq / within_mean_ssq
    # P values
    p_F1 = 1 - f.cdf(F1, factor_1_df, error_df)
    p_F2 = 1 - f.cdf(F2, factor_2_df, error_df)
    p_interaction = 1 - f.cdf(F_interaction, interaction_df, error_df)
    print(tabulate(
        [[f1_name, f1_mean_ssq, factor_1_df, F1, p_F1],
         [f2_name, f2_mean_ssq, factor_2_df, F2, p_F2],
         ['Interaction', interaction_ssq, interaction_df,
          F_interaction, p_interaction]],
        ['Source', 'Mean square', 'df', 'F-values', 'p-values'],
        tablefmt='grid'))
def granger_causality_test(self, alpha=0.05):
    """
    Computes the Granger causality test on the bivariate VAR model.

    :param alpha: (float) Significance level (0.05 by default)
    :return: (dict) Whether x Granger-causes y, and vice versa
    """
    # Get lagged matrix and the two response variables
    idx = self.lag_order + self.fit_intercept
    ydx = range(idx, self.design.shape[1])
    ydx = [0] + list(ydx) if self.fit_intercept else ydx
    xlag = self.design[:, :idx]
    ylag = self.design[:, ydx]
    x = self.response[:, 0]
    y = self.response[:, 1]
    # Regress x against lags of itself
    self.lr.fit_intercept = False
    self.lr.fit(xlag, x)
    xrss_r, xddof_r = self.lr.rss, self.lr.ddof
    # Regress y against lags of itself
    self.lr.fit(ylag, y)
    yrss_r, yddof_r = self.lr.rss, self.lr.ddof
    # Get unrestricted rss from the original VAR model
    x_resid = self.residuals[:, 0]
    y_resid = self.residuals[:, 1]
    xrss_u = x_resid @ x_resid
    yrss_u = y_resid @ y_resid
    xddof_u = x_resid.shape[0] - self.k_params / 2
    yddof_u = y_resid.shape[0] - self.k_params / 2
    # Compute F test
    f_stat_x = (xrss_r - xrss_u) / (xddof_r - xddof_u)
    f_stat_x *= xddof_u / xrss_u
    f_stat_y = (yrss_r - yrss_u) / (yddof_r - yddof_u)
    f_stat_y *= yddof_u / yrss_u
    # P-value for F test
    x_pval = ftest.cdf(f_stat_x, xddof_r, xddof_u)
    y_pval = ftest.cdf(f_stat_y, yddof_r, yddof_u)
    # Null hypothesis is that x does not Granger-cause y
    result = {}
    result['x_granger_causes_y'] = x_pval < alpha
    result['y_granger_causes_x'] = y_pval < alpha
    return result
def get_p_value(year, month):
    global info
    if year == 2018:
        info = minK_2018
        f_stat = (((info[month][1] - info[month][2]) / 2)
                  / (info[month][2] / (info[month][3] - 4)))
        p_value_2018[month] = 1 - fisher_f.cdf(f_stat, 2, info[month][3] - 4)
        # p_value_2018[month] = f_stat
    elif year == 2019:
        info = minK_2019
        f_stat = (((info[month][1] - info[month][2]) / 2)
                  / (info[month][2] / (info[month][3] - 4)))
        p_value_2019[month] = 1 - fisher_f.cdf(f_stat, 2, info[month][3] - 4)
def f_compare(ndata, nparas, new_chi, best_chi, nfix=1):
    """Return the probability calculated using the F-test.

    The null model (i.e., best-fit solution) is compared to an alternate
    model where one or more parameters are fixed.

    Parameters
    ----------
    ndata : int
        Number of data points: :math:`N`.
    nparas : int
        Number of variables in the alternate model.
    new_chi : float
        Chi-square of the alternate model.
    best_chi : float
        Chi-square of the null model.
    nfix : int
        Number of fixed parameters (default is 1).

    Returns
    -------
    prob : float
        Value of the calculated probability.
    """
    nparas = nparas + nfix
    nfree = ndata - nparas
    nfix = 1.0 * nfix
    dchi = new_chi / best_chi - 1.0
    return f.cdf(dchi * nfree / nfix, nfix, nfree)
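# Hedged example for f_compare above: a best fit with 3 free parameters is
# compared to an alternate fit with one of them fixed. The chi-square
# values are illustrative, not from real data.
prob = f_compare(ndata=100, nparas=3, new_chi=120.0, best_chi=110.0, nfix=1)
print(prob)  # confidence that fixing the parameter significantly worsens the fit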
def ap_TS(mr, mf):
    dT, dN = mr.shape
    dT, dK = mf.shape
    valpha = np.empty((dN, 1))
    valpha_t = np.empty((dN, 1))
    mresid = np.empty((dT, dN))
    # Time-series regressions
    vones = np.ones((dT, 1))
    for i in range(0, dN):
        vres = newey(mr[:, i], np.hstack((vones, mf)).reshape(dT, dK + 1), 0)
        valpha[i] = vres.beta[0]
        valpha_t[i] = vres.tstat[0]
        mresid[:, i] = vres.resid
    ## Properties of risk premia
    vlambda = np.mean(mf, 0).transpose()
    vlambda_t = vlambda / np.sqrt(np.diag(np.cov(mf, rowvar=0)) / dT)
    ## GRS test
    dGRS = ((dT - dN - dK) / dN) * 1 / (
        1 + np.mean(mf, 0) @ np.linalg.inv(np.cov(mf, rowvar=0, bias=True))
        @ np.mean(mf, 0).transpose()) * valpha.transpose() @ np.linalg.inv(
        np.cov(mresid, rowvar=0, bias=True)) @ valpha
    dGRS_p = 1 - f.cdf(dGRS, dN, dT - dN - dK)
    return valpha, valpha_t, vlambda, vlambda_t, dGRS, dGRS_p
def t_test(group1, group2):
    mean1 = np.mean(group1)
    mean2 = np.mean(group2)
    std1 = np.std(group1)
    std2 = np.std(group2)
    nobs1 = len(group1)
    nobs2 = len(group2)
    modified_std1 = np.sqrt(np.float32(nobs1) / np.float32(nobs1 - 1)) * std1
    modified_std2 = np.sqrt(np.float32(nobs2) / np.float32(nobs2 - 1)) * std2
    # F test for equality of variances
    f1 = np.square(modified_std1) / np.square(modified_std2)
    fp = 1 - f.cdf(f1, nobs1 - 1, nobs2 - 1)
    if fp > 0.05:
        (statistic, pvalue) = stats.ttest_ind_from_stats(
            mean1=mean1, std1=modified_std1, nobs1=nobs1,
            mean2=mean2, std2=modified_std2, nobs2=nobs2,
            equal_var=True)
    else:
        (statistic, pvalue) = stats.ttest_ind_from_stats(
            mean1=mean1, std1=modified_std1, nobs1=nobs1,
            mean2=mean2, std2=modified_std2, nobs2=nobs2,
            equal_var=False)
    return [mean1, std1, mean2, std2, fp, statistic, pvalue]
def ftest(self):
    """
    Evaluates the significance of the predictors as regards the behaviour
    of the observations by performing an F-test. In particular, the null
    hypothesis states that the predictors do not explain the variation of
    the observations at all.

    The inverse of the p-value of such an experiment (1 - p_value) is
    returned. Refer to the "fstats" method if what you are looking for is
    the value of the f-statistic rather than the p-value.
    """
    corrected_data = self.corrected_data()
    # Get the error obtained when using the full model (correctors + predictors)
    prediction_error = corrected_data - self.predicted_data()
    # Now compare the variances of the errors
    # Residual Sum of Squares for restricted model
    rss1 = (corrected_data**2).sum(axis=0)
    # Residual Sum of Squares for full model
    rss2 = (prediction_error**2).sum(axis=0)
    # Degrees of freedom
    dfc = self.df_correction()
    dfp = self.df_prediction()
    n = corrected_data.shape[0]
    df1 = dfp            # degrees of freedom of rss1 - rss2
    df2 = n - dfc - dfp  # degrees of freedom of rss2
    # Compute f-scores
    var1 = (rss1 - rss2) / df1
    var2 = rss2 / df2
    f_score = var1 / var2
    # Compute p-values
    return f_stat.cdf(f_score, df1, df2)
def test_scipy_f():
    rng = np.random.RandomState(20120407)
    x = rng.normal(size=(100)) * 4
    for m in np.arange(1, 15):
        for n in np.arange(1, 15):
            assert_array_almost_equal(f_sf(x, m, n), f.sf(x, m, n))
            assert_array_almost_equal(f_cdf(x, m, n), f.cdf(x, m, n))
def f_test(chi1, df1, chi2, df2, red_chi=True):
    """
    F Test to compare hypothesis 1 against hypothesis 2.

    Returns the significance that hypothesis 1 is more probable than
    hypothesis 2, i.e. if close to one, hypothesis one is preferred.

    Parameters
    ----------
    chi1: n-dim array / scalar, chi^2 value of first hypothesis test
    df1: n-dim array / scalar, degrees of freedom of first hypothesis test
    chi2: n-dim array / scalar, chi^2 value of second hypothesis test
    df2: n-dim array / scalar, degrees of freedom of second hypothesis test
    red_chi: if True, F-test is calculated for reduced chi^2 values

    Returns
    -------
    p-value of F-test (float)
    """
    # if chi1/df1 > chi2/df2:
    #     prob = 2. * f.cdf(chi1/df1, chi2/df2, df1, df2)
    # else:
    #     prob = 2. * f.cdf(chi2/df2, chi1/df1, df2, df1)
    if red_chi:
        fval = (chi1 / df1) / (chi2 / df2)
    else:
        fval = chi1 / chi2
    prob = 2. * f.cdf(fval, df1, df2)
    if prob > 1.:
        return 2. - prob
    else:
        return prob
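# Illustrative call of the chi^2-based f_test above (made-up numbers).
# Comparable reduced chi^2 values yield a result near one; a strong
# disparity in either direction drives the returned value toward zero.
print(f_test(chi1=96.0, df1=96, chi2=99.0, df2=98, red_chi=True))   # near 1
print(f_test(chi1=180.0, df1=96, chi2=99.0, df2=98, red_chi=True))  # small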
def fisher():
    global DisYs
    global F_val
    global F_cr
    AvDisYs = sum(DisYs) / len(DisYs)
    Sad = 0
    for dis in DisYs:
        Sad += dis * (m - 1)
    Sad = Sad * m / (N - d)
    F_val = Sad / AvDisYs
    x_vec = [i * 0.001 for i in range(int(10 / 0.001))]
    F_cr = None
    for i in x_vec:
        if abs(f.cdf(i, N - d, f3) - p) < 0.0001:
            F_cr = i
            break
    if not F_cr:
        print("\nSomething went wrong.\n"
              "Unable to calculate critical value for Fisher's test")
    elif F_cr >= F_val:
        print("\nF = {}\t\t\tF_cr = {}\t\t\tF =< F_cr\n"
              "According to Fisher's F-test the model is adequate to the original."
              .format(F_val, F_cr))
        return True
    else:
        print("\nF = {}\t\t\tF_cr = {}\t\t\tF > F_cr\n"
              "According to Fisher's F-test the model is not adequate to the original."
              .format(F_val, F_cr))
        return False
def F_stat(multarray, labels, cdf=True):
    """
    Given a multidimensional array multarray and a set of trial labels
    (0 and 1) corresponding to the 0th axis of multarray, return an array
    of cdf values calculated from the F distribution that represents the
    ratio of means of the two label groups along the 0th dimension.
    If cdf is False, return the F statistic map.
    """
    lls = np.array(labels)  # make sure this is an array
    arr0 = multarray[lls == 0]
    arr1 = multarray[lls == 1]
    # if each element of arr0 is chi2(1), then the mean of d such
    # arrays is chi2(d)/d, and a ratio of such variables is F(d1, d2)
    chi2n = np.nanmean(arr0, axis=0)
    chi2d = np.nanmean(arr1, axis=0)
    # calculate degrees of freedom: assume 2 per pixel per trial
    nu = 2
    dfn = nu * np.sum(~np.isnan(arr0), axis=0)
    dfd = nu * np.sum(~np.isnan(arr1), axis=0)
    Fmap = chi2n / chi2d
    if cdf:
        # calculate cdf
        return fdist.cdf(Fmap, dfn, dfd)
    else:
        # return statistic itself
        return Fmap
def assert_equality_in_groups(results, alpha=0.05, groups="groups",
                              test_var="test_var"):
    data = pd.DataFrame(results)
    means_models = data.groupby(groups).agg({test_var: np.mean})[test_var]
    grand_mean = data[test_var].mean()
    n = len(data)
    n_models = len(means_models)
    # Degrees of freedom
    df_models = n_models - 1  # Numerator
    df_error = n - n_models   # Denominator
    df_total = df_models + df_error
    # Sums of Squares
    ss_total = sum(data[test_var].map(lambda x: (x - grand_mean)**2))
    ss_error = sum(data.apply(
        lambda x: (x[test_var] - means_models[x[groups]])**2, axis=1))
    # ss_models = ss_total - ss_error
    ss_models = sum(means_models.map(lambda x: (x - grand_mean)**2) * (n / n_models))
    # Mean Square (Variance)
    ms_models = ss_models / df_models
    ms_error = ss_error / df_error
    # F Statistic
    f = ms_models / ms_error
    p = 1. - F.cdf(f, df_models, df_error)
    assert p >= alpha, \
        "There is statistical evidence that the measure and the std measure " \
        "differ for alpha=%.3f:\n %s\n\nANOVA table\n%s" % (alpha, data, tabulate([
            ["Source of Variation", "DF", "SS", "MS", "F", "p-value"],
            [groups, "%d" % df_models, "%.4f" % ss_models, "%.4f" % ms_models,
             "%.4f" % f, "%.4f" % p],
            ["Error", "%d" % df_error, "%.4f" % ss_error, "%.4f" % ms_error, "", ""],
            ["Total", "%d" % df_total, "%.4f" % ss_total, "", "", ""]
        ]))
def overall_anova(Xin, Yin):
    '''
    Xin : 2-D array
    Yin : 1-D array
    '''
    n = np.shape(Xin)[0]  # Number of samples
    p = np.shape(Xin)[1]  # Number of regression parameters
    X = np.hstack((np.vstack(np.ones(n)), Xin))
    Y = Yin
    # Estimated regression coefficients: as a 1-D array and duplicated
    # so each row gives all the estimated regression coefficients:
    beta = np.matmul(np.matmul(inv(np.matmul(X.T, X)), X.T), Y)
    beta_n = np.reshape(np.resize(beta, np.size(beta) * n), (n, np.size(beta)))
    # Fitted response values (Yhat) and sums of squares:
    Yhat = np.sum(beta_n * X, axis=1)
    SSR = np.sum((Yhat - np.mean(Yin))**2)
    SSE = np.sum((Yin - Yhat)**2)
    SSTO = np.sum((Yin - np.mean(Yin))**2)
    # Mean squares:
    MSR = SSR / (p - 1)
    MSE = SSE / (n - p)
    # F-statistic and p-value:
    f_statistic = MSR / MSE
    p_value = 1.0 - f.cdf(f_statistic, p - 1, n - p)
    return SSR, p - 1, MSR, f_statistic, p_value, SSE, n - p, MSE, SSTO, n - 1
def hotelling_pval(X, mu):
    xbar = np.mean(X, axis=0)
    W = np.cov(X.T)
    n, p = X.shape
    t2 = n * np.dot(xbar - mu, np.linalg.solve(W, (xbar - mu).T))
    fstat = (n - p) * t2 / p / (n - 1)
    return 1 - f.cdf(fstat, p, n - p)
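# Sketch of hotelling_pval above on simulated data, assuming numpy as np
# and scipy.stats.f as f are imported. With H0 true the p-value is roughly
# uniform on (0, 1).
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(40, 3))  # 40 observations of a 3-dimensional variable
print(hotelling_pval(X, mu=np.zeros(3)))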
def anova_oneway(data):
    k = len(data)
    assert k > 1
    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)
    assert n > k
    grand_mean = sum(group_mean * group_sz
                     for group_mean, group_sz in zip(group_means, group_szs)) / n
    sst = sum(sum((y - grand_mean)**2 for y in group) for group in data)
    ssg = sum((group_mean - grand_mean)**2 * group_sz
              for group_mean, group_sz in zip(group_means, group_szs))
    sse = sst - ssg
    dfg = k - 1
    dfe = n - k
    msg = ssg / dfg
    mse = sse / dfe
    f_value = msg / mse
    p = 1 - f.cdf(f_value, dfg, dfe)
    return f_value, dfg, dfe, p
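# Usage sketch for anova_oneway above (assumes `from statistics import mean`
# and `from scipy.stats import f`, as the function requires). scipy's
# f_oneway serves as an independent cross-check.
from scipy.stats import f_oneway

g1, g2, g3 = [5.1, 4.9, 6.0], [5.5, 6.1, 6.4], [7.0, 6.6, 7.2]
print(anova_oneway([g1, g2, g3]))
print(f_oneway(g1, g2, g3))  # F value and p-value should match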
def f_test(X, y, beta, alpha):
    ######## PERFORM F-TEST ########
    # INPUT
    #   X: n by k (n = # of observations, k = # of input variables)
    #   y: output target
    #   beta: vector of coefficients estimated by do_linear_regression;
    #         beta[0] is the intercept, the remaining elements correspond
    #         to the variables in X
    #   alpha: significance level
    # OUTPUT
    #   f: F-test statistic of the model
    #   pvalue: p-value of the F-test
    #   decision: F-test result
    #     True  = reject null hypothesis
    #     False = accept null hypothesis
    n, p = X.shape
    MSR = cal_SS(X, y, beta)[1] / p
    MSE = cal_SS(X, y, beta)[2] / (n - p - 1)
    f = MSR / MSE
    pvalue = 1 - fdist.cdf(f, p, n - p - 1)
    if pvalue < alpha:
        decision = True
    else:
        decision = False
    return (f, pvalue, decision)
def ANOVA(m_list, std_list, n_list, verbose=False):
    # m_list = list of means
    # std_list = list of std devs
    # n_list = list of number of elements in each sample
    df1 = len(n_list) - 1
    m_list = np.asarray(m_list)
    std_list = np.asarray(std_list)
    n_list = np.asarray(n_list)
    df2 = np.sum(n_list) - df1 - 1
    x_hat = np.sum(n_list * m_list) / float(np.sum(n_list))
    MS_error = np.sum(n_list * np.square(std_list)) / float(df2)
    MS_group = np.sum(n_list * np.square(m_list - x_hat)) / float(df1)
    F = MS_group / MS_error
    p = 1 - f.cdf(F, df1, df2)
    if verbose:
        print('\n\n')
        print('ANOVA Summary:')
        print('df1 =', df1)
        print('df2 =', df2)
        print('SS_group =', np.sum(n_list * np.square(m_list - x_hat)))
        print('SS_error =', np.sum(n_list * np.square(std_list)))
        print('MS_group =', MS_group)
        print('MS_error =', MS_error)
        print('F =', F)
        print('p-value =', p)
        print('\n\n')
    return F, p
def anova_oneway(data):
    """One-way ANOVA"""
    k = len(data)  # number of groups
    assert k > 1, 'need more than one group'
    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)  # total number of observations across all groups
    assert n > k
    group_mean = sum(group_mean * group_sz
                     for group_mean, group_sz in zip(group_means, group_szs)) / n
    sst = np.sum((np.array(data) - group_mean)**2)
    ssg = ((np.array(group_means) - group_mean)**2).dot(np.array(group_szs))
    sse = np.sum((np.array(data) - np.array(group_means).reshape(-1, 1))**2)
    assert round(sse, 2) == round(sst - ssg, 2)
    dfg = k - 1
    dfe = n - k
    msg = ssg / dfg
    mse = sse / dfe
    f_value = msg / mse
    p = 1 - f.cdf(f_value, dfg, dfe)
    return round(f_value, 2), dfg, dfe, p
def f_test(self, mse_A, mse_min, m):
    """F test"""
    if mse_min > mse_A:
        return False
    F = mse_A / mse_min
    # Compute the confidence level from the CDF of the F distribution
    p_value = f.cdf(F, m, m)
    # Return True if the confidence level exceeds 0.95
    return p_value > 0.95
def GrangerTest(data, lag, kx, alpha=0.05):
    B0 = Var_fit(data, lag)
    B = vectorize(B0)
    Y, Z = Organise2(data, lag)
    k = np.shape(Y)[0]
    T = np.shape(Y)[1] + lag
    if len(np.shape(data)) > 2:
        x = np.shape(data)
        t = x[1] * x[2]
    else:
        t = np.shape(data)[1]
    Y_BZ = Y - np.matmul(B0, Z)
    sigma = np.matmul(Y_BZ, Y_BZ.T) / t
    del Y, Y_BZ, B0
    C = Cmatrix(k, kx, lag, 0)
    C_B = np.matmul(C, B)
    ZZ_t = np.linalg.pinv(np.matmul(Z, Z.T))
    M = np.kron(ZZ_t, sigma)
    CM = np.matmul(C, M)
    CMCinv = np.linalg.pinv(np.matmul(CM, C.T))
    d1 = np.shape(C)[0]
    del CM, M, C
    temp1 = np.matmul(C_B.T, CMCinv)
    lambdaf = np.matmul(temp1, C_B)
    d2 = T - k * lag - 1
    pvalue = 1 - f.cdf(lambdaf[0] / d1, d1, d2)[0]
    result = pvalue < alpha
    return pvalue, result, lambdaf[0] / d1
def _f_stat_raw(self):
    """Returns the raw f-stat value."""
    from scipy.stats import f

    cols = self._x.columns

    if self._nw_lags is None:
        F = self._r2_raw / (self._r2_raw - self._r2_adj_raw)

        q = len(cols)
        if 'intercept' in cols:
            q -= 1

        shape = q, self.df_resid
        p_value = 1 - f.cdf(F, shape[0], shape[1])
        return F, shape, p_value

    k = len(cols)
    R = np.eye(k)
    r = np.zeros((k, 1))

    try:
        intercept = cols.get_loc('intercept')
        R = np.concatenate((R[0:intercept], R[intercept + 1:]))
        r = np.concatenate((r[0:intercept], r[intercept + 1:]))
    except KeyError:
        # no intercept
        pass

    return math.calc_F(R, r, self._beta_raw, self._var_beta_raw,
                       self._nobs, self.df)
def quade_test(*args):
    """Found in neither scipy nor statsmodels.

    Used to determine if there is at least one treatment different from the
    others. Note that it does not tell us which treatment is different or
    how many differences there are.

    Parameters
    ----------
    args: list or numpy array, 1-D
        An array containing the observations for each treatment. In this
        instance, each arg pertains to a specific treatment, with the
        indexes of each arg pertaining to a block.

    Return
    ------
    q: float
        Our Q statistic, or a measure of if each treatment has identical effects
    p: float, 0 <= p <= 1
        The likelihood that our observed treatment effects would occur from
        a randomized block design
    """
    k = len(args)
    if k < 3:
        raise AttributeError("Quade Test not appropriate for {} levels".format(k))
    all_data = np.vstack(args).T
    b = all_data.shape[0]
    # Rank observations within each block, and rank the blocks by their range
    rank = np.apply_along_axis(rankdata, 1, all_data)
    rank_range = rankdata(np.ptp(all_data, axis=1))
    # S_ij = Q_i * (R_ij - (k + 1) / 2), per the usual (Conover) formulation
    s_ij = rank_range.reshape(-1, 1) * (rank - (k + 1) / 2)
    # Per-treatment sums over blocks
    s_j = np.sum(s_ij, axis=0)
    a_2 = np.sum(np.power(s_ij, 2))
    B = np.sum(np.power(s_j, 2)) / b
    q = (b - 1) * B / (a_2 - B)
    p = 1 - f.cdf(q, k - 1, (b - 1) * (k - 1))
    return q, p
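# Illustrative call of quade_test above (assumes numpy as np, and rankdata
# and f from scipy.stats are imported). Each argument is one treatment;
# matching indexes across arguments form a block.
import numpy as np

t1 = np.array([31.0, 49.0, 21.0, 40.0, 10.0])
t2 = np.array([67.0, 68.0, 60.0, 71.0, 32.0])
t3 = np.array([61.0, 60.0, 58.0, 73.0, 29.0])
q, p = quade_test(t1, t2, t3)
print(q, p)  # a small p suggests at least one treatment differs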
def regression_model(x, y):
    """
    Given the samples, implement and run a regression model in which
    heterozygous markers are ignored (F-test).
    """
    numerator = 0
    denominator = 0
    sample_size = len(x)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    # 1. calculate beta 1, beta 0
    for i in range(sample_size):
        numerator += (x[i] - x_mean) * (y[i] - y_mean)
        denominator += np.square(x[i] - x_mean)
    beta_1 = numerator / denominator
    beta_0 = np.mean(y) - (beta_1 * np.mean(x))
    # 2. calculate Sums of Squares
    SSR = 0
    SSE = 0
    for i in range(sample_size):
        y_hat = beta_0 + (beta_1 * x[i])
        SSR += np.square(y_hat - y_mean)
        SSE += np.square(y_hat - y[i])
    SST = SSR + SSE
    R_squared = SSR / SST
    # 3. calculate F star and p value
    MSE = SSE / (sample_size - 2)
    F = SSR / MSE
    p_val = 1 - f.cdf(F, 1, sample_size - 2)
    return F, p_val
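# Minimal sketch exercising regression_model above (assumes numpy as np and
# scipy.stats.f as f are imported). x here is a toy genotype coding and y a
# simulated trait; both are invented for illustration.
import numpy as np

rng = np.random.default_rng(2)
x = rng.integers(0, 2, size=30).astype(float)
y = 1.5 * x + rng.normal(scale=0.5, size=30)
F, p_val = regression_model(x, y)
print(F, p_val)  # large F / small p: slope significantly non-zero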
def f_test():
    """
    Compares the best fits of two models with different numbers of
    parameters on the same data set and calculates whether the
    higher-parameter model's improvement is statistically significant.
    """
    m1_chi_sq = float(input('Chi-Squared value from first model: '))
    p1 = float(input('Number of Parameters in first model: '))
    m2_chi_sq = float(input('Chi-Squared value from second model: '))
    p2 = float(input('Number of parameters in second model: '))
    data_bins = float(input('Number of data points/bins: '))
    null_limit = float(input('Confidence Limit (%): '))
    alpha = 100 - null_limit

    # Calculating the F-value
    dof1 = p2 - p1
    dof2 = data_bins - p2
    numer = (m1_chi_sq - m2_chi_sq) / dof1
    denom = m2_chi_sq / dof2
    f_value = numer / denom

    # Calculate P-value and compare to null hypothesis level of acceptance
    p_value = 1 - f.cdf(f_value, dof1, dof2)
    p_value = p_value * 100
    print('\nP-value = ', p_value, '% (probability of chance improvement)')
    if p_value >= alpha:
        print('No significant improvement in fit!')
    else:
        print('Significant improvement in fit!')
def hotelling_t2(X, Y):
    # X and Y are 3D arrays
    # dim 0: number of features
    # dim 1: number of subjects
    # dim 2: number of mesh nodes or voxels
    nx = X.shape[1]
    ny = Y.shape[1]
    p = X.shape[0]
    Xbar = X.mean(1)
    Ybar = Y.mean(1)
    Xbar = Xbar.reshape(Xbar.shape[0], 1, Xbar.shape[1])
    Ybar = Ybar.reshape(Ybar.shape[0], 1, Ybar.shape[1])
    X_Xbar = X - Xbar
    Y_Ybar = Y - Ybar
    Wx = np.einsum('ijk,ljk->ilk', X_Xbar, X_Xbar)
    Wy = np.einsum('ijk,ljk->ilk', Y_Ybar, Y_Ybar)
    W = (Wx + Wy) / float(nx + ny - 2)
    Xbar_minus_Ybar = Xbar - Ybar
    x = np.linalg.solve(W.transpose(2, 0, 1),
                        Xbar_minus_Ybar.transpose(2, 0, 1))
    x = x.transpose(1, 2, 0)
    t2 = np.sum(Xbar_minus_Ybar * x, 0)
    t2 = t2 * float(nx * ny) / float(nx + ny)
    stat = t2 * float(nx + ny - 1 - p) / (float(nx + ny - 2) * p)
    pval = 1 - np.squeeze(f_distrib.cdf(stat, p, nx + ny - 1 - p))
    return pval, t2
def my_f_compare(best_fit, new_fit):
    nonlocal called
    called += 1
    nfree = best_fit.nfree
    nfix = best_fit.nfree - new_fit.nfree
    dchi = new_fit.chisqr / best_fit.chisqr - 1.0
    return f.cdf(dchi * nfree / nfix, nfix, nfree)
def solve_f(f_value=None, f1=None, f2=None, p=None):
    max_1_none(f_value, f1, f2, p)
    if f_value is None:
        return f(f1, f2, p)
    elif p is None:
        return sympify(sci_f.cdf(float(f_value), float(f1), float(f2)))
    else:
        raise NotImplementedError("Not implemented yet - sorry")
def f_compare(ndata, nparams, new_chi, best_chi, nfix=1):
    """
    Returns the probability for two given parameter sets.
    nfix is the number of fixed parameters.
    """
    nparams = nparams + nfix
    nfree = 1.0 * (ndata - nparams)
    return f.cdf((new_chi / best_chi - 1) * nfree / nfix, nfix, nfree)
def f_compare(Ndata, Nparas, new_chi, best_chi, Nfix=1.):
    """
    Returns the probability for two given parameter sets.
    Nfix is the number of fixed parameters.
    """
    Nparas = Nparas + Nfix
    return f.cdf((new_chi / best_chi - 1) * (Ndata - Nparas) / Nfix,
                 Nfix, Ndata - Nparas)
def f_compare(ndata, nparas, new_chi, best_chi, nfix=1.):
    """
    Returns the probability for two given parameter sets.
    nfix is the number of fixed parameters.
    """
    nparas = nparas + nfix
    nfree = ndata - nparas
    nfix = 1.0 * nfix
    dchi = new_chi / best_chi - 1.0
    return f.cdf(dchi * nfree / nfix, nfix, nfree)
def make_test_data():
    # Make test data for stored data test
    # Run with:
    #   import fisher322.tests.test_fisher as tf
    #   tf.make_test_data()
    if not have_scipy:
        raise RuntimeError("Need scipy to store data")
    N = 10
    mN = 15
    nN = 15
    x = np.random.normal(size=(N,)) ** 2
    fcdf_data = np.zeros((mN, nN, N))
    for m in range(15):
        for n in range(15):
            fcdf_data[m, n][:] = f.cdf(x, m, n)
    np.savez(TEST_DATA_FNAME, fcdf_data=fcdf_data, x=x)
def global_difference(table):
    """Run an F-test on the ranks."""
    if (not table.is_summary) or (table.type != AbedTableTypes.RANKS):
        return None
    N = float(len(settings.DATASETS))
    k = float(len(settings.METHODS))
    averages = next((row for _id, row in table if _id == 'Average'), None)
    av_sq = sum([pow(float(x), 2.0) for x in averages])
    chi2 = 12.0 * N / (k * (k + 1)) * (av_sq - (k * pow(k + 1, 2.0) / 4.0))
    # this can happen when the ordering of methods is always the same
    try:
        Fstat = (N - 1.0) * chi2 / (N * (k - 1) - chi2)
    except ZeroDivisionError:
        Fstat = float('inf')
    Fprob = 1.0 - f_dist.cdf(Fstat, k - 1, (k - 1) * (N - 1))
    return Fstat, Fprob
def welch_anova(*args):
    '''
    This helper function calculates Welch's ANOVA, used when the
    homogeneity-of-variance assumption is violated.

    args here is the list of array-like data stores

    See this web link for the derived formula:
    http://www.uvm.edu/~dhowell/gradstat/psych340/Lectures/Anova/anova2.html
    '''
    # Number of groups
    k = len(args)
    total_weight = 0
    total_weighted_sum = 0
    weight_list = []
    mean_list = []
    count_list = []
    for sample in args:
        mean = sample.mean()
        mean_list.append(mean)
        var = sample.var()
        count = sample.count()
        count_list.append(count)
        weight = count / var
        weight_list.append(weight)
        total_weight += weight
        weighted_sum = weight * mean
        total_weighted_sum += weighted_sum
    weighted_grand_mean = total_weighted_sum / total_weight
    # Next, let's find Welch's F
    total_weighted_var = 0
    crazy_sum = 0
    for w, m, n in zip(weight_list, mean_list, count_list):
        # This part is used for f_stat calculation
        element = w * ((m - weighted_grand_mean) ** 2)
        total_weighted_var += element
        denom_squared_element = (1 - w / total_weight) ** 2
        crazy_element = denom_squared_element / (n - 1)
        crazy_sum += crazy_element
    f_numer = total_weighted_var / (k - 1)
    f_denom = 1 + 2 * (k - 2) * crazy_sum / (k**2 - 1)
    f_stat = f_numer / f_denom
    # Next, let's find Welch's degrees of freedom
    df = (k**2 - 1) / (3 * crazy_sum)
    # Now determine p-value from df
    pval = 1 - f.cdf(f_stat, k - 1, df)
    return f_stat, pval
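# Usage sketch for welch_anova above. Note the function calls .count() and
# .var() on each sample, which matches pandas Series (Series.var defaults
# to ddof=1), so Series rather than bare numpy arrays are passed here; the
# numbers are invented.
import pandas as pd

s1 = pd.Series([14.0, 15.2, 13.8, 16.1])
s2 = pd.Series([18.9, 21.3, 19.7, 22.0, 20.1])
s3 = pd.Series([11.2, 10.8, 12.5])
print(welch_anova(s1, s2, s3))  # (Welch F statistic, p-value)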
def main(rank_file):
    ranks = []
    N = None
    with open(rank_file, 'r') as f:
        for line in f:
            _, dsets, r = line.strip().split(',')
            if N is None:
                N = int(dsets)
            ranks.append(float(r))
    k = len(ranks)
    ranksum = np.sum(np.square(ranks))
    friedman_statistic = (12.0 * N / (k * (k + 1))) * \
        (ranksum - ((k * (k + 1)**2) / 4.0))
    f_value = ((N - 1) * friedman_statistic) / (N * (k - 1) - friedman_statistic)
    print('p-value (Friedman Statistics): %f'
          % (1.0 - chi2.cdf(friedman_statistic, k - 1)))
    print('   p-value (Iman/Davenport): %f'
          % (1.0 - fdist.cdf(f_value, k - 1, (k - 1) * (N - 1))))
    for alpha in (0.10, 0.05, 0.01):
        print('CD_%.2f: %f' % (alpha, nemenyi.critical_difference(alpha, k, N)))
def linRegStats(A, b):
    '''
    Use linear regression to solve Ax = b where A and b are known.
    Also report stats from the Wald test.
    '''
    o = ones(b.size)
    A = column_stack((A, o))
    lstsqStats = linalg.lstsq(A, b)
    x = lstsqStats[0]
    SSE = lstsqStats[1][0]
    meanResponse = sum(b) / b.size
    computedResponse = dot(A, x)
    SSR = dot(computedResponse - meanResponse,
              computedResponse - meanResponse)
    fstat = SSR / (SSE / (b.size - 2))
    p = 1.0 - f.cdf(fstat, 1, b.size - 2)
    return {'betas': x, 'F': fstat, 'p': p, 'df1': 1, 'df2': b.size - 2}
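# Usage sketch for linRegStats above, assuming its module does
# `from numpy import ones, column_stack, dot, linalg` and
# `from scipy.stats import f`. A single predictor and a noisy linear
# response are simulated for illustration.
import numpy as np

rng = np.random.default_rng(3)
A = rng.normal(size=25)
b = 2.0 * A + 1.0 + rng.normal(scale=0.3, size=25)
print(linRegStats(A, b))  # betas, F, p, df1, df2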
def F(self):
    """
    Calculate the F-statistic of the model, also used in the `summary()`
    function.

    Returns
    =======
    F, tuple(F, df-1, n-df, p-val)
        F value as defined in the notes, degrees of freedom, p value

    Notes
    =====
    How the F statistic is calculated:

    .. math:: \\frac{(y-\\bar{y})^2-RSS}{RSS}\\frac{n-d.f.}{d.f.-1}
    """
    n, df = self.model[1].shape
    rss = np.var(self.model[0]) * n
    fval = (rss - self.rss) / self.rss * (n - df) / (df - 1)
    fval = float(fval)  # convert design matrix to float
    p = f.cdf(fval, df - 1, n - df)
    return fval, df - 1, n - df, 1 - p
def compute(data):
    N = data.size
    C = len(data.columns)
    dfc = C - 1
    dfer = N - C
    dft = N - 1
    cm = data.mean()
    tm = data.sum().sum() / N
    n = data.shape[0]
    SSC = sum((tm - cm) ** 2) * n
    MSC = SSC / dfc
    SSE = ((data - cm) ** 2).sum().sum()
    MSE = SSE / dfer
    SST = ((data - tm) ** 2).sum().sum()
    F = MSC / MSE
    alpha = 0.05
    p_value = 1 - f.cdf(F, dfc, dfer)
    print(data)
    print()
    print(pandas.DataFrame(
        {'df': [dfc, dfer, dft],
         'SS': [SSC, SSE, SST],
         'MS': [MSC, MSE, ''],
         'F': [F, '', ''],
         'p value': [p_value, '', '']},
        columns=['df', 'SS', 'MS', 'F', 'p value'],
        index=['between', 'within', 'total']))
    print()
    if p_value < alpha:
        print("Reject null hypothesis")
    else:
        print("Accept null hypothesis")
    print('~~~~~~~~~')
def cfriedman(self):
    """
    Friedman test based on Conover 1999.

    This method uses Conover's recommendation for an improved version that
    compares to the F-distribution rather than the Chi-square. Generates
    the P value and distribution information.

    Parameters
    ----------
    none

    Sets property
    -------------
    self.P : float
        P value
    self.distribution : list
        String describing what distribution was used for test.
    self.pairwisePs : None
        Sets to None to prevent mismatches between testing methods

    Returns
    -------
    none
    """
    try:
        # Calculate p-value based on cdf of F-distribution for T2
        df1 = self.nts - 1
        df2 = (self.nblocks - 1) * (self.nts - 1)
        self.P = 1 - f.cdf(self.T2, df1, df2)
        self.distribution = ['f.cdf', [['df1', df1], ['df2', df2]]]
        self.pairwisePs = None
    except:
        self.P = None
        self.distribution = None
        self.pairwisePs = None
        print('Error in cfriedman')
def calc_F(R, r, beta, var_beta, nobs, df):
    """
    Computes the standard F-test statistic for linear restriction
    hypothesis testing.

    Parameters
    ----------
    R: ndarray (N x N)
        Restriction matrix
    r: ndarray (N x 1)
        Restriction vector
    beta: ndarray (N x 1)
        Estimated model coefficients
    var_beta: ndarray (N x N)
        Variance covariance matrix of regressors
    nobs: int
        Number of observations in model
    df: int
        Model degrees of freedom

    Returns
    -------
    F value, (q, df_resid), p value
    """
    from scipy.stats import f

    hyp = np.dot(R, beta.reshape(len(beta), 1)) - r
    RSR = np.dot(R, np.dot(var_beta, R.T))
    q = len(r)
    F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q
    p_value = 1 - f.cdf(F, q, nobs - df)
    return F, (q, nobs - df), p_value
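# Hedged example for calc_F above: testing H0 that the second coefficient
# of a toy 2-regressor model is zero. R, r, beta, and var_beta are invented;
# `inv` (numpy.linalg.inv) and numpy as np are assumed imported, as in the
# function's module.
import numpy as np

R = np.array([[0.0, 1.0]])  # restrict only the second coefficient
r = np.array([[0.0]])
beta = np.array([2.0, 0.5])
var_beta = np.array([[0.04, 0.0],
                     [0.0, 0.09]])
print(calc_F(R, r, beta, var_beta, nobs=100, df=2))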
def anova_f(s, whichCat, nCategory, SigLevel):
    import numpy as np
    from scipy.stats import f

    Xmean = np.mean(s, axis=2)
    df_b = nCategory - 1
    df_w = s.shape[2] - nCategory
    SS_b = np.zeros((s.shape[0], s.shape[1]))
    SS_w = np.zeros((s.shape[0], s.shape[1]))
    for kk in range(nCategory):
        Xk = s[:, :, whichCat[:, kk] > 0]
        Xkmean = np.mean(Xk, axis=2)
        dtemp = Xkmean - Xmean
        SS_b = SS_b + Xk.shape[2] * (dtemp * dtemp)
        for i in range(Xk.shape[2]):
            temp = Xk[:, :, i] - Xkmean
            SS_w = SS_w + (temp * temp)
    SS_t = SS_b + SS_w
    MS_b = SS_b / df_b
    MS_w = SS_w / df_w
    XF = np.divide(MS_b, MS_w)
    PvalF = np.ones((XF.shape[0], XF.shape[1]))
    for ii in range(XF.shape[0]):
        for jj in range(XF.shape[1]):
            PvalF[ii, jj] = 1 - f.cdf(XF[ii, jj], df_b, df_w)
    SigF = 1.0 * (PvalF < SigLevel)
    nSigS = np.sum(SigF, axis=0)
    nSigT = np.sum(SigF, axis=1)
    statOut = {'F': XF, 'Pval': PvalF, 'Sig': SigF, 'df_b': df_b, 'df_w': df_w,
               'SS_b': SS_b, 'SS_w': SS_w, 'SS_t': SS_t, 'MS_b': MS_b,
               'MS_w': MS_w, 'nSigS': nSigS, 'nSigT': nSigT}
    return statOut
def separateRegression(response, predictor, sepData, bpChoices, predictorName,
                       equaltestid, equaltestid2):
    results = np.zeros((len(bpChoices) - 1, 4))
    print(bpChoices)
    chosencriterion = 'r2'  # max(r2) is the same criterion as min(sse)
    for bpid in range(len(bpChoices) - 1):
        print(bpid)
        (responseLeft, responseRight, predictorLeft, predictorRight,
         dataleftIdx, datarightIdx) = separateData(response, predictor,
                                                   sepData, bpChoices[bpid])
        leftmodel = ols.ols(responseLeft, predictorLeft, 'y', predictorName)
        rightmodel = ols.ols(responseRight, predictorRight, 'y', predictorName)
        results[bpid, 0] = bpid
        if chosencriterion == 'r2':
            results[bpid, 1] = leftmodel.R2adj
            results[bpid, 2] = rightmodel.R2adj
            # results[bpid, 3] = 1 - (leftmodel.e.var() + rightmodel.e.var()) \
            #     / (leftmodel.y.var() + rightmodel.y.var())
            results[bpid, 3] = calculateR2(leftmodel, rightmodel, np.mean(response))
            # results[bpid, 3] = (leftmodel.R2 + rightmodel.R2) / 2
        elif chosencriterion == 'sse':
            results[bpid, 1] = leftmodel.sse * leftmodel.df_e
            results[bpid, 2] = rightmodel.sse * rightmodel.df_e
            results[bpid, 3] = results[bpid, 1] + results[bpid, 2]
            # yhatL = np.dot(leftmodel.x, leftmodel.b)
            # print(results[bpid, 1], np.sum((responseLeft - yhatL)**2))
    print(results)
    if chosencriterion == 'r2':
        optBP = int(results[np.argmax(results, axis=0)[-1], 0])
    elif chosencriterion == 'sse':
        optBP = int(results[np.argmin(results, axis=0)[-1], 0])
    (responseLeft, responseRight, predictorLeft, predictorRight,
     dataleftIdx, datarightIdx) = separateData(response, predictor,
                                               sepData, bpChoices[optBP])
    leftmodel = ols.ols(responseLeft, predictorLeft, 'y', predictorName)
    rightmodel = ols.ols(responseRight, predictorRight, 'y', predictorName)
    # equaltestid = int(equaltestid)
    if equaltestid[0] > -1:
        temppredictorLeft = predictorLeft
        temppredictorLeft[:, equaltestid[1]] = (temppredictorLeft[:, equaltestid[0]]
                                                + temppredictorLeft[:, equaltestid[1]])
        temppredictorLeft = np.delete(temppredictorLeft, equaltestid[0], 1)
        temppredictorRight = predictorRight
        temppredictorRight[:, equaltestid[1]] = (temppredictorRight[:, equaltestid[0]]
                                                 + temppredictorRight[:, equaltestid[1]])
        temppredictorRight = np.delete(temppredictorRight, equaltestid[0], 1)
        temppredictorName = np.delete(predictorName, equaltestid[0], None)
        templeftmodel = ols.ols(responseLeft, temppredictorLeft, 'y', temppredictorName)
        temprightmodel = ols.ols(responseRight, temppredictorRight, 'y', temppredictorName)
        fleft = ((leftmodel.R2 - templeftmodel.R2)
                 * (leftmodel.nobs - len(predictorName) - 2) / (1 - leftmodel.R2))
        fright = ((rightmodel.R2 - temprightmodel.R2)
                  * (rightmodel.nobs - len(predictorName) - 2) / (1 - rightmodel.R2))
        pleft = 1 - f.cdf(fleft, 1, leftmodel.nobs - len(predictorName) - 2)
        pright = 1 - f.cdf(fright, 1, rightmodel.nobs - len(predictorName) - 2)
    if equaltestid2[0] > -1:
        temppredictorLeft = predictorLeft
        temppredictorLeft[:, equaltestid2[1]] = (temppredictorLeft[:, equaltestid2[0]]
                                                 + temppredictorLeft[:, equaltestid2[1]])
        temppredictorLeft = np.delete(temppredictorLeft, equaltestid2[0], 1)
        temppredictorRight = predictorRight
        temppredictorRight[:, equaltestid2[1]] = (temppredictorRight[:, equaltestid2[0]]
                                                  + temppredictorRight[:, equaltestid2[1]])
        temppredictorRight = np.delete(temppredictorRight, equaltestid2[0], 1)
        temppredictorName = np.delete(predictorName, equaltestid2[0], None)
        templeftmodel = ols.ols(responseLeft, temppredictorLeft, 'y', temppredictorName)
        temprightmodel = ols.ols(responseRight, temppredictorRight, 'y', temppredictorName)
        fleft = ((leftmodel.R2 - templeftmodel.R2)
                 * (leftmodel.nobs - len(predictorName) - 2) / (1 - leftmodel.R2))
        fright = ((rightmodel.R2 - temprightmodel.R2)
                  * (rightmodel.nobs - len(predictorName) - 2) / (1 - rightmodel.R2))
        pleft2 = 1 - f.cdf(fleft, 1, leftmodel.nobs - len(predictorName) - 2)
        pright2 = 1 - f.cdf(fright, 1, rightmodel.nobs - len(predictorName) - 2)
    yhatL = np.dot(leftmodel.x, leftmodel.b)
    yhatR = np.dot(rightmodel.x, rightmodel.b)
    yhat = np.zeros(len(response))
    for i in range(len(yhatL)):
        yhat[dataleftIdx[i]] = yhatL[i]
    for i in range(len(yhatR)):
        yhat[datarightIdx[i]] = yhatR[i]
    yhat = np.exp(yhat)
    fileLoc = filepath + 'separateR_model2_y_hat.csv'
    # np.savetxt(fileLoc, yhat, delimiter=',', fmt='%s')
    print('Optimal Index:', optBP)
    print('Optimal changepoint: ', bpChoices[optBP], ' exp value: ',
          np.exp(bpChoices[optBP]), ' with R2 = ',
          calculateR2(leftmodel, rightmodel, np.mean(response)))
    print('----------------------------- left model -----------------------------')
    print(leftmodel.summary())
    print('----------------------------- right model -----------------------------')
    print(rightmodel.summary())
    print('Optimal Index:', optBP)
    print('Optimal changepoint: ', bpChoices[optBP], ' exp value: ',
          np.exp(bpChoices[optBP]), ' with R2 = ', results[optBP, -1])
    outputstring = 'before bp'
    for i in range(len(predictorName) + 1):
        outputstring += (', b' + str(i) + ' = ' + "%.2f" % leftmodel.b[i]
                         + '(' + "%.3f" % leftmodel.se[i] + ')')
    outputstring += ', with R2 = ' + "%.4f" % leftmodel.R2adj
    if equaltestid[0] > -1:
        outputstring += ', f1 <> f2 with pvalue = ' + "%.2f" % pleft
    if equaltestid2[0] > -1:
        outputstring += ', f12 <> f22 with pvalue = ' + "%.2f" % pleft2
    print(outputstring)
    outputstring = 'after bp'
    for i in range(len(predictorName) + 1):
        outputstring += (', b' + str(i) + ' = ' + "%.2f" % rightmodel.b[i]
                         + '(' + "%.3f" % rightmodel.se[i] + ')')
    outputstring += ', with R2 = ' + "%.4f" % rightmodel.R2adj
    if equaltestid[0] > -1:
        outputstring += ', f1 <> f2 with pvalue = ' + "%.2f" % pright
    if equaltestid2[0] > -1:
        outputstring += ', f12 <> f22 with pvalue = ' + "%.2f" % pright2
    print(outputstring)
    # calpredictedvalue(predictor, bpChoices[optBP],
    #                   zip(leftmodel.b, rightmodel.b), 'exp_inoutflow_model2B.csv')
    # calconfidenceinterval(predictorLeft, predictorRight,
    #                       [leftmodel.sse, rightmodel.sse], response, predictor,
    #                       bpChoices[optBP], zip(leftmodel.b, rightmodel.b),
    #                       'ci_model2B.csv')
    return results, yhat
for fiber in fiber_list:
    mod = Model(lambda x, a, b: a * x + b)
    slope_displ = mod.fit(fiber.binned_exp['static_fr_mean'],
                          x=fiber.binned_exp['displ_mean'],
                          a=1, b=1).best_values['a']
    slope_force = mod.fit(fiber.binned_exp['static_fr_mean'],
                          x=fiber.binned_exp['force_mean'],
                          a=1, b=1).best_values['a']
    slope_displ_list.append(slope_displ)
    slope_force_list.append(slope_force)
slope_displ_arr = np.array(slope_displ_list)
slope_force_arr = np.array(slope_force_list)
sensitivity_df = pd.DataFrame(
    np.c_[slope_displ_arr, slope_force_arr],
    index=['#' + str(i + 1) for i in range(slope_displ_arr.size)],
    columns=['Displacement sensitivity (Hz/mm)', 'Force sensitivity (Hz/mN)'])
for column in sensitivity_df.columns:
    sensitivity_df[column[:5] + '_normalized'] = \
        sensitivity_df[column] / sensitivity_df[column].median()
sensitivity_df.transpose().to_excel('./csvs/sensitivity.xlsx')
print(sensitivity_df.var())
from scipy.stats import f, bartlett, levene
print(f.cdf(sensitivity_df['Displ_normalized'].var()
            / sensitivity_df['Force_normalized'].var(),
            sensitivity_df.shape[0], sensitivity_df.shape[0]))
print(bartlett(sensitivity_df['Displ_normalized'],
               sensitivity_df['Force_normalized']))
print(levene(sensitivity_df['Displ_normalized'],
             sensitivity_df['Force_normalized']))
# %%
maxFZ = geneFZ(Lruns_CLT, N, xsensors_m, Fs_Hz, range_azimuth_deg,
               range_elevation_deg, range_velocity_mps)
maxFF = geneFF(Lruns_CLT, N, xsensors_m, Fs_Hz, range_azimuth_deg,
               range_elevation_deg, range_velocity_mps)
FF = maxFsimul[0:ir + 1, 0]
# == compute the p-value with the asymptotic distribution (not independent)
ppv = pvalunderH0(FF, N, xsensors_m, Fs_Hz, range_azimuth_deg,
                  range_elevation_deg, range_velocity_mps)
# p-values with the limiting Gaussian independent and F independent
ppvG = 1 - norm.cdf(FF, 1.0, sqrt(2.0 * M / (M - 1.0) / N)) ** Q
ppvF = 1 - f.cdf(FF, N, N * (M - 1)) ** Q
# pdf of the max of the limiting Gaussian independent and F independent
linx = linspace(0.69, 1.3, 200)
sigmaGlim = sqrt(2.0 * M / (M - 1.0) / N)
nu1 = N
nu2 = N * (M - 1)
pdffromF = f.pdf(linx, nu1, nu2)
pdffromFind = Q * pdffromF * (f.cdf(linx, nu1, nu2) ** (Q - 1))
pdffromGind = (Q * norm.pdf(linx, 1.0, sigmaGlim)
               * (norm.cdf(linx, 1.0, sigmaGlim) ** (Q - 1)))
dirfigsave = '/Users/maurice/etudes/stephenA/propal2/figures/'
# %%
def get_result_simple(Fst, d):
    # NOTE: q (the numerator degrees of freedom) is expected to be defined
    # in the enclosing scope
    return Fst, (q, d), 1 - f.cdf(Fst, q, d)
chi2_m = np.sum((BimgSky - polyValues)**2)

## Loop through polynomial degrees and store Ftest result
alpha = 0.05  # Use a 5% "random probability" as a cutoff
# sigma = 3.0  # Use a 3 sigma requirement for so much data
# alpha = (1 - norm.cdf(sigma))
Ftests = []
coeffs = []
for deg in range(1, 6):
    dof = numSamp - deg - 1
    polyCoeffs = np.polyfit(BimgTimes, BimgSky, deg)
    coeffs.append(polyCoeffs)
    polyValues = np.polyval(polyCoeffs, BimgTimes)
    chi2_m1 = np.sum((BimgSky - polyValues)**2)
    Fchi = (chi2_m - chi2_m1) / (chi2_m1 / dof)
    prob = 1.0 - f.cdf(Fchi, 1, dof)
    Ftests.append(prob < alpha)
    ## Store chi2_m1 in chi2_m for use in next iteration
    chi2_m = chi2_m1

# Find the lowest order FAILED F-test to get highest order good fit
bestDegree = np.min(np.where([not test for test in Ftests]))

# Fit the best fitting polynomial
polyCoeffs = np.polyfit(BimgTimes, BimgSky, bestDegree)
polyValues = np.polyval(polyCoeffs, BimgTimes)

# Subtract the best fitting polynomial and save for use in the FFT
BimgSky1 = BimgSky - polyValues
def fcdf(x, d1, d2):
    result = f.cdf(x, d1, d2)
    if isnan(result):
        return betainc(d1 / 2., d2 / 2., d1 * x * 1. / (d1 * x + d2))
    return result
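# fcdf above falls back to the regularized incomplete beta function when
# scipy returns NaN; the identity F_cdf(x; d1, d2) = I_{d1*x/(d1*x+d2)}(d1/2, d2/2)
# can be checked directly (same imports as the function: f from scipy.stats,
# betainc from scipy.special).
from scipy.stats import f
from scipy.special import betainc

x, d1, d2 = 2.5, 4, 10
print(f.cdf(x, d1, d2))
print(betainc(d1 / 2., d2 / 2., d1 * x / (d1 * x + d2)))  # same value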