def __unconditional_power_simpson_term(self, fcrit, df1, df2, t):
    """Evaluate the integrand used by the Simpson's-rule unconditional power.

    NOTE(review): the original comment attributes this integral to Glueck and
    Muller but leaves the year and equation number unspecified -- confirm the
    citation.

    Parameters
    ----------
    fcrit : float
        Critical value of the F statistic.
    df1 : float
        Numerator degrees of freedom.
    df2 : float
        Denominator degrees of freedom.
    t : float
        Noncentrality value at which the integrand is evaluated.

    Returns
    -------
    float
        ``self.cdf(t) * (F1 - F2)`` where ``F1`` is the noncentral F CDF with
        (df1, df2) degrees of freedom at ``fcrit`` and ``F2`` is the noncentral
        F CDF with (df1 + 2, df2) degrees of freedom at
        ``fcrit * df1 / (df1 + 2)``, both with noncentrality ``t``.
        Returns ``0`` when ``self.H0`` and ``self.H1`` agree to 12 decimals.

    Raises
    ------
    GlimmpseValidationException
        If ``self.H1`` is smaller than ``self.H0``.
    """
    # check bounds H0, H1
    if self.H1 < self.H0:
        # The message previously claimed the opposite of the tested condition.
        raise GlimmpseValidationException("H1 is less than H0")
    if round(self.H1, 12) == round(self.H0, 12):
        # Degenerate (zero-width) interval: nothing to integrate.
        return 0

    t1 = special.ncfdtr(df1, df2, t, fcrit)
    t2_fcrit = (fcrit * df1) / (df1 + 2)
    t2 = special.ncfdtr(df1 + 2, df2, t, t2_fcrit)
    return self.cdf(t) * (t1 - t2)
def _tiku_approximation(df1, df2, fcrit, noncen):
    """Tiku approximation (best approximation).

    Derives an adjusted numerator degrees of freedom, a scale and a shift
    from ``df1``, ``df2`` and ``noncen``, rescales ``fcrit`` with them and
    evaluates ``special.ncfdtr`` with zero noncentrality at the rescaled
    critical value.

    Parameters
    ----------
    df1 : float
        Numerator degrees of freedom.
    df2 : float
        Denominator degrees of freedom.
    fcrit : float
        Critical value of the F statistic.
    noncen : float
        Noncentrality parameter.

    Returns
    -------
    tuple
        ``(prob, Constants.FMETHOD_TIKU)``.
    """
    shifted_df1 = df1 + noncen
    reduced_df2 = df2 - 2

    h = (2 * shifted_df1**3
         + 3 * shifted_df1 * (df1 + 2 * noncen) * reduced_df2
         + (df1 + 3 * noncen) * reduced_df2**2)
    k = shifted_df1**2 + reduced_df2 * (df1 + 2 * noncen)

    # Adjusted numerator degrees of freedom, truncated to an integer.
    root_term = (h**2 / (h**2 - 4 * k**3))**0.5
    adjusted_df1 = math.floor(0.5 * reduced_df2 * (root_term - 1))

    # Scale (c) and shift (b) applied to the critical value.
    scale = (adjusted_df1 / df1) / (2 * adjusted_df1 + df2 - 2) * (h / k)
    shift = -df2 / reduced_df2 * (scale - 1 - noncen / df1)
    adjusted_fcrit = (fcrit - shift) / scale

    return special.ncfdtr(adjusted_df1, df2, 0, adjusted_fcrit), Constants.FMETHOD_TIKU
def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one way ANOVA.

    The power is evaluated with a two-sided rejection region on the
    noncentral F distribution: below the ``alpha / 2`` quantile or above the
    ``1 - alpha / 2`` quantile of the central F distribution.

    Parameters
    ----------
    samples : array-likes
        One array of observations per group.
    counts : int or array-like
        Keyword argument (required). Number of observations per group for
        which the power is evaluated.
    alpha : float
        Keyword argument, default 0.05. Significance level.

    Returns
    -------
    ndarray
        Power for each entry of ``counts`` (a 0-d array for a scalar
        ``counts``). NaN results from the noncentral F CDF are replaced by 1.

    Raises
    ------
    ValueError
        If ``counts`` is not supplied.
    """
    # Checks the keywords. ``dict.update`` works on both Python 2 and 3,
    # unlike the former ``kwargs.iteritems()``.
    kwds = {'counts': None, 'alpha': 0.05}
    kwds.update(kwargs)
    if kwds['counts'] is None:
        raise ValueError('counts is undefined!')
    # Accept scalars and lists as well as arrays.
    counts = np.asarray(kwds['counts'])
    alpha = kwds['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    # Determines the group sizes and characteristics
    k = len(samples)
    grand_mean = np.concatenate(samples).mean()
    df1 = k - 1
    df2 = k * (counts - 1)

    # Calculates the noncentrality parameter
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / sample.std())
        for sample in samples
    ]).sum() * counts

    fl = stats.f.ppf(alpha / 2, df1, df2)
    fu = stats.f.ppf(1 - alpha / 2, df1, df2)

    # Calculates the power using the non-central F distribution
    power = (1 - sp.ncfdtr(df1, df2, noncentrality, fu) +
             sp.ncfdtr(df1, df2, noncentrality, fl))

    # The non central F distribution does not return a value of 1, so NaNs
    # are replaced with 1. ``np.where`` also handles scalar results, which
    # do not support item assignment.
    return np.where(np.isnan(power), 1.0, power)
def unconditional_power_simpson(self, fcrit, df1, df2):
    """Calculates unconditional power using integration by Simpson's rule.

    The integrand ``self.__unconditional_power_simpson_term`` is integrated
    over ``[self.H0, self.H1]`` with the composite Simpson rule. The number
    of sub-intervals is doubled until two successive estimates agree to an
    absolute or relative tolerance of 1e-6, or 64**2 sub-intervals are
    reached.

    Parameters
    ----------
    fcrit : float
        Critical value of the F statistic.
    df1 : float
        Numerator degrees of freedom.
    df2 : float
        Denominator degrees of freedom.

    Returns
    -------
    tuple
        ``(prob, None)`` where ``prob`` is the noncentral F CDF at ``fcrit``
        with noncentrality ``self.H1`` plus half of the Simpson estimate.
    """
    lower, upper = self.H0, self.H1
    t1 = special.ncfdtr(df1, df2, upper, fcrit)

    # set up properties for integration by Simpson's rule
    tolerance = 1e-6
    max_intervals = 64 ** 2
    n = 2
    old_prob = 1

    while True:
        h = (upper - lower) / n
        # Composite Simpson needs n + 1 nodes, including the upper endpoint;
        # the endpoint was previously dropped by iterating over range(n).
        fx = [
            self.__unconditional_power_simpson_term(
                fcrit=fcrit, df1=df1, df2=df2, t=lower + i * h)
            for i in range(n + 1)
        ]
        # Weights: 1 at both ends, 4 at odd nodes, 2 at interior even nodes.
        res = fx[0] + fx[n] + 4 * sum(fx[1:n:2]) + 2 * sum(fx[2:n:2])
        res *= h / 3
        t2 = res / 2
        prob = t1 + t2

        # check delta
        delta = math.fabs(prob - old_prob)
        r_limit = tolerance * (math.fabs(prob) + math.fabs(old_prob)) * 0.5
        if n >= max_intervals or delta <= r_limit or delta <= tolerance:
            return prob, None

        old_prob = prob
        n *= 2
def dci_skeleton(
        X1,
        X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
        Recomputed from X1 when either rh1 or rh2 is None.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
        Recomputed from X2 when either rh1 or rh2 is None.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended
        max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        If True and difference_ug is non-empty, wrap the edge loop in a tqdm progress bar.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        rh1 = RegressionHelper(gauss_ci_suffstat(X1))
        rh2 = RegressionHelper(gauss_ci_suffstat(X2))

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    def invariance_pvalue(target, predictors, tested):
        """p-value for invariance of the coefficient of `tested` when `target` is regressed on `predictors`."""
        # lam is forwarded for both directions; the second direction
        # previously fell back to the regression default.
        beta1, var1, precision1 = rh1.regression(target, predictors, lam=lam)
        beta2, var2, precision2 = rh2.regression(target, predictors, lam=lam)
        ix = predictors.index(tested)
        stat = (beta1[ix] - beta2[ix]) ** 2 * \
            inv(var1 * precision1 / (n1 - 1) + var2 * precision2 / (n2 - 1))[ix, ix]
        # Both predictor lists of an edge have the same length, so this
        # equals n1 + n2 - len(cond_set_i) - len(cond_set_j).
        return 1 - ncfdtr(1, n1 + n2 - 2 * len(predictors), 0, stat)

    skeleton = {(i, j) for i, j in difference_ug}

    edges = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in edges:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # i regressed on (cond_set, j): test the coefficient of j
            pval_i = invariance_pvalue(i, cond_set_i, j)
            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            if pval_i > alpha:
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                # discard: a duplicated edge in difference_ug must not raise KeyError
                skeleton.discard((i, j))
                break
            if verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # j regressed on (cond_set, i): test the coefficient of i
            pval_j = invariance_pvalue(j, cond_set_j, i)
            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            if pval_j > alpha:
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.discard((i, j))
                break
            if verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
def dci_skeleton_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimates one difference-DAG skeleton per significance level in a grid.

    Each candidate edge is tested once per conditioning set; the resulting
    p-value is compared against every alpha in the grid, so all skeletons are
    produced in a single pass.

    Parameters
    ----------
    X1: array
        First dataset. Only used when rh1 or rh2 is None.
    X2: array
        Second dataset. Only used when rh1 or rh2 is None.
    alpha_skeleton_grid: list, default = [0.1, 0.5]
        Significance levels, each in [0, 1]. The default list is never mutated.
    max_set_size: int, default = 3
        Maximum conditioning set size passed to powerset.
    difference_ug: list
        List of (i, j) tuples, the candidate edges. Must be provided.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets. Must be provided.
    rh1: RegressionHelper, default = None
        Regression helper for the first dataset; both helpers are recomputed from X1/X2 if either is None.
    rh2: RegressionHelper, default = None
        Regression helper for the second dataset.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Regularization passed to the regressions.
    progress: bool, default = False
        If True and difference_ug is non-empty, wrap the edge loop in a tqdm progress bar.
    true_diff: set, default = None
        Optional set of known true edges; when given, a diagnostic line is printed
        whenever an edge contained in it (in either direction) is removed.

    Returns
    -------
    skeletons: dict
        Maps each alpha in the grid to its set of remaining (i, j) edges.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        rh1 = RegressionHelper(gauss_ci_suffstat(X1))
        rh2 = RegressionHelper(gauss_ci_suffstat(X2))

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}

    def invariance_pvalue(target, predictors, tested):
        """p-value for invariance of the coefficient of `tested` when `target` is regressed on `predictors`."""
        # lam is forwarded for both directions; the second direction
        # previously fell back to the regression default.
        beta1, var1, precision1 = rh1.regression(target, predictors, lam=lam)
        beta2, var2, precision2 = rh2.regression(target, predictors, lam=lam)
        ix = predictors.index(tested)
        stat = (beta1[ix] - beta2[ix]) ** 2 * \
            inv(var1 * precision1 / (n1 - 1) + var2 * precision2 / (n2 - 1))[ix, ix]
        # Both predictor lists of an edge have the same length, so this
        # equals n1 + n2 - len(cond_set_i) - len(cond_set_j).
        return 1 - ncfdtr(1, n1 + n2 - 2 * len(predictors), 0, stat)

    def prune(edge, target, other, predictors):
        """Test one regression direction; return True once `edge` is removed for every alpha."""
        i, j = edge
        pval = invariance_pvalue(target, predictors, other)
        if not pval > min_alpha:
            if verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval:.5f} with cond set {predictors}")
            return False

        removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval > alpha]
        if verbose > 1:
            print(
                f"Removing edge {other}->{target} for alpha={removed_alphas} since p-value={pval:.5f} with cond set {predictors}")
        for alpha in removed_alphas:
            skeletons[alpha].discard((i, j))
        # Parenthesised so the membership tests only run when true_diff is
        # given; `a and b or c` previously evaluated `(j, i) in None`.
        if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
            print(
                f"Incorrectly removing edge {other}->{target} for alpha={removed_alphas} since p-value={pval:.6f} with cond set {predictors}")
        return len(removed_alphas) == len(alpha_skeleton_grid)

    edges = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in edges:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # i regressed on (cond_set, j), then j regressed on (cond_set, i);
            # stop early once the edge is gone from every skeleton.
            if prune((i, j), target=i, other=j, predictors=cond_set_i):
                break
            if prune((i, j), target=j, other=i, predictors=cond_set_j):
                break

    return skeletons
def _cdf(self, x, dfn, dfd, nc):
    """Return the noncentral F cumulative distribution function at ``x``.

    Delegates to ``special.ncfdtr``, whose argument order is
    (dfn, dfd, nc, x): the evaluation point comes last there but first here.
    """
    cumulative = special.ncfdtr(dfn, dfd, nc, x)
    return cumulative
def dci_orient(
        skeleton: set,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    For every skeleton edge, conditioning sets for both endpoints are tried
    in lockstep. The edge is oriented as soon as the residual variance of at
    least one endpoint is judged invariant across the two datasets.

    Parameters
    ----------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended
        max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    oriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could be determined.
    unoriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could not be determined.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    # every node that appears as an endpoint of a skeleton edge
    nodes = set()
    for first, second in skeleton:
        nodes.add(first)
        nodes.add(second)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    def variance_pvalue(node, cond):
        """Two-sided p-value for equal residual variance of `node` given `cond` in both datasets."""
        _, var1, _ = rh1.regression(node, cond)
        _, var2, _ = rh2.regression(node, cond)
        cdf = ncfdtr(n1 - len(cond), n2 - len(cond), 0, var1 / var2)
        return 2 * min(cdf, 1 - cdf)

    oriented_edges = set()
    for i, j in skeleton:
        paired_cond_sets = zip(powerset(nodes - {i}, r_max=max_set_size),
                               powerset(nodes - {j}, r_max=max_set_size))
        for cond_i, cond_j in paired_cond_sets:
            pvalue_i = variance_pvalue(i, cond_i)
            pvalue_j = variance_pvalue(j, cond_j)

            # neither endpoint looks invariant: try the next conditioning sets
            if not ((pvalue_i > alpha) | (pvalue_j > alpha)):
                continue

            # orient the edge according to highest p-value
            if pvalue_i > pvalue_j:
                edge = (j, i) if j in cond_i else (i, j)
            else:
                edge = (i, j) if i in cond_j else (j, i)

            oriented_edges.add(edge)
            if verbose > 0:
                print("Oriented (%d, %d) as %s" % (i, j, edge))
            break

    oriented_pairs = {frozenset({i, j}) for i, j in oriented_edges}
    unoriented_edges = skeleton - oriented_pairs
    return oriented_edges, unoriented_edges
def dci_skeleton(
        difference_ug: list,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended
        max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges (as frozensets) in the skeleton of the difference-DAG.
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    nodes = get_nodes_in_graph(difference_ug)
    skeleton = {frozenset({i, j}) for i, j in difference_ug}

    for i, j in difference_ug:
        for cond_set in powerset(nodes - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]
            dof = n1 + n2 - len(cond_set_i) - len(cond_set_j)

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i)

            # compute statistic and (two-sided) p-value for the coefficient of j
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = ncfdtr(1, dof, 0, stat_i)
            pval_i = 2 * min(pval_i, 1 - pval_i)

            # remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            if pval_i > alpha:
                if verbose > 0:
                    # The message previously printed "<" although the edge is
                    # removed when the p-value exceeds alpha.
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_i, alpha))
                # discard: the same undirected edge may be listed as both
                # (i, j) and (j, i), and remove() would raise KeyError.
                skeleton.discard(frozenset({i, j}))
                break

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j)

            # compute statistic and (two-sided) p-value for the coefficient of i;
            # the fold below makes starting from 1 - CDF equivalent to starting from the CDF
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, dof, 0, stat_j)
            pval_j = 2 * min(pval_j, 1 - pval_j)

            # remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            if pval_j > alpha:
                if verbose > 0:
                    print("Removing edge %d-%d since p-value=%.5f > alpha=%.5f" % (i, j, pval_j, alpha))
                skeleton.discard(frozenset({i, j}))
                break

    return skeleton
def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one way ANOVA

    This is based on Liu, X.S. (2014) *Statistical power analysis for the
    social and behavioral sciences: basic and advanced techniques.*
    New York: Routledge. 378 pg.

    Parameters
    ----------
    samples : ndarrays
        Arrays of observations to be tested.
    counts : int or array-like
        Keyword argument (required). The number of observations per sample
        to be used to test the power.
    alpha : float
        Keyword argument, default 0.05. The critical value for power
        calculations.

    Returns
    -------
    ndarray
        This describes the probability of seeing a significant difference
        between the samples for the specified number of observations
        (count) and critical value. A scalar `counts` yields a 0-d array.

    Raises
    ------
    ValueError
        If `counts` is not supplied.
    """
    # Checks the keywords
    kwds = {'counts': None, 'alpha': 0.05}
    kwds.update(kwargs)
    if kwds['counts'] is None:
        raise ValueError('counts is undefined!')
    # Accept scalars and lists as well as arrays.
    counts = np.asarray(kwds['counts'])
    alpha = kwds['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    k = len(samples)
    grand_mean = np.hstack(samples).mean()
    # NOTE(review): the denominator subtracts 2 regardless of the number of
    # groups and the variances use ddof=0 -- confirm this is the intended
    # pooled estimate when more than two samples are supplied.
    pooled = np.sqrt(
        np.sum([np.square(x.std()) * (len(x) - 1) for x in samples]) /
        (np.sum([len(x) for x in samples]) - 2)
    )
    df1 = k - 1
    df2 = k * (counts - 1)

    # Calculates the noncentrality parameter
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / pooled)
        for sample in samples
    ]).sum() * counts

    fu = stats.f.ppf(1 - alpha, df1, df2)

    # Calculates the power using the non-central F distribution
    power = 1 - sp.ncfdtr(df1, df2, noncentrality, fu)

    # The non central F distribution does not return a value of 1, so NaNs
    # are replaced with 1. np.where also handles scalar results, which do
    # not support item assignment.
    return np.where(np.isnan(power), 1.0, power)
def gauss_invariance_test(
        suffstat,
        context,
        i: int,
        cond_set: Optional[Union[List[int], int]] = None,
        alpha: float = 0.05,
        new=True,
        zero_mean=False,
        zero_coeffs=False
):
    """
    Test the null hypothesis that two Gaussian distributions are equal.

    Regresses variable ``i`` on the conditioning set separately in the
    observational samples and in the samples of ``context``, then compares
    the residual variances (F-test) and, when there are regression
    coefficients, the coefficients themselves.

    Parameters
    ----------
    suffstat:
        dictionary containing:
        'obs' -- dict with 'samples' (2-D sample matrix) and 'G' (Gram matrix)
        'contexts' -- dict mapping each context to a dict with 'samples' and 'G'
    context:
        which context to test.
    i:
        position of marginal distribution.
    cond_set:
        positions of conditioning set in correlation matrix.
    alpha:
        Significance level.
    new:
        Unused in this function.
    zero_mean:
        If False, index -1 is added to the regressors
        (presumably a constant intercept column appended to the samples -- TODO confirm).
    zero_coeffs:
        Unused in this function.

    Return
    ------
    dictionary containing ftest_stat, f_pvalue and reject, plus rc_stat and rc_pvalue
    when at least one regression coefficient was estimated.
    """
    cond_set = to_list(cond_set)
    obs_samples = suffstat['obs']['samples']
    iv_samples = suffstat['contexts'][context]['samples']
    n1, _ = obs_samples.shape
    n2 = iv_samples.shape[0]

    # === FIND REGRESSION COEFFICIENTS AND RESIDUALS
    if len(cond_set) != 0:
        cond_ix = cond_set if zero_mean else [*cond_set, -1]
        gram1 = suffstat['obs']['G'][np.ix_(cond_ix, cond_ix)]
        gram2 = suffstat['contexts'][context]['G'][np.ix_(cond_ix, cond_ix)]
        coefs1 = np.linalg.inv(gram1) @ obs_samples[:, cond_ix].T @ obs_samples[:, i]
        coefs2 = np.linalg.inv(gram2) @ iv_samples[:, cond_ix].T @ iv_samples[:, i]
        residuals1 = obs_samples[:, i] - obs_samples[:, cond_ix] @ coefs1
        residuals2 = iv_samples[:, i] - iv_samples[:, cond_ix] @ coefs2
    elif not zero_mean:
        # intercept-only model: the single coefficient is the sample mean
        cond_ix = [-1]
        gram1 = n1 * np.ones([1, 1])
        gram2 = n2 * np.ones([1, 1])
        coefs1 = np.array([np.mean(obs_samples[:, i])])
        coefs2 = np.array([np.mean(iv_samples[:, i])])
        residuals1 = obs_samples[:, i] - coefs1
        residuals2 = iv_samples[:, i] - coefs2
    else:
        # No regressors and no intercept. cond_ix was previously left
        # undefined on this path, which raised NameError below.
        cond_ix = []
        residuals1 = obs_samples[:, i]
        residuals2 = iv_samples[:, i]

    num_coefs = len(cond_ix)
    has_coefs = num_coefs != 0

    # variances of residuals, with one degree of freedom removed per coefficient
    var1 = np.var(residuals1, ddof=num_coefs)
    var2 = np.var(residuals2, ddof=num_coefs)

    # calculate regression coefficient invariance statistic
    if has_coefs:
        coef_diff = coefs1 - coefs2
        rc_stat = coef_diff @ inv(var1 * inv(gram1) + var2 * inv(gram2)) @ coef_diff.T / num_coefs
        rc_pvalue = ncfdtr(num_coefs, n1 + n2 - num_coefs, 0, rc_stat)
        rc_pvalue = 2 * min(rc_pvalue, 1 - rc_pvalue)

    # calculate statistic for F-Test
    ftest_stat = var1 / var2
    f_pvalue = ncfdtr(n1 - 1, n2 - 1, 0, ftest_stat)
    f_pvalue = 2 * min(f_pvalue, 1 - f_pvalue)

    # === ACCEPT/REJECT INVARIANCE HYPOTHESIS BASED ON P-VALUES WITH BONFERRONI CORRECTION
    if has_coefs:
        # two tests are run, so each is compared against alpha / 2
        reject = f_pvalue < alpha / 2 or rc_pvalue < alpha / 2
    else:
        reject = f_pvalue < alpha

    # === FORM RESULT DICT AND RETURN
    result_dict = dict(ftest_stat=ftest_stat, f_pvalue=f_pvalue, reject=reject)
    if has_coefs:
        result_dict['rc_stat'] = rc_stat
        result_dict['rc_pvalue'] = rc_pvalue

    return result_dict
import matplotlib.pyplot as plt
import numpy as np  # np.linspace is used below but numpy was never imported
from scipy import special
from scipy import stats

# Plot the CDF of the non-central F distribution, for nc=0. Compare with the
# F-distribution from scipy.stats:
x = np.linspace(-1, 8, num=500)
dfn = 3
dfd = 2
ncf_stats = stats.f.cdf(x, dfn, dfd)
ncf_special = special.ncfdtr(dfn, dfd, 0, x)

# Thick blue line: scipy.stats reference; thin red line on top: ncfdtr.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x, ncf_stats, 'b-', lw=3)
ax.plot(x, ncf_special, 'r-')
plt.show()
def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one way ANOVA

    This is based on Liu, X.S. (2014) *Statistical power analysis for the
    social and behavioral sciences: basic and advanced techniques.*
    New York: Routledge. 378 pg.

    Parameters
    ----------
    samples : ndarrays
        Arrays of observations to be tested.
    counts : int or array-like
        Keyword argument (required). The number of observations per sample
        to be used to test the power.
    alpha : float
        Keyword argument, default 0.05. The critical value for power
        calculations.

    Returns
    -------
    ndarray
        This describes the probability of seeing a significant difference
        between the samples for the specified number of observations
        (count) and critical value. A scalar `counts` yields a 0-d array.

    Raises
    ------
    ValueError
        If `counts` is not supplied.
    """
    # Checks the keywords
    options = {'counts': None, 'alpha': 0.05}
    options.update(kwargs)
    if options['counts'] is None:
        raise ValueError('counts is undefined!')
    # Accept scalars and lists as well as arrays.
    counts = np.asarray(options['counts'])
    alpha = options['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    k = len(samples)
    df1 = k - 1
    df2 = k * (counts - 1)

    # Calculates the noncentrality parameter
    grand_mean = np.hstack(samples).mean()
    # NOTE(review): the denominator subtracts 2 regardless of the number of
    # groups and the variances use ddof=0 -- confirm this is the intended
    # pooled estimate when more than two samples are supplied.
    pooled = np.sqrt(
        np.sum([np.square(x.std()) * (len(x) - 1) for x in samples]) /
        (np.sum([len(x) for x in samples]) - 2))
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / pooled)
        for sample in samples
    ]).sum() * counts

    fu = stats.f.ppf(1 - alpha, df1, df2)

    # Calculates the power using the non-central F distribution
    power = 1 - sp.ncfdtr(df1, df2, noncentrality, fu)

    # The non central F distribution does not return a value of 1, so NaNs
    # are replaced with 1. np.where also handles scalar results, which do
    # not support item assignment.
    return np.where(np.isnan(power), 1.0, power)
def dci_orient_order_independent(
        X1,
        X2,
        skeletons: Union[Dict[float, set], set],
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients skeleton edges by choosing parent sets in order of decreasing p-value.

    For each parent-set size, every candidate (node, parent set) pair is scored
    by a two-sided test of equal residual variance across the two datasets.
    Candidates with p-value above alpha are applied from the highest p-value
    down, skipping any that contradict already-oriented edges or would create
    a cycle. Remaining edges are oriented along existing directed paths.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset. Its number of columns sets the adjacency matrix size.
    X2: array
        Second dataset. Only used when rh1 or rh2 is None.
    skeletons: set or dict
        Either a set of (i, j) edges, or a dict mapping alpha values to such sets.
        For a dict, the function is applied to each entry with that entry's alpha.
    nodes_cond_set: set
        Nodes to be considered as parent/conditioning sets.
    rh1: RegressionHelper, default = None
        Regression helper for the first dataset; both helpers are recomputed from X1/X2 if either is None.
    rh2: RegressionHelper, default = None
        Regression helper for the second dataset.
    alpha: float, default = 0.1
        Significance level; parent sets are only considered when their p-value exceeds it.
    max_set_size: int, default = 3
        Controls the largest parent set tried (see the NOTE in the body).
    verbose: int, default = 0
        The verbosity level of logging messages.

    Returns
    -------
    adjacency_matrix
        Sum of edges2adjacency for the unoriented edges (undirected=True) and for the
        oriented edges (undirected=False); a dict of these keyed by alpha when
        `skeletons` is a dict.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        rh1 = RegressionHelper(gauss_ci_suffstat(X1))
        rh2 = RegressionHelper(gauss_ci_suffstat(X2))

    if isinstance(skeletons, dict):
        # verbose is forwarded so the per-alpha runs log like a direct call
        return {
            alpha: dci_orient_order_independent(
                X1, X2, skeleton, nodes_cond_set, rh1, rh2,
                alpha=alpha, max_set_size=max_set_size, verbose=verbose
            )
            for alpha, skeleton in skeletons.items()
        }

    skeleton = {frozenset({i, j}) for i, j in skeletons}
    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    d_nx = nx.DiGraph()
    d_nx.add_nodes_from(nodes)
    nodes_with_decided_parents = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    # NOTE(review): sizes 0 .. max_set_size + 1 are tried, i.e. one more than
    # max_set_size -- confirm this upper bound is intended.
    for parent_set_size in range(max_set_size + 2):
        if verbose > 0:
            print(f"Trying parent sets of size {parent_set_size}")

        # score every candidate parent set of the still-undecided nodes
        pvalue_dict = dict()
        for i in nodes - nodes_with_decided_parents:
            for cond_i in itertools.combinations(nodes_cond_set - {i}, parent_set_size):
                beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
                beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
                pvalue_i = ncfdtr(n1 - parent_set_size, n2 - parent_set_size, 0, var1_i / var2_i)
                pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
                pvalue_dict[(i, frozenset(cond_i))] = pvalue_i

        # candidates above alpha, highest p-value first
        candidates = [
            (pvalue, i, cond_i)
            for (i, cond_i), pvalue in sorted(pvalue_dict.items(), key=op.itemgetter(1), reverse=True)
            if pvalue > alpha
        ]

        for _, i, cond_i in candidates:
            i_children = {j for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}

            # don't use this parent set if it contradicts the existing edges
            if any(j in d_nx.successors(i) for j in cond_i):
                continue
            if any(j in d_nx.predecessors(i) for j in i_children):
                continue

            # don't use this parent set if it creates a cycle
            if any(j in nx.descendants(d_nx, i) for j in cond_i):
                continue
            if any(j in nx.ancestors(d_nx, i) for j in i_children):
                continue

            # skeleton neighbours in the parent set point into i, all others out of i
            edges = {(j, i) for j in cond_i if frozenset({i, j}) in skeleton} | \
                {(i, j) for j in i_children}
            nodes_with_decided_parents.add(i)
            if verbose > 0:
                print(f"Adding {edges}")
            d_nx.add_edges_from(edges)

    # orient edges via graph traversal
    oriented_edges = set(d_nx.edges)
    unoriented_edges_before_traversal = skeleton - {frozenset({j, i}) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only existence of a directed path matters, so has_path replaces
        # enumerating every simple path (which can be exponential).
        if nx.has_path(g, i, j):
            directed = (i, j)
        elif nx.has_path(g, j, i):
            directed = (j, i)
        else:
            continue
        oriented_edges.add(directed)
        unoriented_edges.remove(frozenset({i, j}))
        if verbose > 0:
            print("Oriented (%d, %d) as %s with graph traversal" % (i, j, directed))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def _nonadjusted(df1, df2, fcrit, noncen):
    """Evaluate the noncentral F CDF directly (no approximation).

    Returns
    -------
    tuple
        ``(prob, Constants.FMETHOD_NOAPPROXIMATION)`` where ``prob`` is
        ``special.ncfdtr(df1, df2, noncen, fcrit)``.
    """
    return special.ncfdtr(df1, df2, noncen, fcrit), Constants.FMETHOD_NOAPPROXIMATION
def dci_orient(
        X1,
        X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
        Recomputed from X1 when either rh1 or rh2 is None.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
        Recomputed from X2 when either rh1 or rh2 is None.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        Smaller maximum conditioning set size results in faster computation time. For large datasets recommended
        max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix
        Sum of edges2adjacency for the edges whose direction could not be determined
        (undirected=True) and for the oriented edges (undirected=False), built for
        X1.shape[1] nodes.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        rh1 = RegressionHelper(gauss_ci_suffstat(X1))
        rh2 = RegressionHelper(gauss_ci_suffstat(X2))

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    def variance_pvalue(node, cond):
        """Two-sided p-value for equal residual variance of `node` given `cond` in both datasets."""
        _, var1, _ = rh1.regression(node, list(cond))
        _, var2, _ = rh2.regression(node, list(cond))
        cdf = ncfdtr(n1 - len(cond), n2 - len(cond), 0, var1 / var2)
        return 2 * min(cdf, 1 - cdf)

    for i, j in skeleton:
        for cond_i, cond_j in zip(powerset(nodes_cond_set - {i}, r_max=max_set_size),
                                  powerset(nodes_cond_set - {j}, r_max=max_set_size)):
            pvalue_i = variance_pvalue(i, cond_i)
            pvalue_j = variance_pvalue(j, cond_j)

            if ((pvalue_i > alpha) | (pvalue_j > alpha)):
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient edges via graph traversal
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        # Only existence of a directed path matters, so has_path replaces
        # enumerating every simple path (which can be exponential).
        if nx.has_path(g, i, j):
            directed = (i, j)
        elif nx.has_path(g, j, i):
            directed = (j, i)
        else:
            continue
        oriented_edges.add(directed)
        unoriented_edges.remove((i, j))
        if verbose > 0:
            print("Oriented (%d, %d) as %s with graph traversal" % (i, j, directed))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + \
        edges2adjacency(num_nodes, oriented_edges, undirected=False)
    return adjacency_matrix
def my_func(v, power):
    """Return the achieved power for denominator df ``v`` minus ``power``.

    The achieved power is one minus the noncentral F CDF, with (u, v) degrees
    of freedom and noncentrality ``f2 * (u + v + 1)``, evaluated at the
    ``1 - sig_level`` quantile of the central F distribution. ``u``, ``f2``,
    ``sig_level`` and ``f`` are taken from the enclosing scope.

    NOTE(review): the sign convention suggests this is the objective of a
    root finder solving for ``v`` -- confirm against the caller.
    """
    noncentrality = f2 * (u + v + 1)
    critical_value = f.ppf(1 - sig_level, u, v)
    achieved_power = 1 - special.ncfdtr(u, v, noncentrality, critical_value)
    return achieved_power - power