def __unconditional_power_simpson_term(self, fcrit, df1, df2, t):
    """
    Calculate the integrand of the unconditional power integral in
    Glueck and Muller (200??), eq. ??.
    """
    # check bounds H0, H1
    if self.H1 < self.H0:
        raise GlimmpseValidationException("H0 is greater than H1")
    elif round(self.H1, 12) == round(self.H0, 12):
        return 0
    else:
        t1 = special.ncfdtr(df1, df2, t, fcrit)
        # rescale the critical value for the (df1 + 2, df2) distribution
        t2_fcrit = (fcrit * df1) / (df1 + 2)
        t2 = special.ncfdtr(df1 + 2, df2, t, t2_fcrit)
        return self.cdf(t) * (t1 - t2)
Example 2
def _tiku_approximation(df1, df2, fcrit, noncen):
    """Tiku approximation (best approximation)"""
    h_tiku = 2 * (df1 + noncen)**3 + 3 * (df1 + noncen) * (
        df1 + 2 * noncen) * (df2 - 2) + (df1 + 3 * noncen) * (df2 - 2)**2
    k_tiku = (df1 + noncen)**2 + (df2 - 2) * (df1 + 2 * noncen)
    df1_tiku = math.floor(0.5 * (df2 - 2) *
                          ((h_tiku**2 / (h_tiku**2 - 4 * k_tiku**3))**0.5 - 1))
    c_tiku = (df1_tiku / df1) / (2 * df1_tiku + df2 - 2) * (h_tiku / k_tiku)
    b_tiku = -df2 / (df2 - 2) * (c_tiku - 1 - noncen / df1)
    fcrit_tiku = (fcrit - b_tiku) / c_tiku
    prob = special.ncfdtr(df1_tiku, df2, 0, fcrit_tiku)
    fmethod = Constants.FMETHOD_TIKU
    return prob, fmethod
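For context, an approximation like this is easy to sanity-check against the exact CDF. A minimal sketch, assuming the `math` import and `Constants` enum that `_tiku_approximation` relies on are in scope; the parameter values are made up:

# Compare the Tiku approximation above with the exact noncentral-F CDF.
from scipy import special

df1, df2, fcrit, noncen = 4, 20, 2.87, 1.5   # arbitrary illustrative values
exact = special.ncfdtr(df1, df2, noncen, fcrit)
approx, fmethod = _tiku_approximation(df1, df2, fcrit, noncen)
print(f"exact={exact:.6f}  tiku={approx:.6f}  abs err={abs(exact - approx):.2e}")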
Example 3
import numpy as np
import scipy.special as sp
from scipy import stats


def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one-way ANOVA"""

    # Checks the keywords
    kwds = {'counts': None, 'alpha': 0.05}
    for k, v in kwargs.items():
        kwds[k] = v
    if kwds['counts'] is None:
        raise ValueError('counts is undefined!')
    counts = kwds['counts']
    alpha = kwds['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    # Determines the group sizes and characteristics
    k = len(samples)
    grand_mean = np.concatenate(samples).mean()

    df1 = k - 1
    df2 = k * (counts - 1)

    # Calculates the noncentrality parameter
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / sample.std())
        for sample in samples
    ]).sum() * counts

    fl = stats.f.ppf(alpha / 2, df1, df2)
    fu = stats.f.ppf(1 - alpha / 2, df1, df2)

    # Calculates the power using the non-central F distribution
    power = (1 - sp.ncfdtr(df1, df2, noncentrality, fu) +
             sp.ncfdtr(df1, df2, noncentrality, fl))
    # for extreme noncentrality ncfdtr can return nan instead of 1,
    # so we replace nans with a power of 1.
    power[np.isnan(power)] = 1

    return power
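A hedged usage sketch for this version, using the imports added above and made-up group data:

# Hypothetical power curve: three synthetic groups with increasing means,
# evaluated at several per-group sample sizes.
rng = np.random.default_rng(0)
groups = [rng.normal(mu, 1.0, 50) for mu in (0.0, 0.5, 1.0)]
counts = np.arange(5, 55, 5)
print(calc_anova(*groups, counts=counts, alpha=0.05))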
def unconditional_power_simpson(self, fcrit, df1, df2):
    """
    Calculates unconditional power by integrating over [H0, H1] with
    composite Simpson's rule.
    """
    y = lambda x: self.__unconditional_power_simpson_term(
        fcrit=fcrit, df1=df1, df2=df2, t=x)

    t1 = special.ncfdtr(df1, df2, self.H1, fcrit)

    # set up properties for integration by Simpson's rule
    n = 2
    old_prob = 1
    max_intervals = 64 ** 2  # cap on the number of subintervals

    # iterate, doubling the number of subintervals until convergence
    end_condition = False
    while not end_condition:
        h = (self.H1 - self.H0) / n
        # composite Simpson's rule needs the n + 1 points x_0 .. x_n
        x = [self.H0 + i * h for i in range(n + 1)]
        fx = [y(xi) for xi in x]

        res = 0
        for i in range(n + 1):
            if i == 0 or i == n:
                res += fx[i]
            elif i % 2 != 0:
                res += 4 * fx[i]
            else:
                res += 2 * fx[i]

        res = res * (h / 3)
        t2 = res / 2
        prob = t1 + t2

        # stop once the estimate has converged or the interval budget is spent
        if n >= max_intervals:
            end_condition = True
        delta = math.fabs(prob - old_prob)
        r_limit = 1e-6 * (math.fabs(prob) + math.fabs(old_prob)) * 0.5
        if (delta <= r_limit) or (delta <= 1e-6):
            end_condition = True
        old_prob = prob
        n = n * 2

    return prob, None
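The hand-rolled loop above mirrors what `scipy.integrate` already provides. A minimal sketch of the same integral via scipy.integrate.quad, assuming it is added as a method of the same class (so the name-mangled private helper resolves) and that the factor-of-two convention above is intended:

from scipy import integrate, special

def unconditional_power_quad(self, fcrit, df1, df2):
    # adaptive quadrature over [H0, H1] of the same integrand used by the
    # Simpson's-rule version above
    t1 = special.ncfdtr(df1, df2, self.H1, fcrit)
    integral, _abserr = integrate.quad(
        lambda x: self.__unconditional_power_simpson_term(
            fcrit=fcrit, df1=df1, df2=df2, t=x),
        self.H0, self.H1)
    return t1 + integral / 2, None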
def dci_skeleton(
        X1,
        X2,
        difference_ug: list,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.    
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation; for large datasets the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.
    lam: float, default = 0
        Amount of regularization for regression (becomes ridge regression if nonzero).
    progress: bool, default = False
        Whether to show a progress bar (via tqdm) over the edges of the difference undirected graph.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """

    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    skeleton = {(i, j) for i, j in difference_ug}

    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug
    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            #  remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} since p-value={pval_i:.5f} > alpha={alpha:.5f} with cond set {cond_set_i}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} < alpha={alpha:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            #  remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} since p-value={pval_j:.5f} > alpha={alpha:.5f} with cond set {cond_set_j}")
                skeleton.remove((i, j))
                break
            elif verbose > 1:
                print(
                    f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f} < alpha={alpha:.5f} with cond set {cond_set_j}")

    return skeleton
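A hypothetical call on synthetic data; the helpers (gauss_ci_suffstat, RegressionHelper, powerset, ncfdtr, inv, tqdm) are assumed to come from the same package as the snippet, and the difference undirected graph would normally be produced by a dci_undirected_graph step:

import numpy as np

rng = np.random.default_rng(1)
X1 = rng.normal(size=(500, 3))
X2 = rng.normal(size=(500, 3))
X2[:, 2] = 2 * X2[:, 0] + rng.normal(size=500)   # edge 0-2 differs across datasets
est_skeleton = dci_skeleton(X1, X2,
                            difference_ug=[(0, 2)],
                            nodes_cond_set={0, 1, 2},
                            alpha=0.1)
print(est_skeleton)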
def dci_skeleton_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        verbose: int = 0,
        lam: float = 0,
        progress: bool = False,
        true_diff: Optional[Set] = None
):
    """
    Estimates the skeleton of the difference-DAG for each significance level
    in alpha_skeleton_grid. Returns a dictionary mapping each alpha to the
    corresponding skeleton (a set of edges).
    """
    if verbose > 0:
        print("DCI skeleton estimation...")

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']

    for alpha in alpha_skeleton_grid:
        assert 0 <= alpha <= 1, "alpha must be in [0,1] range."
    min_alpha = min(alpha_skeleton_grid)

    skeletons = {alpha: {(i, j) for i, j in difference_ug} for alpha in alpha_skeleton_grid}
    difference_ug = tqdm(difference_ug) if (progress and len(difference_ug) != 0) else difference_ug

    for i, j in difference_ug:
        for cond_set in powerset(nodes_cond_set - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i, lam=lam)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i, lam=lam)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)

            #  remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > min_alpha
            if i_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_i > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.5f} with cond set {cond_set_i}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(
                        f"Incorrectly removing edge {j}->{i} for alpha={removed_alphas} since p-value={pval_i:.6f} with cond set {cond_set_i}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_i:.5f} with cond set {cond_set_i}")

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j, lam=lam)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j, lam=lam)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = 1 - ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)

            #  remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > min_alpha
            if j_invariant:
                removed_alphas = [alpha for alpha in alpha_skeleton_grid if pval_j > alpha]
                if verbose > 1:
                    print(
                        f"Removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.5f} with cond set {cond_set_j}")
                for alpha in removed_alphas:
                    skeletons[alpha].discard((i, j))
                if true_diff is not None and ((i, j) in true_diff or (j, i) in true_diff):
                    print(
                        f"Incorrectly removing edge {i}->{j} for alpha={removed_alphas} since p-value={pval_j:.6f} with cond set {cond_set_j}")
                if len(removed_alphas) == len(alpha_skeleton_grid):
                    break
            elif verbose > 1:
                print(f"Keeping edge {i}-{j} for now, since p-value={pval_j:.5f}with cond set {cond_set_j}")

    return skeletons
Example 7
def _cdf(self, x, dfn, dfd, nc):
    return special.ncfdtr(dfn, dfd, nc, x)
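This is essentially how scipy.stats.ncf delegates its CDF to the special function; a quick check that the two entry points agree (note the different argument orders; the values are arbitrary):

from scipy import special, stats

x, dfn, dfd, nc = 2.0, 3, 10, 1.5
print(stats.ncf.cdf(x, dfn, dfd, nc))     # distribution: (x, dfn, dfd, nc)
print(special.ncfdtr(dfn, dfd, nc, x))    # special fn:   (dfn, dfd, nc, x)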
Example 9
def dci_orient(
        skeleton: set,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation; for large datasets the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    oriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could be determined.
    unoriented_edges: set
        Set of edges in the skeleton of the difference-DAG for which directionality could not be determined.
    """

    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for i, j in skeleton:
        for cond_i, cond_j in zip(powerset(nodes - {i}, r_max=max_set_size), powerset(nodes - {j}, r_max=max_set_size)):
            # compute residual variances for i
            beta1_i, var1_i, _ = rh1.regression(i, cond_i)
            beta2_i, var2_i, _ = rh2.regression(i, cond_i)
            # compute p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, cond_j)
            beta2_j, var2_j, _ = rh2.regression(j, cond_j)
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if (pvalue_i > alpha) or (pvalue_j > alpha):
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s" % (i, j, edge))
                break

    unoriented_edges = skeleton - {frozenset({i, j}) for i, j in oriented_edges}
    return oriented_edges, unoriented_edges
Example 10
def dci_skeleton(
        difference_ug: list,
        rh1: RegressionHelper,
        rh2: RegressionHelper,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Estimates the skeleton of the difference-DAG.

    Parameters
    ----------
    difference_ug: list
        List of tuples that represents edges in the difference undirected graph.
    rh1: RegressionHelper
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining presence of edges in the skeleton of the difference graph.
        Lower alpha results in sparser difference graph.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation; for large datasets the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_orient

    Returns
    -------
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    """

    if verbose > 0:
        print("DCI skeleton estimation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    nodes = get_nodes_in_graph(difference_ug)
    skeleton = {frozenset({i, j}) for i, j in difference_ug}

    for i, j in difference_ug:
        for cond_set in powerset(nodes - {i, j}, r_max=max_set_size):
            cond_set_i, cond_set_j = [*cond_set, j], [*cond_set, i]

            # calculate regression coefficients (i regressed on cond_set_i) for both datasets
            beta1_i, var1_i, precision1 = rh1.regression(i, cond_set_i)
            beta2_i, var2_i, precision2 = rh2.regression(i, cond_set_i)

            # compute statistic and p-value
            j_ix = cond_set_i.index(j)
            stat_i = (beta1_i[j_ix] - beta2_i[j_ix]) ** 2 * \
                     inv(var1_i * precision1 / (n1 - 1) + var2_i * precision2 / (n2 - 1))[j_ix, j_ix]
            pval_i = ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_i)
            pval_i = 2 * min(pval_i, 1 - pval_i)

            #  remove i-j from skeleton if i regressed on (j, cond_set) is invariant
            i_invariant = pval_i > alpha
            if i_invariant:
                if verbose > 0:
                    print("Removing edge %d-%d since p-value=%.5f < alpha=%.5f" % (i, j, pval_i, alpha))
                skeleton.remove(frozenset({i, j}))
                break

            # calculate regression coefficients (j regressed on cond_set_j) for both datasets
            beta1_j, var1_j, precision1 = rh1.regression(j, cond_set_j)
            beta2_j, var2_j, precision2 = rh2.regression(j, cond_set_j)

            # compute statistic and p-value
            i_ix = cond_set_j.index(i)
            stat_j = (beta1_j[i_ix] - beta2_j[i_ix]) ** 2 * \
                     inv(var1_j * precision1 / (n1 - 1) + var2_j * precision2 / (n2 - 1))[i_ix, i_ix]
            pval_j = ncfdtr(1, n1 + n2 - len(cond_set_i) - len(cond_set_j), 0, stat_j)
            pval_j = 2 * min(pval_j, 1 - pval_j)

            #  remove i-j from skeleton if j regressed on (i, cond_set) is invariant
            j_invariant = pval_j > alpha
            if j_invariant:
                if verbose > 0:
                    print("Removing edge %d-%d since p-value=%.5f < alpha=%.5f" % (i, j, pval_j, alpha))
                skeleton.remove(frozenset({i, j}))
                break

    return skeleton
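A hedged sketch chaining this snippet with the dci_orient from Example 9; difference_ug, rh1 and rh2 are assumed to exist as in the earlier examples:

# Hypothetical pipeline: estimate the skeleton of the difference-DAG,
# then orient whatever edges can be oriented.
est_skeleton = dci_skeleton(difference_ug, rh1, rh2, alpha=0.1, max_set_size=3)
oriented, unoriented = dci_orient(est_skeleton, rh1, rh2, alpha=0.1, max_set_size=3)
print(f"{len(oriented)} oriented edge(s), {len(unoriented)} unoriented edge(s)")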
Example 11
def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one way ANOVA

    This is based on
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.

    Parameters
    ----------
    samples : ndarrays
        Arrays of observations to be tested.
    counts : array
        the number of observations per sample to be used to test the power
    alpha : float
        The critical value for power calculations

    Returns
    -------
    ndarray
        This describes the probability of seeing a significant difference
        between the samples for the specified number of observations
        (count) and critical value.
    """

    # Checks the keywords
    kwds = {'counts': None,
            'alpha': 0.05}
    for k, v in kwargs.items():
        kwds[k] = v
    if kwds['counts'] is None:
        raise ValueError('counts is undefined!')
    counts = kwds['counts']
    alpha = kwds['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    k = len(samples)
    grand_mean = np.hstack(samples).mean()
    # pooled standard deviation: within-group sums of squares over N - k
    pooled = np.sqrt(
        np.sum([np.square(x.std()) * (len(x) - 1) for x in samples]) /
        (np.sum([len(x) for x in samples]) - k)
        )

    df1 = k - 1
    df2 = k * (counts - 1)

    # Calculates the noncentrality parameter
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / pooled)
        for sample in samples
        ]).sum() * counts
    # noncentrality = cohen_f2(*samples) * counts

    fu = stats.f.ppf(1 - alpha, df1, df2)

    # Calculates the power using the non-central F distribution
    power = (1 - sp.ncfdtr(df1, df2, noncentrality, fu))
    # for extreme noncentrality ncfdtr can return nan instead of 1,
    # so we replace nans with a power of 1.
    power[np.isnan(power)] = 1

    return power
Example 12
def gauss_invariance_test(suffstat,
                          context,
                          i: int,
                          cond_set: Optional[Union[List[int], int]] = None,
                          alpha: float = 0.05,
                          new=True,
                          zero_mean=False,
                          zero_coeffs=False):
    """
    Test the null hypothesis that two Gaussian distributions are equal.

    Parameters
    ----------
    suffstat:
        dictionary containing:
        'obs' -- number of samples
            'G' -- Gram matrix
        'contexts'
    context:
        which context to test.
    i:
        position of marginal distribution.
    cond_set:
        positions of conditioning set in correlation matrix.
    alpha:
        Significance level.

    Return
    ------
    dictionary containing ttest_stat, ftest_stat, f_pvalue, t_pvalue, and reject.
    """
    cond_set = to_list(cond_set)
    obs_samples = suffstat['obs']['samples']
    iv_samples = suffstat['contexts'][context]['samples']
    n1, p = obs_samples.shape
    n2 = iv_samples.shape[0]

    # === FIND REGRESSION COEFFICIENTS AND RESIDUALS
    if len(cond_set) != 0:
        # index -1 picks out the intercept column unless the mean is fixed at zero
        cond_ix = cond_set if zero_mean else [*cond_set, -1]
        gram1 = suffstat['obs']['G'][np.ix_(cond_ix, cond_ix)]
        gram2 = suffstat['contexts'][context]['G'][np.ix_(cond_ix, cond_ix)]
        coefs1 = np.linalg.inv(gram1) @ obs_samples[:, cond_ix].T @ obs_samples[:, i]
        coefs2 = np.linalg.inv(gram2) @ iv_samples[:, cond_ix].T @ iv_samples[:, i]

        residuals1 = obs_samples[:, i] - obs_samples[:, cond_ix] @ coefs1
        residuals2 = iv_samples[:, i] - iv_samples[:, cond_ix] @ coefs2
    elif not zero_mean:
        # no conditioning set: regress on the intercept alone
        gram1 = n1 * np.ones([1, 1])
        gram2 = n2 * np.ones([1, 1])
        cond_ix = [-1]
        coefs1 = np.array([np.mean(obs_samples[:, i])])
        coefs2 = np.array([np.mean(iv_samples[:, i])])
        residuals1 = obs_samples[:, i] - coefs1
        residuals2 = iv_samples[:, i] - coefs2
    else:
        # zero mean and no conditioning set: the samples are the residuals
        cond_ix = []
        residuals1 = obs_samples[:, i]
        residuals2 = iv_samples[:, i]

    # variances of the residuals
    var1 = np.var(residuals1, ddof=len(cond_ix))
    var2 = np.var(residuals2, ddof=len(cond_ix))

    # calculate regression coefficient invariance statistic
    if len(cond_ix) != 0:
        p = len(cond_ix)
        rc_stat = ((coefs1 - coefs2)
                   @ inv(var1 * inv(gram1) + var2 * inv(gram2))
                   @ (coefs1 - coefs2).T / p)
        rc_pvalue = ncfdtr(p, n1 + n2 - p, 0, rc_stat)
        rc_pvalue = 2 * min(rc_pvalue, 1 - rc_pvalue)

    # calculate statistic for F-Test
    ftest_stat = var1 / var2
    f_pvalue = ncfdtr(n1 - 1, n2 - 1, 0, ftest_stat)
    f_pvalue = 2 * min(f_pvalue, 1 - f_pvalue)

    # === ACCEPT/REJECT INVARIANCE HYPOTHESIS BASED ON P-VALUES WITH BONFERRONI CORRECTION
    if len(cond_ix) != 0:
        reject = f_pvalue < alpha / 2 or rc_pvalue < alpha / 2
    else:
        reject = f_pvalue < alpha

    # === FORM RESULT DICT AND RETURN
    result_dict = dict(ftest_stat=ftest_stat, f_pvalue=f_pvalue, reject=reject)
    if len(cond_ix) > 0:
        result_dict['rc_stat'] = rc_stat
        result_dict['rc_pvalue'] = rc_pvalue

    return result_dict
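A hedged usage sketch. The layout of suffstat is inferred from how the function indexes it ('samples' carry a trailing all-ones intercept column, picked out by index -1, and 'G' is the matching Gram matrix); the real package presumably ships its own suffstat builder:

import numpy as np

def make_suffstat(obs, contexts):
    # hypothetical helper mirroring the structure the test expects
    aug = lambda X: np.hstack([X, np.ones((X.shape[0], 1))])
    stat = lambda X: {'samples': aug(X), 'G': aug(X).T @ aug(X)}
    return {'obs': stat(obs), 'contexts': [stat(X) for X in contexts]}

rng = np.random.default_rng(2)
X_obs = rng.normal(size=(100, 3))
X_iv = rng.normal(size=(100, 3))
X_iv[:, 0] += 1.0                        # mean shift on node 0 in the context
suffstat = make_suffstat(X_obs, [X_iv])
print(gauss_invariance_test(suffstat, context=0, i=0, cond_set=[1]))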
Example 13
import numpy as np
from scipy import special
from scipy import stats
import matplotlib.pyplot as plt

# Plot the CDF of the non-central F distribution, for nc=0.  Compare with the
# F-distribution from scipy.stats:

x = np.linspace(-1, 8, num=500)
dfn = 3
dfd = 2
ncf_stats = stats.f.cdf(x, dfn, dfd)
ncf_special = special.ncfdtr(dfn, dfd, 0, x)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x, ncf_stats, 'b-', lw=3)
ax.plot(x, ncf_special, 'r-')
plt.show()
Example 14
def calc_anova(*samples, **kwargs):
    """Calculates statistical power for a one way ANOVA

    This is based on
        Liu, X.S. (2014) *Statistical power analysis for the social and
        behavioral sciences: basic and advanced techniques.* New York:
        Routledge. 378 pg.

    Parameters
    ----------
    samples : ndarrays
        Arrays of observations to be tested.
    counts : array
        the number of observations per sample to be used to test the power
    alpha : float
        The critical value for power calculations

    Returns
    -------
    ndarray
        This describes the probability of seeing a significant difference
        between the samples for the specified number of observations
        (count) and critical value.
    """

    # Checks the keywords
    kwds = {'counts': None, 'alpha': 0.05}
    for k, v in kwargs.items():
        kwds[k] = v
    if kwds['counts'] is None:
        raise ValueError('counts is undefined!')
    counts = kwds['counts']
    alpha = kwds['alpha']

    # Converts the samples to arrays
    samples = [np.asarray(sample) for sample in samples]

    k = len(samples)
    df1 = k - 1
    df2 = k * (counts - 1)

    # Computes the grand mean and pooled standard deviation
    # (within-group sums of squares over N - k)
    grand_mean = np.hstack(samples).mean()
    pooled = np.sqrt(
        np.sum([np.square(x.std()) * (len(x) - 1)
                for x in samples]) / (np.sum([len(x) for x in samples]) - k))

    # Calculates the noncentrality parameter
    noncentrality = np.array([
        np.square((sample.mean() - grand_mean) / pooled) for sample in samples
    ]).sum() * counts
    # noncentrality = cohen_f2(*samples) * counts

    fu = stats.f.ppf(1 - alpha, df1, df2)

    # Calculates the power using the non-central F distribution
    power = (1 - sp.ncfdtr(df1, df2, noncentrality, fu))
    # for extreme noncentrality ncfdtr can return nan instead of 1,
    # so we replace nans with a power of 1.
    power[np.isnan(power)] = 1

    return power
def dci_orient_order_independent(
        X1,
        X2,
        skeletons: Union[Dict[float, set], set],
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Order-independent variant of dci_orient: chooses parent sets by testing
    invariance of residual variances across the two datasets, then orients
    the remaining skeleton edges by graph traversal. Returns an adjacency
    matrix with directed and undirected edges, or a dictionary mapping each
    alpha to such a matrix when a dictionary of skeletons is passed.
    """
    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    if isinstance(skeletons, dict):
        return {
            alpha: dci_orient_order_independent(
                X1,
                X2,
                skeleton,
                nodes_cond_set,
                rh1,
                rh2,
                alpha=alpha,
                max_set_size=max_set_size
            )
            for alpha, skeleton in skeletons.items()
        }

    skeleton = {frozenset({i, j}) for i, j in skeletons}
    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    d_nx = nx.DiGraph()
    d_nx.add_nodes_from(nodes)
    nodes_with_decided_parents = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for parent_set_size in range(max_set_size + 2):
        if verbose > 0: print(f"Trying parent sets of size {parent_set_size}")
        pvalue_dict = dict()
        for i in nodes - nodes_with_decided_parents:
            for cond_i in itertools.combinations(nodes_cond_set - {i}, parent_set_size):
                beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
                beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
                pvalue_i = ncfdtr(n1 - parent_set_size, n2 - parent_set_size, 0, var1_i / var2_i)
                pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)
                pvalue_dict[(i, frozenset(cond_i))] = pvalue_i
        # sort p-value dict
        sorted_pvalue_dict = [
            (pvalue, i, cond_i)
            for (i, cond_i), pvalue in sorted(pvalue_dict.items(), key=op.itemgetter(1), reverse=True)
            if pvalue > alpha
        ]
        while sorted_pvalue_dict:
            _, i, cond_i = sorted_pvalue_dict.pop(0)
            i_children = {j for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}

            # don't use this parent set if it contradicts the existing edges
            if any(j in d_nx.successors(i) for j in cond_i):
                continue
            if any(j in d_nx.predecessors(i) for j in i_children):
                continue

            # don't use this parent set if it creates a cycle
            if any(j in nx.descendants(d_nx, i) for j in cond_i):
                continue
            if any(j in nx.ancestors(d_nx, i) for j in i_children):
                continue

            edges = {(j, i) for j in cond_i if frozenset({i, j}) in skeleton} | \
                    {(i, j) for j in nodes - cond_i - {i} if frozenset({i, j}) in skeleton}
            nodes_with_decided_parents.add(i)
            if verbose > 0: print(f"Adding {edges}")
            d_nx.add_edges_from(edges)

    # orient edges via graph traversal
    oriented_edges = set(d_nx.edges)
    unoriented_edges_before_traversal = skeleton - {frozenset({j, i}) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        chain_path = list(nx.all_simple_paths(g, source=i, target=j))
        if len(chain_path) > 0:
            oriented_edges.add((i, j))
            unoriented_edges.remove(frozenset({i, j}))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        else:
            chain_path = list(nx.all_simple_paths(g, source=j, target=i))
            if len(chain_path) > 0:
                oriented_edges.add((j, i))
                unoriented_edges.remove(frozenset({i, j}))
                if verbose > 0:
                    print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + edges2adjacency(num_nodes,
                                                                                                       oriented_edges,
                                                                                                       undirected=False)
    return adjacency_matrix
Example 16
def _nonadjusted(df1, df2, fcrit, noncen):
    """CDF function (no approximation)"""
    prob = special.ncfdtr(df1, df2, noncen, fcrit)
    fmethod = Constants.FMETHOD_NOAPPROXIMATION
    return prob, fmethod
def dci_orient(
        X1,
        X2,
        skeleton: set,
        nodes_cond_set: set,
        rh1: RegressionHelper = None,
        rh2: RegressionHelper = None,
        alpha: float = 0.1,
        max_set_size: int = 3,
        verbose: int = 0
):
    """
    Orients edges in the skeleton of the difference DAG.

    Parameters
    ----------
    X1: array, shape = [n_samples, n_features]
        First dataset.    
    X2: array, shape = [n_samples, n_features]
        Second dataset.
    skeleton: set
        Set of edges in the skeleton of the difference-DAG.
    nodes_cond_set: set
        Nodes to be considered as conditioning sets.
    rh1: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the first dataset, stored in RegressionHelper class.
    rh2: RegressionHelper, default = None
        Sufficient statistics estimated based on samples in the second dataset, stored in RegressionHelper class.
    alpha: float, default = 0.1
        Significance level parameter for determining orientation of an edge.
        Lower alpha results in more directed edges in the difference-DAG.
    max_set_size: int, default = 3
        Maximum conditioning set size used to test regression invariance.
        A smaller maximum conditioning set size results in faster computation; for large datasets the recommended max_set_size is 3.
    verbose: int, default = 0
        The verbosity level of logging messages.

    See Also
    --------
    dci, dci_undirected_graph, dci_skeleton

    Returns
    -------
    adjacency_matrix: array, shape = [n_features, n_features]
        Adjacency matrix of the difference-DAG, with oriented edges stored as
        directed entries and edges whose direction could not be determined
        stored as undirected (symmetric) entries.
    """

    if verbose > 0:
        print("DCI edge orientation...")

    assert 0 <= alpha <= 1, "alpha must be in [0,1] range."

    if rh1 is None or rh2 is None:
        # obtain sufficient statistics
        suffstat1 = gauss_ci_suffstat(X1)
        suffstat2 = gauss_ci_suffstat(X2)
        rh1 = RegressionHelper(suffstat1)
        rh2 = RegressionHelper(suffstat2)

    nodes = {i for i, j in skeleton} | {j for i, j in skeleton}
    oriented_edges = set()

    n1 = rh1.suffstat['n']
    n2 = rh2.suffstat['n']
    for i, j in skeleton:
        for cond_i, cond_j in zip(powerset(nodes_cond_set - {i}, r_max=max_set_size),
                                  powerset(nodes_cond_set - {j}, r_max=max_set_size)):
            # compute residual variances for i
            beta1_i, var1_i, _ = rh1.regression(i, list(cond_i))
            beta2_i, var2_i, _ = rh2.regression(i, list(cond_i))
            # compute p-value for invariance of residual variances for i
            pvalue_i = ncfdtr(n1 - len(cond_i), n2 - len(cond_i), 0, var1_i / var2_i)
            pvalue_i = 2 * min(pvalue_i, 1 - pvalue_i)

            # compute residual variances for j
            beta1_j, var1_j, _ = rh1.regression(j, list(cond_j))
            beta2_j, var2_j, _ = rh2.regression(j, list(cond_j))
            # compute p-value for invariance of residual variances for j
            pvalue_j = ncfdtr(n1 - len(cond_j), n2 - len(cond_j), 0, var1_j / var2_j)
            pvalue_j = 2 * min(pvalue_j, 1 - pvalue_j)

            if (pvalue_i > alpha) or (pvalue_j > alpha):
                # orient the edge according to highest p-value
                if pvalue_i > pvalue_j:
                    edge = (j, i) if j in cond_i else (i, j)
                    pvalue_used = pvalue_i
                else:
                    edge = (i, j) if i in cond_j else (j, i)
                    pvalue_used = pvalue_j
                oriented_edges.add(edge)

                if verbose > 0:
                    print("Oriented (%d, %d) as %s since p-value=%.5f > alpha=%.5f" % (i, j, edge, pvalue_used, alpha))
                break

    # orient edges via graph traversal
    unoriented_edges_before_traversal = skeleton - oriented_edges - {(j, i) for i, j in oriented_edges}
    unoriented_edges = unoriented_edges_before_traversal.copy()
    g = nx.DiGraph()
    for i, j in oriented_edges:
        g.add_edge(i, j)
    g.add_nodes_from(nodes)

    for i, j in unoriented_edges_before_traversal:
        chain_path = list(nx.all_simple_paths(g, source=i, target=j))
        if len(chain_path) > 0:
            oriented_edges.add((i, j))
            unoriented_edges.remove((i, j))
            if verbose > 0:
                print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (i, j)))
        else:
            chain_path = list(nx.all_simple_paths(g, source=j, target=i))
            if len(chain_path) > 0:
                oriented_edges.add((j, i))
                unoriented_edges.remove((i, j))
                if verbose > 0:
                    print("Oriented (%d, %d) as %s with graph traversal" % (i, j, (j, i)))

    # form an adjacency matrix containing directed and undirected edges
    num_nodes = X1.shape[1]
    adjacency_matrix = edges2adjacency(num_nodes, unoriented_edges, undirected=True) + edges2adjacency(num_nodes,
                                                                                                       oriented_edges,
                                                                                                       undirected=False)
    return adjacency_matrix
Example 18
def my_func(v, power):
    # u (numerator df), f2 (effect size) and sig_level are free variables
    # from the enclosing scope; the root in v is the denominator df at
    # which the F test attains the target power
    return 1 - special.ncfdtr(u, v, f2 * (u + v + 1),
                              f.ppf(1 - sig_level, u, v)) - power
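A function of this shape is typically handed to a scalar root-finder. A self-contained sketch, restated at module scope so the free variables u, f2 and sig_level are explicit; all values are made up:

# Solve my_func(v, 0.8) = 0 for v with brentq: the denominator df at which
# an F test with numerator df u and Cohen's f2 effect size reaches 80% power.
from scipy import special
from scipy.optimize import brentq
from scipy.stats import f

u, f2, sig_level = 3, 0.15, 0.05   # assumed values for illustration

def my_func(v, power):
    return 1 - special.ncfdtr(u, v, f2 * (u + v + 1),
                              f.ppf(1 - sig_level, u, v)) - power

v_star = brentq(my_func, 2, 1000, args=(0.8,))
print(f"denominator df for 80% power: {v_star:.1f}")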