Exemple #1
0
def _boot_func(x, y, func, func_args, paired, random_state):
    """
    Evaluate `func` on a single bootstrap resample; worker for the parallel loop in boot_func.

    Args:
        x (list-like): values for first group
        y (list-like or None): values for second group; left as None when not provided
        func (callable): statistic forwarded to boot_func
        func_args (dict): extra keyword arguments forwarded to boot_func
        paired (bool): resample x and y jointly (same indices) or independently
        random_state: seed (or generator) passed through `_check_random_state`

    Returns:
        The result of `boot_func(..., n_boot=0)` on the resampled data, i.e. the bare statistic.

    """
    # NOTE(review): `_check_random_state` is defined outside this block; it is
    # assumed to return a numpy RandomState-like object exposing `.choice`.
    random_state = _check_random_state(random_state)

    # Plain Python sequences have no `.size` and no fancy indexing.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    if paired:
        # Draw the indices from the seeded generator (not the global np.random)
        # so that paired bootstraps are reproducible for a given seed.
        idx = random_state.choice(np.arange(len(x)), size=x.size, replace=True)
        x = x[idx]
        if y is not None:
            y = y[idx]
    else:
        x = random_state.choice(x, size=x.size, replace=True)
        # y is optional in boot_func; only resample it when it exists.
        if y is not None:
            y = random_state.choice(y, size=y.size, replace=True)
    return boot_func(x, y, func, func_args, paired=paired, n_boot=0)
Exemple #2
0
def _cohens_d(x, y, paired, equal_var, value, random_state):
    """
    Compute Cohen's d on a single bootstrap resample; worker for the parallel loop in cohens_d.

    Args:
        x (list-like): observations from first group
        y (list-like or None): observations from second group; optional
        paired (bool): resample x and y jointly (same indices) or independently
        equal_var (bool): forwarded to cohens_d
        value (float): forwarded to cohens_d
        random_state: seed (or generator) passed through `_check_random_state`

    Returns:
        The effect size returned by `cohens_d(..., n_boot=0)` on the resampled data.

    """
    # NOTE(review): `_check_random_state` is defined outside this block; it is
    # assumed to return a numpy RandomState-like object exposing `.choice`.
    random_state = _check_random_state(random_state)

    # Plain Python sequences have no `.size` and no fancy indexing.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    if paired:
        # Draw the indices from the seeded generator (not the global np.random)
        # so that paired bootstraps are reproducible for a given seed.
        idx = random_state.choice(np.arange(len(x)), size=x.size, replace=True)
        x, y = x[idx], y[idx]
    else:
        x = random_state.choice(x, size=x.size, replace=True)
        if y is not None:
            y = random_state.choice(y, size=y.size, replace=True)

    # Keyword arguments keep this call aligned with cohens_d's signature
    # (x, y, paired, n_boot, equal_var, value, ...). n_boot=0 stops the
    # recursion and returns the bare effect size.
    return cohens_d(
        x, y, paired=paired, n_boot=0, equal_var=equal_var, value=value
    )
Exemple #3
0
def boot_func(x,
              y=None,
              func=None,
              func_args=None,
              paired=False,
              n_boot=500,
              n_jobs=1,
              seed=None):
    """
    Bootstrap an arbitrary function by resampling from x and y independently or jointly.

    Args:
        x (list-like): list of values for first group
        y (list-like): list of values for second group; optional
        func (callable): function called as `func(x, y, **func_args)`
        func_args (dict): extra keyword arguments passed to `func`; default None (no extra arguments)
        paired (bool): whether to resample x and y jointly or independently
        n_boot (int): number of bootstrap iterations; set to 0 to skip bootstrapping
        n_jobs (int): number of parallel cores; default 1
        seed (int or None): seed used to derive one seed per bootstrap iteration

    Returns:
        Multiple:

            - **original_stat** (*float*): function result with given data

            - **ci** (*tuple*): lower and upper bounds of 95% percentile confidence intervals; only when n_boot is non-zero

    Raises:
        TypeError: if `func` is not callable

    """

    if not callable(func):
        raise TypeError("func must be a valid callable function")

    # A `{}` default would be shared between calls; normalise None instead.
    if func_args is None:
        func_args = {}

    orig_result = func(x, y, **func_args)
    if not n_boot:
        return orig_result

    random_state = _check_random_state(seed)
    # One independent seed per iteration so workers do not share RNG state.
    seeds = random_state.randint(MAX_INT, size=n_boot)
    par_for = Parallel(n_jobs=n_jobs, backend="multiprocessing")
    # func_args travels as a single positional dict: _boot_func takes no extra
    # keyword arguments, so unpacking it here would raise a TypeError.
    boots = par_for(
        delayed(_boot_func)(x, y, func, func_args, paired, random_state=boot_seed)
        for boot_seed in seeds)
    ci_u = np.percentile(boots, 97.5, axis=0)
    ci_l = np.percentile(boots, 2.5, axis=0)
    return orig_result, (ci_l, ci_u)
Exemple #4
0
def _perm_test(x, y, stat, equal_var, random_state):
    """
    Compute the statistic on a single permutation of the data; worker for the parallel loop in perm_test.

    The permutation scheme depends on `stat` and on `y`:

        - 'pearsonr' / 'spearmanr': y is shuffled relative to x
        - 'tstat' / 'cohensd' / 'mean' with y=None: signs of x are randomly flipped
        - 'tstat' / 'cohensd' / 'mean' with scalar y: x is centred on y, then signs are randomly flipped
        - 'tstat' / 'cohensd' / 'mean' with array y: group labels are shuffled
        - 'tstat-paired': signs of x are randomly flipped

    Args:
        x (list-like): observations from first group
        y (list-like, scalar or None): observations from second group, or a comparison value
        stat (string): statistic name, forwarded to perm_test
        equal_var (bool): forwarded to perm_test
        random_state: seed (or generator) passed through `_check_random_state`

    Returns:
        The result of `perm_test(..., n_perm=0)` on the permuted data.

    """
    # NOTE(review): `_check_random_state` is defined outside this block; it is
    # assumed to return a numpy RandomState-like object.
    random_state = _check_random_state(random_state)

    # Plain Python sequences have no `.size` used below.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)

    if stat in ["pearsonr", "spearmanr"]:
        y = random_state.permutation(y)
    elif stat in ["tstat", "cohensd", "mean"]:
        if y is None:
            x = x * random_state.choice([1, -1], len(x))
        elif isinstance(y, (float, int)):
            # Build a new array instead of `x -= y`: the in-place form mutates
            # the caller's data (accumulating over permutations when run
            # in-process) and fails for integer arrays with a float y.
            x = (x - y) * random_state.choice([1, -1], len(x))
            if stat == "tstat":
                # x is already centred on y, so the permuted one-sample t-test
                # must compare against 0; reusing y would subtract it twice.
                y = 0
        else:
            shuffled_combined = random_state.permutation(np.hstack([x, y]))
            x, y = shuffled_combined[: x.size], shuffled_combined[x.size :]
    elif (stat == "tstat-paired") or (y is None):
        # NOTE(review): this flips the signs of x only, not of the pairwise
        # differences x - y; confirm this is the intended paired scheme.
        x = x * random_state.choice([1, -1], len(x))

    return perm_test(x, y, stat, equal_var=equal_var, n_perm=0)
Exemple #5
0
def cohens_d(
    x, y=None, paired=False, n_boot=1000, equal_var=False, value=0, n_jobs=1, seed=None
):
    """
    Compute Cohen's d for one or two samples (paired or independent). For paired samples Cohen's Dz is computed (ref: https://bit.ly/2J54P61), i.e. the mean of the pairwise differences divided by the standard deviation of the pairwise differences. Percentile bootstrapped confidence intervals can also be returned.

    Args:
        x (list-like): array or list of observations from first group
        y (list-like): array or list of observations from second group or second set of observations from the same group; optional
        paired (bool): whether to treat x and y as paired or independent; requires y
        n_boot (int): number of bootstrap samples to run; set to 0 to skip computing
        equal_var (bool): selects the pooled standard deviation formula for independent samples (see the note in the code)
        value (float): a value subtracted from the mean (one sample) or mean difference (two samples) before standardizing; default 0
        n_jobs (int): number of parallel cores to use for bootstrapping; default 1
        seed (int or None): numerical seed for reproducibility of bootstrapping

    Returns:
        Multiple:

            - **effect_size** (*float*): cohen's d

            - **ci** (*tuple*): lower and upper bounds of 95% bootstrapped confidence intervals; only when n_boot is non-zero

    Raises:
        ValueError: if paired=True and y is missing or differs in length from x

    """

    # The docs promise list support, but lists have no .mean()/.std()/.var().
    # Only plain sequences are converted so other array-likes keep their own
    # semantics.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    if paired:
        # Cohen's Dz. Checked before the one-sample case so that paired=True
        # without y raises instead of silently computing a one-sample d.
        if (y is None) or (len(x) != len(y)):
            raise ValueError(
                "with paired=True, both and x and y must be provided and must have the same number of observations"
            )
        diff = np.subtract(x, y)
        # Standardize by the SD of the differences. The difference of the two
        # SDs is not a standard deviation and can be zero or negative.
        eff = (diff.mean() - value) / diff.std(ddof=1)
    elif y is None:
        # One-sample d, honouring `value` like the two-sample branches do.
        eff = (x.mean() - value) / x.std(ddof=1)
    else:
        # Cohen's D for independent samples (s1/s2 are variances, ss1/ss2 sizes)
        m1, s1, ss1, m2, s2, ss2 = (
            x.mean(),
            x.var(ddof=1),
            x.size,
            y.mean(),
            y.var(ddof=1),
            y.size,
        )

        # NOTE(review): equal_var=True uses the unweighted average of the two
        # variances, while equal_var=False uses the degrees-of-freedom
        # weighted pooled variance (the classic equal-variance formula). The
        # labelling looks inverted; behaviour is deliberately left unchanged
        # because it alters default results. Confirm intent.
        if equal_var:
            pooled_sd = np.sqrt(np.mean([s1, s2]))
        else:
            pooled_sd = np.sqrt(
                (((ss1 - 1) * s1 + ((ss2 - 1) * s2))) / (ss1 + ss2 - 2)
            )

        eff = (m1 - m2 - value) / pooled_sd

    if not n_boot:
        return eff

    random_state = _check_random_state(seed)
    # One independent seed per iteration so workers do not share RNG state.
    seeds = random_state.randint(MAX_INT, size=n_boot)
    par_for = Parallel(n_jobs=n_jobs, backend="multiprocessing")
    boots = par_for(
        delayed(_cohens_d)(x, y, paired, equal_var, value, random_state=boot_seed)
        for boot_seed in seeds
    )
    ci_u = np.percentile(boots, 97.5, axis=0)
    ci_l = np.percentile(boots, 2.5, axis=0)
    return eff, (ci_l, ci_u)
Exemple #6
0
def perm_test(
    x,
    y=None,
    stat="tstat",
    n_perm=1000,
    equal_var=False,
    tails=2,
    return_dist=False,
    n_jobs=1,
    seed=None,
):
    """
    General purpose permutation test between two samples. Can handle a wide variety of permutation tests including ttest, paired ttest, mean diff test, cohens d, pearson r, spearman r.

    Args:
        x (list-like): array or list of observations from first group
        y (list-like): array or list of observations from second group; for 'tstat' a scalar (or None, treated as 0) gives a one-sample test against that value
        stat (string): one of ['tstat', 'tstat-paired', 'mean', 'cohensd', 'pearsonr', 'spearmanr']; 'mean' will just compute permutations on the difference between the mean of x and mean of y. Useful if statistics are precomputed (e.g. x and y contain correlation values, or t-stats).
        n_perm (int): number of permutations; set to 0 to return parametric results
        equal_var (bool): should assume equal variances for tstat and cohensd
        tails (int): perform one or two-tailed p-value computations; default 2
        return_dist (bool): return permutation distribution
        n_jobs (int): number of parallel cores to use for permutations; default 1
        seed (int): for reproducing results

    Returns:
        Multiple:

            - **original_stat** (*float*): the original statistic

            - **perm_p_val** (*float*): the permuted p-value

            - **perm_dist** (*np.array*): array of permuted statistic; optional

        When n_perm is 0 the raw return value of the underlying statistic function is returned instead.

    Raises:
        ValueError: if `stat` is unknown, if y is missing for a correlation, if x and y differ in length where equal lengths are required, or if `tails` is not 1 or 2
        NotImplementedError: for stat='mean' with a scalar y

    """

    if ((y is None) or isinstance(y, (float, int))) and (
        stat in ["pearsonr", "spearmanr"]
    ):
        raise ValueError("y must be provided for 'pearsonr' or 'spearmanr'")

    # The docs promise list support, but the 'mean' statistic and the
    # permutation worker call ndarray methods on x and y.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    if isinstance(y, list):
        y = np.asarray(y)

    # NOTE(review): ttest_ind, ttest_1samp, ttest_rel, pearsonr and spearmanr
    # are imported outside this block; presumably from scipy.stats, where each
    # returns a (statistic, p-value) pair, which is what multi_return encodes.
    if stat == "tstat":
        if isinstance(y, (list, np.ndarray)):
            func = partial(ttest_ind, equal_var=equal_var)
        else:
            if y is None:
                y = 0
            func = ttest_1samp
        multi_return = True
    elif stat == "tstat-paired":
        func = ttest_rel
        multi_return = True
        if len(x) != len(y):
            raise ValueError("x and y must be the same length")
    elif stat == "mean":

        def func(x, y):
            if y is not None:
                if isinstance(y, (list, np.ndarray)):
                    return x.mean() - y.mean()
                elif isinstance(y, (float, int)):
                    raise NotImplementedError(
                        "One-sample mean test with a scalar y is not currently supported"
                    )
            else:
                return x.mean()

        multi_return = False
    elif stat == "cohensd":
        func = partial(cohens_d, equal_var=equal_var, n_boot=0)
        multi_return = False
    elif stat == "pearsonr":
        func = pearsonr
        multi_return = True
        if len(x) != len(y):
            raise ValueError("x and y must be the same length")
    elif stat == "spearmanr":
        func = spearmanr
        multi_return = True
        if len(x) != len(y):
            raise ValueError("x and y must be the same length")
    else:
        raise ValueError(
            "stat must be in ['tstat', 'tstat-paired', 'mean', 'cohensd', 'pearsonr', 'spearmanr']"
        )

    # Evaluate the statistic once; both return paths below reuse it.
    result = func(x, y)

    # No permutations requested: hand back the parametric result untouched.
    # The permutation worker relies on this path.
    if n_perm == 0:
        return result

    # Reject bad input before the expensive permutation loop, not after it.
    if tails not in (1, 2):
        raise ValueError("tails must be 1 or 2")

    original_stat = result[0] if multi_return else result

    random_state = _check_random_state(seed)
    # One independent seed per permutation so workers do not share RNG state.
    seeds = random_state.randint(MAX_INT, size=n_perm)
    par_for = Parallel(n_jobs=n_jobs, backend="multiprocessing")
    perms = par_for(
        delayed(_perm_test)(x, y, stat, equal_var, random_state=perm_seed)
        for perm_seed in seeds
    )
    if multi_return:
        perms = [elem[0] for elem in perms]
    # An ndarray makes the element-wise comparisons below well defined and
    # matches the documented return type of the permutation distribution.
    perms = np.asarray(perms)

    # The +1 in numerator and denominator counts the observed statistic as one
    # permutation, so the p-value can never be exactly 0.
    denom = float(len(perms)) + 1

    if tails == 2:
        numer = np.sum(np.abs(perms) >= np.abs(original_stat)) + 1
    elif original_stat >= 0:
        numer = np.sum(perms >= original_stat) + 1
    else:
        numer = np.sum(perms <= original_stat) + 1

    p = numer / denom
    if return_dist:
        return original_stat, p, perms
    return original_stat, p