def efficient_pearsonr(a, b):
    """
    Computes correlation of matching columns in `a` and `b`

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable

    Returns
    -------
    corr : float or numpy.ndarray
        Pearson's correlation coefficient between matching columns of inputs
    pval : float or numpy.ndarray
        Two-tailed p-values

    Raises
    ------
    ValueError
        If provided arrays do not have the same length

    Examples
    --------
    >>> from netneurotools import datasets, stats

    Generate some not-very-correlated and some highly-correlated data:

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> x1, y1 = datasets.make_correlated_xy(corr=0.1, size=100)
    >>> x2, y2 = datasets.make_correlated_xy(corr=0.8, size=100)

    Calculate both correlations simultaneously:

    >>> x = np.column_stack((x1, x2))
    >>> y = np.column_stack((y1, y2))
    >>> stats.efficient_pearsonr(x, y)
    (array([0.10032565, 0.79961189]), array([3.20636135e-01, 1.97429944e-23]))
    """
    a, b, axis = _chk2_asarray(a, b, 0)
    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    # work on 2D views so matching columns can be correlated in one pass;
    # mismatched column counts are handled by broadcasting
    a, b = a.reshape(len(a), -1), b.reshape(len(b), -1)
    if (a.shape[1] != b.shape[1]):
        a, b = np.broadcast_arrays(a, b)

    with np.errstate(invalid='ignore'):
        corr = sstats.zscore(a, ddof=1) * sstats.zscore(b, ddof=1)
    corr = np.sum(corr, axis=0) / (len(a) - 1)
    # dividing by one forces coercion to float when the result is 0-d
    corr = np.squeeze(np.clip(corr, -1, 1)) / 1

    # two-tailed p-value via the regularized incomplete beta function
    # (adapted from scipy.stats.pearsonr). `special.betainc` replaces the
    # deprecated `special.btdtr` (removed in SciPy 1.14); the two are
    # mathematically identical: btdtr(a, b, x) == betainc(a, b, x)
    ab = (len(a) / 2) - 1
    prob = 2 * special.betainc(ab, ab, 0.5 * (1 - np.abs(corr)))

    return corr, prob
def efficient_pearsonr(a, b, ddof=1, nan_policy='propagate'):
    """
    Computes correlation of matching columns in `a` and `b`

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable
    ddof : int, optional
        Degrees of freedom correction in the calculation of the standard
        deviation. Default: 1
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default: 'propagate'

    Returns
    -------
    corr : float or numpy.ndarray
        Pearson's correlation coefficient between matching columns of inputs
    pval : float or numpy.ndarray
        Two-tailed p-values

    Raises
    ------
    ValueError
        If arrays do not have the same length, if `nan_policy` is not one of
        the recognized values, or if input contains NaN with
        `nan_policy='raise'`

    Notes
    -----
    If either input contains nan and nan_policy is set to 'omit', both arrays
    will be masked to omit the nan entries.

    Examples
    --------
    >>> from netneurotools import datasets, stats

    Generate some not-very-correlated and some highly-correlated data:

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> x1, y1 = datasets.make_correlated_xy(corr=0.1, size=100)
    >>> x2, y2 = datasets.make_correlated_xy(corr=0.8, size=100)

    Calculate both correlations simultaneously:

    >>> stats.efficient_pearsonr(np.c_[x1, x2], np.c_[y1, y2])
    (array([0.10032565, 0.79961189]), array([3.20636135e-01, 1.97429944e-23]))
    """
    a, b, axis = _chk2_asarray(a, b, 0)
    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    if nan_policy not in ('propagate', 'raise', 'omit'):
        raise ValueError(f'Value for nan_policy "{nan_policy}" not allowed')

    a, b = a.reshape(len(a), -1), b.reshape(len(b), -1)
    if (a.shape[1] != b.shape[1]):
        a, b = np.broadcast_arrays(a, b)

    mask = np.logical_or(np.isnan(a), np.isnan(b))
    if nan_policy == 'raise' and np.any(mask):
        # fixed: message previously said 'omit' even though this branch only
        # fires when nan_policy is 'raise'
        raise ValueError('Input cannot contain NaN when nan_policy is '
                         '"raise"')
    elif nan_policy == 'omit':
        # avoid making copies of the data, if possible
        a = np.ma.masked_array(a, mask, copy=False, fill_value=np.nan)
        b = np.ma.masked_array(b, mask, copy=False, fill_value=np.nan)

    with np.errstate(invalid='ignore'):
        corr = (sstats.zscore(a, ddof=ddof, nan_policy=nan_policy)
                * sstats.zscore(b, ddof=ddof, nan_policy=nan_policy))

    sumfunc, n_obs = np.sum, len(a)
    if nan_policy == 'omit':
        # per-column valid-observation counts, since masked entries no
        # longer contribute to the sums
        corr = corr.filled(np.nan)
        sumfunc = np.nansum
        n_obs = np.squeeze(np.sum(np.logical_not(np.isnan(corr)), axis=0))

    corr = sumfunc(corr, axis=0) / (n_obs - 1)
    # dividing by one forces coercion to float when the result is 0-d
    corr = np.squeeze(np.clip(corr, -1, 1)) / 1

    # two-tailed p-value via the regularized incomplete beta function
    # (adapted from scipy.stats.pearsonr). `special.betainc` replaces the
    # deprecated `special.btdtr` (removed in SciPy 1.14); the two are
    # mathematically identical: btdtr(a, b, x) == betainc(a, b, x)
    ab = (n_obs / 2) - 1
    prob = 2 * special.betainc(ab, ab, 0.5 * (1 - np.abs(corr)))

    return corr, prob
def permtest_pearsonr(a, b, axis=0, n_perm=1000, resamples=None, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.pearsonr`

    Generates two-tailed p-value for hypothesis of whether samples `a` and
    `b` are correlated using permutation tests

    Parameters
    ----------
    a,b : (N[, M]) array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable
    axis : int or None, optional
        Axis along which to compute test. If None, compute over whole arrays
        of `a` and `b`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        along `axis` this will approximate a randomization test via Monte
        Carlo simulations. Default: 1000
    resamples : (N, P) array_like, optional
        Resampling array used to shuffle `a` when generating the null
        distribution of correlations. Must have the same length as `a` and
        `b` and at least `n_perm` columns (only the first `n_perm` columns
        are used). When not specified a standard permutation shuffles `a`.
        Default: None
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    corr : float or numpyndarray
        Correlations
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> from netneurotools import datasets, stats

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> x, y = datasets.make_correlated_xy(corr=0.1, size=100)
    >>> stats.permtest_pearsonr(x, y)  # doctest: +SKIP
    (0.10032564626876286, 0.3046953046953047)

    >>> x, y = datasets.make_correlated_xy(corr=0.5, size=100)
    >>> stats.permtest_pearsonr(x, y)  # doctest: +SKIP
    (0.500040365781984, 0.000999000999000999)

    Also works with multiple columns by either broadcasting the smaller array
    to the larger:

    >>> z = x + np.random.normal(loc=1, size=100)
    >>> stats.permtest_pearsonr(x, np.column_stack([y, z]))
    (array([0.50004037, 0.25843187]), array([0.000999  , 0.01098901]))

    or by using matching columns in the two arrays (e.g., `x` and `y` vs
    `a` and `b`):

    >>> a, b = datasets.make_correlated_xy(corr=0.9, size=100)
    >>> stats.permtest_pearsonr(np.column_stack([x, a]), np.column_stack([y, b]))
    (array([0.50004037, 0.89927523]), array([0.000999, 0.000999]))
    """  # noqa
    a, b, axis = _chk2_asarray(a, b, axis)
    rs = check_random_state(seed)

    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    if resamples is not None and n_perm > resamples.shape[-1]:
        raise ValueError('Number of permutations requested exceeds size '
                         'of resampling array.')

    # dividing by one forces coercion to float when the result is 0-d
    true_corr = efficient_pearsonr(a, b)[0] / 1
    abs_true = np.abs(true_corr)

    # counter starts at one so the observed correlation counts as one of the
    # "permutations" (this is what bounds the p-value at 1 / (n_perm + 1))
    exceed_count = np.ones(true_corr.shape)
    for idx in range(n_perm):
        # shuffle `a` (via a fresh permutation or the supplied resampling
        # array) and tally null correlations at least as extreme as observed
        if resamples is None:
            shuffled = a[rs.permutation(len(a))]
        else:
            shuffled = a[resamples[:, idx]]
        exceed_count += np.abs(efficient_pearsonr(shuffled, b)[0]) >= abs_true

    return true_corr, exceed_count / (n_perm + 1)
def permtest_rel(a, b, axis=0, n_perm=1000, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.ttest_rel`

    Generates two-tailed p-value for hypothesis of whether related samples
    `a` and `b` differ using permutation tests

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over whole arrays
        of `a` and `b`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        along `axis` this will approximate a randomization test via Monte
        Carlo simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    stat : float or numpy.ndarray
        Average difference between `a` and `b`
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> from netneurotools import stats

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> rvs1 = np.random.normal(loc=5, scale=10, size=500)
    >>> rvs2 = (np.random.normal(loc=5, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs2)  # doctest: +SKIP
    (-0.16506275161572695, 0.8021978021978022)

    >>> rvs3 = (np.random.normal(loc=8, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs3)  # doctest: +SKIP
    (2.40533726097883, 0.000999000999000999)
    """
    a, b, axis = _chk2_asarray(a, b, axis)
    rs = check_random_state(seed)

    if a.shape[axis] != b.shape[axis]:
        raise ValueError('Provided arrays do not have same length along axis')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    # calculate original difference in means; `a` and `b` are stacked along
    # a new leading "pair" axis so permutations can swap matched samples by
    # re-indexing that axis
    ab = np.stack([a, b], axis=0)
    if ab.ndim < 3:
        # ensure a trailing axis exists so the `swap` broadcast below works
        ab = np.expand_dims(ab, axis=-1)
    # dividing by one forces coercion to float when the result is 0-d
    true_diff = np.squeeze(np.diff(ab, axis=0)).mean(axis=axis) / 1
    abs_true = np.abs(true_diff)

    # idx array: full coordinate grids for fancy-indexing `ab`; only the
    # first (pair-axis) coordinates are replaced inside the loop
    reidx = np.meshgrid(*[range(f) for f in ab.shape], indexing='ij')

    # counter starts at one so the observed difference counts as one of the
    # "permutations" (this bounds the p-value at 1 / (n_perm + 1))
    permutations = np.ones(true_diff.shape)
    for perm in range(n_perm):
        # use this to re-index (i.e., swap along) the first axis of `ab`;
        # argsort of random draws yields an unbiased random swap pattern
        # NOTE(review): argsort uses `axis`, which coincides with the pair
        # axis only when axis == 0 — confirm intended behavior for axis != 0
        swap = rs.random_sample(ab.shape[:-1]).argsort(axis=axis)
        reidx[0] = np.repeat(swap[..., np.newaxis], ab.shape[-1], axis=-1)
        # recompute difference between `a` and `b` (i.e., first axis of `ab`)
        pdiff = np.squeeze(np.diff(ab[tuple(reidx)], axis=0)).mean(axis=axis)
        permutations += np.abs(pdiff) >= abs_true

    pvals = permutations / (n_perm + 1)  # + 1 in denom accounts for true_diff

    return true_diff, pvals
def permtest_1samp(a, popmean, axis=0, n_perm=1000, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.ttest_1samp`

    Generates two-tailed p-value for hypothesis of whether `a` differs from
    `popmean` using permutation tests

    Parameters
    ----------
    a : array_like
        Sample observations
    popmean : float or array_like
        Expected valued in null hypothesis. If array_like then it must have
        the same shape as `a` excluding the `axis` dimension
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array of `a`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` is very small along
        `axis` this will approximate a randomization test via Monte Carlo
        simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    stat : float or numpy.ndarray
        Difference from `popmean`
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    Providing multiple values to `popmean` to run *independent* tests in
    parallel is not currently supported.

    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> from netneurotools import stats

    >>> np.random.seed(7654567)  # set random seed for reproducible results
    >>> rvs = np.random.normal(loc=5, scale=10, size=(50, 2))

    Test if mean of random sample is equal to true mean, and different mean.
    We reject the null hypothesis in the second case and don't reject it in
    the first case.

    >>> stats.permtest_1samp(rvs, 5.0)
    (array([-0.985602  , -0.05204969]), array([0.48551449, 0.95904096]))
    >>> stats.permtest_1samp(rvs, 0.0)
    (array([4.014398  , 4.94795031]), array([0.00699301, 0.000999  ]))

    Example using axis and non-scalar dimension for population mean

    >>> stats.permtest_1samp(rvs, [5.0, 0.0])
    (array([-0.985602  ,  4.94795031]), array([0.48551449, 0.000999  ]))
    >>> stats.permtest_1samp(rvs.T, [5.0, 0.0], axis=1)
    (array([-0.985602  ,  4.94795031]), array([0.51548452, 0.000999  ]))
    """
    a, popmean, axis = _chk2_asarray(a, popmean, axis)
    rs = check_random_state(seed)

    if a.size == 0:
        return np.nan, np.nan

    # ensure popmean will broadcast to `a` correctly
    if popmean.ndim != a.ndim:
        popmean = np.expand_dims(popmean, axis=axis)

    # under the null, `a` is symmetric around `popmean`, so center the data
    # and compare the observed mean against sign-flipped null means
    # (dividing by one forces coercion to float when the result is 0-d)
    centered = a - popmean
    true_mean = centered.mean(axis=axis) / 1
    abs_mean = np.abs(true_mean)

    # counter starts at one so the observed mean counts as one of the
    # "permutations" (this bounds the p-value at 1 / (n_perm + 1));
    # the explicit loop is slower than broadcasting n_perm sign-flips at
    # once, but avoids holding centered.size * n_perm values in memory
    exceed_count = np.ones(true_mean.shape)
    for _ in range(n_perm):
        signs = rs.choice([-1, 1], size=centered.shape)  # random sign flip
        exceed_count += np.abs((centered * signs).mean(axis=axis)) >= abs_mean

    return true_mean, exceed_count / (n_perm + 1)
def permtest_rel(a, b, axis=0, n_perm=1000, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.ttest_rel`

    Generates two-tailed p-value for hypothesis of whether related samples
    `a` and `b` differ

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over whole arrays
        of `a` and `b`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        along `axis` this will approximate a randomization test via Monte
        Carlo simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Examples
    --------
    >>> from netneurotools import stats

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> rvs1 = np.random.normal(loc=5, scale=10, size=500)
    >>> rvs2 = (np.random.normal(loc=5, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs2)
    0.8021978021978022

    >>> rvs3 = (np.random.normal(loc=8, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs3)
    0.000999000999000999
    """
    # NOTE(review): this returns only a p-value, unlike the other
    # permtest_rel definition in this file which returns (stat, pvalue) —
    # confirm which definition is meant to be kept
    a, b, axis = _chk2_asarray(a, b, axis)
    rs = check_random_state(seed)

    # calculate original difference in means; `a` and `b` are stacked along
    # a new leading "pair" axis so permutations can swap matched samples by
    # re-indexing that axis
    ab = np.stack([a, b], axis=0)
    true_diff = np.diff(ab, axis=0).squeeze().mean(axis=axis)

    # array to hold counts; use 1s instead of 0s to account for original value
    # (this bounds the lowest attainable p-value at 1 / (n_perm + 1))
    permutations = np.ones(np.delete(a.shape, axis)) if axis is not None else 1

    # idx array: full coordinate grids for fancy-indexing `ab`; only the
    # first (pair-axis) coordinates are replaced inside the loop
    reidx = np.meshgrid(*[range(f) for f in ab.shape], indexing='ij')

    for perm in range(n_perm):
        # swap matched samples between `a` and `b` randomly and recompute diff
        # (argsort along axis 0 of random draws gives an unbiased, per-entry
        # random swap of the pair axis)
        reidx[0] = rs.random_sample(ab.shape).argsort(axis=0)
        perm_diff = np.diff(ab[tuple(reidx)], axis=0).squeeze().mean(axis=axis)
        permutations += np.abs(perm_diff) >= np.abs(true_diff)

    return permutations / (n_perm + 1)  # + 1 in denom accounts for true_diff