import numpy as np
import pandas as pd
from scipy.stats import norm, pearsonr, spearmanr

# ``mad`` is pingouin's robust median absolute deviation helper;
# the exact import path is assumed here and may vary by version.
from pingouin.utils import mad


def skipped(x, y, method='spearman'):
    """Skipped correlation (Rousselet and Pernet 2012).

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    method : str
        Method used to compute the correlation after outlier removal.
        Can be either 'spearman' (default) or 'pearson'.

    Returns
    -------
    r : float
        Skipped correlation coefficient.
    pval : float
        Two-tailed p-value.
    outliers : array of bool
        Indicates whether each observation is an outlier.

    Notes
    -----
    The skipped correlation involves multivariate outlier detection using
    a projection technique (Wilcox, 2004, 2005). First, a robust estimator
    of multivariate location and scatter, for instance the minimum
    covariance determinant estimator (MCD; Rousseeuw, 1984; Rousseeuw and
    van Driessen, 1999; Hubert et al., 2008), is computed. Second, data
    points are orthogonally projected onto lines joining each data point
    to the location estimator. Third, outliers are detected using a robust
    technique. Finally, Spearman correlations are computed on the
    remaining data points and the calculations are adjusted by taking into
    account the dependency among the remaining data points.

    Code inspired by the Matlab code of Cyril Pernet and Guillaume
    Rousselet [1]_.

    Requires scikit-learn.

    References
    ----------
    .. [1] Pernet CR, Wilcox R, Rousselet GA. Robust Correlation Analyses:
       False Positive and Power Validation Using a New Open Source Matlab
       Toolbox. Frontiers in Psychology. 2012;3:606.
       doi:10.3389/fpsyg.2012.00606.
    """
    # Check that sklearn is installed
    from pingouin.utils import is_sklearn_installed
    is_sklearn_installed(raise_error=True)
    from scipy.stats import chi2
    from sklearn.covariance import MinCovDet

    # Robust location estimate via the minimum covariance determinant
    X = np.column_stack((x, y))
    center = MinCovDet().fit(X).location_

    # Detect outliers based on robust covariance
    nrows, ncols = X.shape
    gval = np.sqrt(chi2.ppf(0.975, 2))

    # Loop over rows: project all points onto the line joining each point
    # to the robust center, then flag large projected distances.
    record = np.zeros(shape=(nrows, nrows))
    for i in np.arange(nrows):
        dis = np.zeros(nrows)
        B = (X[i, :] - center).T
        bot = np.sum(B**2)
        if bot != 0:
            for j in np.arange(nrows):
                A = X[j, :] - center
                dis[j] = np.linalg.norm(A * B / bot * B)
            # Flag points whose projected distance exceeds a robust,
            # MAD-based cutoff
            MAD = mad(dis)
            record[i, :] = dis > (np.median(dis) + gval * MAD)

    # A point is an outlier if it is flagged for at least one projection
    outliers = np.sum(record, axis=0) >= 1

    # Compute correlation on the remaining (non-outlier) data
    if method == 'spearman':
        r, pval = spearmanr(X[~outliers, 0], X[~outliers, 1])
    else:
        r, pval = pearsonr(X[~outliers, 0], X[~outliers, 1])
    return r, pval, outliers
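# A minimal usage sketch for ``skipped`` (hypothetical demo helper, not part
# of pingouin's API): inject one clear bivariate outlier and check that the
# MCD/projection procedure flags it. Assumes the module-level imports above;
# exact values depend on the MCD fit, so nothing is asserted here.
def _demo_skipped():
    np.random.seed(42)
    x = np.random.normal(size=50)
    y = x + np.random.normal(scale=0.5, size=50)
    x[0], y[0] = 5., -5.  # point far from the bulk of the data
    r, pval, outliers = skipped(x, y, method='spearman')
    print('r = %.3f, p = %.3f, n_outliers = %d' % (r, pval, outliers.sum()))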
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : np.array or list
        Predictor(s). Shape = (n_samples, n_features) or (n_samples,).
    y : np.array or list
        Dependent variable. Shape = (n_samples). Must be binary.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        CI = [alpha / 2 ; 1 - alpha / 2]
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    **kwargs : optional
        Optional arguments passed to sklearn.linear_model.LogisticRegression.

    Returns
    -------
    stats : dataframe or dict
        Logistic regression summary::

            'names' : name of variable(s) in the model (e.g. x1, x2...)
            'coef' : regression coefficients
            'se' : standard error
            'z' : z-scores
            'pval' : two-tailed p-values
            'CI[2.5%]' : lower confidence interval
            'CI[97.5%]' : upper confidence interval

    Notes
    -----
    This is a wrapper around the sklearn.linear_model.LogisticRegression
    class. Results have been compared against statsmodels and JASP.

    Note that the first coefficient is always the constant term (intercept)
    of the model.

    Adapted from code found at
    https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

    Examples
    --------
    1. Simple binary logistic regression

    >>> import numpy as np
    >>> from pingouin import logistic_regression
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=30)
    >>> y = np.random.randint(0, 2, size=30)
    >>> lom = logistic_regression(x, y)
    >>> print(lom['coef'].values)
    [-0.27122371  0.05927182]

    2. Multiple binary logistic regression

    >>> np.random.seed(42)
    >>> z = np.random.normal(size=30)
    >>> X = np.column_stack((x, z))
    >>> lom = logistic_regression(X, y)
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    3. Using a Pandas DataFrame

    >>> import pandas as pd
    >>> df = pd.DataFrame({'x': x, 'y': y, 'z': z})
    >>> lom = logistic_regression(df[['x', 'z']], df['y'])
    >>> print(lom['coef'].values)
    [-0.34933805 -0.0226106  -0.39453532]

    4. Return only the coefficients

    >>> logistic_regression(X, y, coef_only=True)
    array([-0.34933805, -0.0226106 , -0.39453532])

    5. Passing custom parameters to sklearn

    >>> lom = logistic_regression(X, y, solver='sag', max_iter=10000)
    >>> print(lom['coef'].values)
    [-0.34941889 -0.02261911 -0.39451064]
    """
    # Check that sklearn is installed
    from pingouin.utils import is_sklearn_installed
    is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a DataFrame or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    assert 0 < alpha < 1

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # Add intercept in names
    names.insert(0, "Intercept")

    # Initialize and fit
    if 'solver' not in kwargs:
        kwargs['solver'] = 'lbfgs'
    if 'multi_class' not in kwargs:
        kwargs['multi_class'] = 'auto'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)
    coef = np.append(lom.intercept_, lom.coef_)
    if coef_only:
        return coef

    # Design matrix -- add intercept
    X_design = np.column_stack((np.ones(X.shape[0]), X))
    n, p = X_design.shape

    # Fisher information matrix X.T @ W @ X, where the logistic variance
    # weights p * (1 - p) equal 1 / (2 * (1 + cosh(decision_function)))
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = np.dot((X_design / denom).T, X_design)
    crao = np.linalg.inv(fim)

    # Standard errors and z-scores from the inverse Fisher information
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = np.array([2 * norm.sf(abs(z)) for z in z_scores])

    # Confidence intervals
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Name the CI columns after the requested alpha level
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return pd.DataFrame.from_dict(stats)
    else:
        return stats
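# Hedged sanity check for ``logistic_regression`` against statsmodels
# (hypothetical demo helper; statsmodels is not a dependency of this module).
# sklearn applies an L2 penalty by default, so a very large ``C`` is passed
# to approximate the unpenalized MLE that statsmodels computes.
def _demo_logistic_regression():
    import statsmodels.api as sm
    np.random.seed(123)
    x = np.random.normal(size=100)
    y = (x + np.random.normal(size=100) > 0).astype(int)
    lom = logistic_regression(x, y, C=1e9)
    print(lom[['names', 'coef', 'se', 'pval']])
    sm_fit = sm.Logit(y, sm.add_constant(x)).fit(disp=0)
    print(sm_fit.params)  # should be close to lom['coef']
    print(sm_fit.bse)     # should be close to lom['se']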
def test_is_sklearn_installed(self):
    """Test function is_sklearn_installed."""
    assert isinstance(is_sklearn_installed(), bool)