# Assumed module-level imports for the versions below (paths follow
# pingouin's layout; adjust if your local layout differs).
import numpy as np
import pandas as pd
from itertools import combinations, product

from pingouin.multicomp import multicomp
from pingouin.utils import _export_table


def pairwise_corr(data, columns=None, tail='two-sided', method='pearson',
                  padjust='none', export_filename=None):
    '''Pairwise correlations between columns of a pandas dataframe.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame.
    columns : list or str
        Column names in data ::

        '["a", "b", "c"]' : combination between columns a, b, and c
        '["a"]' : product between a and all the other numeric columns
        '[["a"], ["b", "c"]]' : product between ["a"] and ["b", "c"]
        '[["a", "d"], ["b", "c"]]' : product between ["a", "d"] and
            ["b", "c"]
        '[["a", "d"], None]' : product between ["a", "d"] and all other
            columns

        Note that if columns is not specified, the function will return the
        pairwise correlation between the combinations of all the numeric
        columns in data. See the examples section for more details on this.
    tail : string
        Indicates whether to return the 'two-sided' or 'one-sided' p-values.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall's tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
    padjust : string
        Method used for testing and adjustment of p-values.
        Available methods are ::

        'none' : no correction
        'bonf' : one-step Bonferroni correction
        'holm' : step-down method using Bonferroni adjustments
        'fdr_bh' : Benjamini/Hochberg FDR correction
        'fdr_by' : Benjamini/Yekutieli FDR correction
    export_filename : string
        Filename (without extension) for the output file.
        If None, do not export the table.
        By default, the file will be created in the current python console
        directory. To change that, specify the filename with full path.

    Returns
    -------
    stats : DataFrame
        Stats summary ::

        'X' : Name(s) of first columns
        'Y' : Name(s) of second columns
        'method' : method used to compute the correlation
        'tail' : indicates whether the p-values are one-sided or two-sided
        'n' : Sample size (after NaN removal)
        'r' : Correlation coefficients
        'CI95%' : 95% parametric confidence intervals
        'r2' : R-squared values
        'adj_r2' : Adjusted R-squared values
        'z' : Standardized correlation coefficients
        'p-unc' : uncorrected one or two tailed p-values
        'p-corr' : corrected one or two tailed p-values
        'p-adjust' : Correction method
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error)

    Notes
    -----
    Please refer to the `pingouin.corr()` function for a description of the
    different methods. NaN values are automatically removed from the data.

    This function is more flexible and gives a much more detailed output
    than the `pandas.DataFrame.corr()` method (i.e. p-values, confidence
    intervals, Bayes Factor...). However, this comes at an increased
    computational cost. While this should not be discernible for dataframes
    with fewer than 10,000 rows and/or 20 columns, this function can be
    slow for very large datasets.

    For speed purposes, the Bayes Factor is only computed when the sample
    size is less than 1000 (and method='pearson').

    Examples
    --------
    1. Two-sided spearman correlation corrected for multiple comparisons

        >>> from pingouin.datasets import read_dataset
        >>> from pingouin import pairwise_corr
        >>> data = read_dataset('pairwise_corr').iloc[:, 1:]
        >>> stats = pairwise_corr(data, method='spearman', tail='two-sided',
        ...                       padjust='bonf')
        >>> stats

    2. Robust two-sided correlation with uncorrected p-values

        >>> pairwise_corr(data, columns=['Openness', 'Extraversion',
        ...                              'Neuroticism'], method='percbend')

    3. Export the results to a .csv file

        >>> pairwise_corr(data, export_filename='pairwise_corr')

    4. One-versus-others pairwise correlations

        >>> pairwise_corr(data, columns=['Neuroticism'])

    5. Pairwise correlations between two lists of columns (cartesian
       product)

        >>> pairwise_corr(data, columns=[['Neuroticism', 'Extraversion'],
        ...                              ['Openness', 'Agreeableness']])
    '''
    from pingouin.correlation import corr

    if tail not in ['one-sided', 'two-sided']:
        raise ValueError('Tail not recognized')

    # Keep only numeric columns
    data = data._get_numeric_data()
    keys = data.keys().tolist()

    # Initialize empty DataFrame
    stats = pd.DataFrame()

    # First ensure that columns is a list
    if isinstance(columns, str):
        columns = [columns]

    # Then define combinations / products between columns
    if columns is None:
        # Case A: column is not defined --> corr between all numeric columns
        combs = list(combinations(keys, 2))
    else:
        # Case B: column is specified
        if isinstance(columns[0], list):
            group1 = [e for e in columns[0] if e in keys]
            # Assert that column is two-dimensional
            if len(columns) == 1:
                columns.append(None)
            if isinstance(columns[1], list) and len(columns[1]):
                # B1: [['a', 'b'], ['c', 'd']]
                group2 = [e for e in columns[1] if e in keys]
            else:
                # B2: [['a', 'b']], [['a', 'b'], None] or [['a', 'b'], 'all']
                group2 = [e for e in keys if e not in group1]
            combs = list(product(group1, group2))
        else:
            # Column is a simple list
            if len(columns) == 1:
                # Case B3: one-versus-all, e.g. ['a'] or 'a'
                others = [e for e in keys if e != columns[0]]
                combs = list(product(columns, others))
            else:
                # Combinations between all specified columns ['a', 'b', 'c']
                # Make sure that we keep numeric columns
                columns = np.intersect1d(keys, columns)
                if len(columns) == 1:
                    # If only one column is left, equivalent to ['a']
                    others = [e for e in keys if e != columns[0]]
                    combs = list(product(columns, others))
                else:
                    # Combinations between ['a', 'b', 'c']
                    combs = list(combinations(columns, 2))

    # Assert that all columns do exist in the DataFrame.
    # If you see this error, check for column name errors in `columns=[]`.
    for comb in combs:
        assert comb[0] in keys
        assert comb[1] in keys

    # Compute the correlation for each combination of columns.
    # NOTE: DataFrame.append was deprecated and later removed in pandas 2.0;
    # this historical version targets older pandas.
    for comb in combs:
        col1, col2 = comb
        # Avoid errors when one of the two columns has only one unique value
        if data[col1].unique().size == 1 or data[col2].unique().size == 1:
            continue
        cor_st = corr(data[col1].values, data[col2].values, tail=tail,
                      method=method).reset_index(drop=True)
        stats = stats.append({
            'X': col1,
            'Y': col2,
            'method': method,
            'tail': tail,
            'n': cor_st['n'][0],
            'r': cor_st['r'][0],
            'CI95%': cor_st['CI95%'][0],
            'r2': cor_st['r2'][0],
            'adj_r2': cor_st['adj_r2'][0],
            'p-unc': cor_st['p-val'][0],
            'BF10': cor_st['BF10'][0] if 'BF10' in cor_st.keys() else np.nan,
            'power': cor_st['power'][0]
        }, ignore_index=True)

    # Multiple comparisons
    padjust = None if stats['p-unc'].size <= 1 else padjust
    if padjust is not None:
        if padjust.lower() != 'none':
            reject, stats['p-corr'] = multicomp(stats['p-unc'].values,
                                                method=padjust)
            stats['p-adjust'] = padjust
    else:
        stats['p-corr'] = None
        stats['p-adjust'] = None

    # Standardize correlation coefficients (Fisher z-transformation)
    stats['z'] = np.arctanh(stats['r'].values)

    # Round values
    for c in ['r', 'r2', 'adj_r2', 'z']:
        stats[c] = stats[c].round(3)

    col_order = ['X', 'Y', 'method', 'tail', 'n', 'r', 'CI95%', 'r2',
                 'adj_r2', 'z', 'p-unc', 'p-corr', 'p-adjust', 'BF10',
                 'power']

    # Convert n to int
    stats['n'] = stats['n'].astype(int)
    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)
    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
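# The 'z' column above is the Fisher z-transformation of r, i.e.
# z = arctanh(r) = 0.5 * ln((1 + r) / (1 - r)), which makes correlation
# coefficients approximately normally distributed and hence comparable
# across samples. A quick numerical check of the identity (a minimal
# sketch, independent of pingouin):
import numpy as np

r = 0.6
assert np.isclose(np.arctanh(r), 0.5 * np.log((1 + r) / (1 - r)))  # ~0.6931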
# Assumed imports for the test versions below (per pingouin's test-suite
# layout; `self` implies these methods live on a unittest.TestCase
# subclass).
import numpy as np
import pytest

from pingouin import read_dataset
from pingouin.correlation import bicor, corr, skipped


def test_corr(self):
    """Test function corr"""
    np.random.seed(123)
    mean, cov = [4, 6], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    x[3], y[5] = 12, -8
    corr(x, y, method='pearson', tail='one-sided')
    corr(x, y, method='spearman', tail='two-sided')
    corr(x, y, method='kendall')
    corr(x, y, method='shepherd', tail='two-sided')
    # Compare with robust corr toolbox
    stats = corr(x, y, method='skipped')
    assert np.round(stats['r'].to_numpy(), 3) == 0.512
    assert stats['outliers'].to_numpy() == 2
    stats = corr(x, y, method='shepherd')
    assert stats['outliers'].to_numpy() == 2
    _, _, outliers = skipped(x, y, method='pearson')
    assert outliers.size == x.size
    assert stats['n'].to_numpy() == 30
    stats = corr(x, y, method='percbend')
    assert np.round(stats['r'].to_numpy(), 3) == 0.484
    # Compare biweight correlation to astropy
    stats = corr(x, y, method='bicor')
    assert np.isclose(stats['r'].to_numpy(), 0.4951417784979)
    # Not normally distributed
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')
    # With NaN values
    x[3] = np.nan
    corr(x, y)
    # With the same array
    # Disabled because of AppVeyor failure
    # assert corr(x, x).loc['pearson', 'BF10'] == str(np.inf)
    # Wrong argument
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    # Compare BF10 with JASP
    df = read_dataset('pairwise_corr')
    stats = corr(df['Neuroticism'], df['Extraversion'])
    assert np.isclose(1 / float(stats['BF10'].to_numpy()), 1.478e-13)
    # When one column is a constant, the correlation is not defined
    # and Pingouin returns a DataFrame full of NaN, except for ``n``
    x, y = [1, 1, 1], [1, 2, 3]
    stats = corr(x, y)
    assert stats.at['pearson', 'n']
    assert np.isnan(stats.at['pearson', 'r'])
    # Biweight midcorrelation returns NaN when the MAD is not defined
    assert np.isnan(bicor(np.array([1, 1, 1, 1, 0, 1]), np.arange(6))[0])
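# Why the last assertion above holds: the biweight midcorrelation
# standardizes each variable by its median absolute deviation (MAD), and
# for [1, 1, 1, 1, 0, 1] the median is 1, so the absolute deviations are
# [0, 0, 0, 0, 1, 0] and their median (the MAD) is 0. The weights are
# then undefined and bicor returns NaN. A minimal check of that MAD:
import numpy as np

a = np.array([1, 1, 1, 1, 0, 1])
mad = np.median(np.abs(a - np.median(a)))
assert mad == 0.0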
def test_corr(self):
    """Test function corr"""
    np.random.seed(123)
    mean, cov = [4, 6], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    x[3], y[5] = 12, -8
    corr(x, y, method='pearson', tail='one-sided')
    corr(x, y, method='spearman', tail='two-sided')
    corr(x, y, method='kendall')
    corr(x, y, method='shepherd', tail='two-sided')
    # Compare with robust corr toolbox
    stats = corr(x, y, method='skipped')
    assert np.round(stats['r'].values, 3) == 0.512
    assert stats['outliers'].values == 2
    stats = corr(x, y, method='shepherd')
    assert stats['outliers'].values == 2
    _, _, outliers = skipped(x, y, method='pearson')
    assert outliers.size == x.size
    assert stats['n'].values == 30
    stats = corr(x, y, method='percbend')
    assert np.round(stats['r'].values, 3) == 0.484
    # Not normally distributed
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')
    # With NaN values
    x[3] = np.nan
    corr(x, y)
    # Wrong arguments
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    with pytest.raises(ValueError):
        corr(x, y[:-10])
    # Compare with JASP
    df = read_dataset('pairwise_corr')
    stats = corr(df['Neuroticism'], df['Extraversion'])
    assert np.isclose(1 / float(stats['BF10'].values), 1.478e-13)
    # With more than 1000 values, check that BF10 is NOT computed
    xx, yy = np.random.multivariate_normal(mean, cov, 1500).T
    c1500 = corr(xx, yy)
    assert 'BF10' not in c1500.keys()
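# The last assertion above matches the speed note in the pairwise_corr
# docstring: in this version, the Bayes Factor is only computed for
# Pearson correlations with fewer than 1000 observations. A sketch of
# the same check on synthetic data (this assumes the behaviour of the
# version tested above; later releases may compute BF10 regardless of
# sample size):
import numpy as np
from pingouin import corr

np.random.seed(42)
small = corr(np.random.randn(50), np.random.randn(50))
large = corr(np.random.randn(1500), np.random.randn(1500))
assert 'BF10' in small.columns and 'BF10' not in large.columns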
def pairwise_corr(data, columns=None, covar=None, tail='two-sided',
                  method='pearson', padjust='none', export_filename=None):
    '''Pairwise (partial) correlations between columns of a pandas
    dataframe.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    columns : list or str
        Column names in data ::

        '["a", "b", "c"]' : combination between columns a, b, and c
        '["a"]' : product between a and all the other numeric columns
        '[["a"], ["b", "c"]]' : product between ["a"] and ["b", "c"]
        '[["a", "d"], ["b", "c"]]' : product between ["a", "d"] and
            ["b", "c"]
        '[["a", "d"], None]' : product between ["a", "d"] and all other
            columns

        Note that if columns is not specified, the function will return the
        pairwise correlation between the combinations of all the numeric
        columns in data. See the examples section for more details on this.
    covar : None, string or list
        Covariate(s) for partial correlation. Must be one or more columns
        in data. Use a list if there is more than one covariate. If
        ``covar`` is not None, a partial correlation will be computed using
        the :py:func:`pingouin.partial_corr` function.
    tail : string
        Indicates whether to return the 'two-sided' or 'one-sided' p-values.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall's tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
    padjust : string
        Method used for testing and adjustment of p-values.
        Available methods are ::

        'none' : no correction
        'bonf' : one-step Bonferroni correction
        'holm' : step-down method using Bonferroni adjustments
        'fdr_bh' : Benjamini/Hochberg FDR correction
        'fdr_by' : Benjamini/Yekutieli FDR correction
    export_filename : string
        Filename (without extension) for the output file.
        If None, do not export the table.
        By default, the file will be created in the current python console
        directory. To change that, specify the filename with full path.

    Returns
    -------
    stats : DataFrame
        Stats summary ::

        'X' : Name(s) of first columns
        'Y' : Name(s) of second columns
        'method' : method used to compute the correlation
        'covar' : list of specified covariate(s) (only for partial
            correlation)
        'tail' : indicates whether the p-values are one-sided or two-sided
        'n' : Sample size (after NaN removal)
        'r' : Correlation coefficients
        'CI95%' : 95% parametric confidence intervals
        'r2' : R-squared values
        'adj_r2' : Adjusted R-squared values
        'z' : Standardized correlation coefficients
        'p-unc' : uncorrected one or two tailed p-values
        'p-corr' : corrected one or two tailed p-values
        'p-adjust' : Correction method
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error)

    Notes
    -----
    Please refer to the :py:func:`pingouin.corr` function for a description
    of the different methods. NaN values are automatically removed from the
    data.

    This function is more flexible and gives a much more detailed output
    than the :py:func:`pandas.DataFrame.corr` method (i.e. p-values,
    confidence intervals, Bayes Factor...). However, this comes at an
    increased computational cost. While this should not be discernible for
    dataframes with fewer than 10,000 rows and/or 20 columns, this function
    can be slow for very large datasets.

    For speed purposes, the Bayes Factor is only computed when the sample
    size is less than 1000 (and method='pearson').

    This function also works with two-dimensional multi-index columns. In
    this case, columns must be list(s) of tuple(s). See the Jupyter
    notebook for more details:
    https://github.com/raphaelvallat/pingouin/blob/master/notebooks/04_Correlations.ipynb

    If ``covar`` is specified, this function will compute the pairwise
    partial correlation between the variables. If you are only interested
    in computing the partial correlation matrix (i.e. the raw pairwise
    partial correlation coefficient matrix, without the p-values, sample
    sizes, etc), a better alternative is to use the
    :py:func:`pingouin.pcorr` function (see example 7).

    Examples
    --------
    1. Two-sided spearman correlation corrected for multiple comparisons

        >>> from pingouin import pairwise_corr, read_dataset
        >>> data = read_dataset('pairwise_corr').iloc[:, 1:]
        >>> pairwise_corr(data, method='spearman', tail='two-sided',
        ...               padjust='bonf')  # doctest: +SKIP

    2. Robust two-sided correlation with uncorrected p-values

        >>> pcor = pairwise_corr(data, columns=['Openness', 'Extraversion',
        ...                                     'Neuroticism'],
        ...                      method='percbend')

    3. One-versus-all pairwise correlations

        >>> pairwise_corr(data, columns=['Neuroticism'])  # doctest: +SKIP

    4. Pairwise correlations between two lists of columns (cartesian
       product)

        >>> columns = [['Neuroticism', 'Extraversion'], ['Openness']]
        >>> pairwise_corr(data, columns)  # doctest: +SKIP

    5. As a Pandas method

        >>> pcor = data.pairwise_corr(covar='Neuroticism',
        ...                           method='spearman')

    6. Pairwise partial correlation

        >>> pcor = pairwise_corr(data, covar='Neuroticism')  # One covariate
        >>> pcor = pairwise_corr(data, covar=['Neuroticism', 'Openness'])

    7. Pairwise partial correlation matrix (only the r-values)

        >>> data[['Neuroticism', 'Openness', 'Extraversion']].pcorr()
                      Neuroticism  Openness  Extraversion
        Neuroticism      1.000000  0.092097     -0.360421
        Openness         0.092097  1.000000      0.281312
        Extraversion    -0.360421  0.281312      1.000000
    '''
    from pingouin.correlation import corr, partial_corr

    if tail not in ['one-sided', 'two-sided']:
        raise ValueError('Tail not recognized')

    # Keep only numeric columns
    data = data._get_numeric_data()
    # Remove columns with constant value and/or NaN
    data = data.loc[:, data.nunique(dropna=True) >= 2]
    # Extract columns names
    keys = data.columns.tolist()

    # First ensure that columns is a list
    if isinstance(columns, (str, tuple)):
        columns = [columns]

    def traverse(o, tree_types=(list, tuple)):
        """Helper function to flatten nested lists.
        From https://stackoverflow.com/a/6340578
        """
        if isinstance(o, tree_types):
            for value in o:
                for subvalue in traverse(value, tree_types):
                    yield subvalue
        else:
            yield o

    # Check if the columns index has multiple levels.
    # pd.MultiIndex is the public path across pandas versions
    # (pd.core.index.MultiIndex is private and was removed).
    if isinstance(data.columns, pd.MultiIndex):
        multi_index = True
        if columns is not None:
            # Simple list with one element: [('L0', 'L1')]
            # Simple list with >= 2 elements: [('L0', 'L1'), ('L0', 'L2')]
            # Nested lists: [[('L0', 'L1')], ...] or [..., [('L0', 'L1')]]
            col_flatten = list(traverse(columns, tree_types=list))
            assert all(isinstance(c, (tuple, type(None)))
                       for c in col_flatten)
    else:
        multi_index = False

    # Then define combinations / products between columns
    if columns is None:
        # Case A: column is not defined --> corr between all numeric columns
        combs = list(combinations(keys, 2))
    else:
        # Case B: column is specified
        if isinstance(columns[0], list):
            group1 = [e for e in columns[0] if e in keys]
            # Assert that column is two-dimensional
            if len(columns) == 1:
                columns.append(None)
            if isinstance(columns[1], list) and len(columns[1]):
                # B1: [['a', 'b'], ['c', 'd']]
                group2 = [e for e in columns[1] if e in keys]
            else:
                # B2: [['a', 'b']], [['a', 'b'], None] or [['a', 'b'], 'all']
                group2 = [e for e in keys if e not in group1]
            combs = list(product(group1, group2))
        else:
            # Column is a simple list
            if len(columns) == 1:
                # Case B3: one-versus-all, e.g. ['a'] or 'a'
                # Check that this column exists
                if columns[0] not in keys:
                    msg = ('"%s" is not in data or is not numeric.'
                           % columns[0])
                    raise ValueError(msg)
                others = [e for e in keys if e != columns[0]]
                combs = list(product(columns, others))
            else:
                # Combinations between all specified columns ['a', 'b', 'c']
                # Make sure that we keep numeric columns
                columns = [c for c in columns if c in keys]
                if len(columns) == 1:
                    # If only one column is left, equivalent to ['a']
                    others = [e for e in keys if e != columns[0]]
                    combs = list(product(columns, others))
                else:
                    # Combinations between ['a', 'b', 'c']
                    combs = list(combinations(columns, 2))

    combs = np.array(combs)
    if len(combs) == 0:
        raise ValueError("No column combination found. Please make sure "
                         "that the specified columns exist in the "
                         "dataframe, are numeric, and contain at least two "
                         "unique values.")

    # Initialize empty dataframe
    if multi_index:
        X = list(zip(combs[:, 0, 0], combs[:, 0, 1]))
        Y = list(zip(combs[:, 1, 0], combs[:, 1, 1]))
    else:
        X = combs[:, 0]
        Y = combs[:, 1]
    stats = pd.DataFrame({'X': X, 'Y': Y, 'method': method, 'tail': tail},
                         index=range(len(combs)),
                         columns=['X', 'Y', 'method', 'tail', 'n',
                                  'outliers', 'r', 'CI95%', 'r2', 'adj_r2',
                                  'p-val', 'BF10', 'power'])

    # Now we check if covariates are present
    if covar is not None:
        assert isinstance(covar, (str, list)), 'covar must be list or string.'
        if isinstance(covar, str):
            covar = [covar]
        # Check that columns exist and are numeric
        assert all([c in keys for c in covar]), 'covar not in data or not num.'
        # And we make sure that X or Y does not contain covar
        stats = stats[~stats[['X', 'Y']].isin(covar).any(axis=1)]
        stats = stats.reset_index(drop=True)
        if stats.shape[0] == 0:
            raise ValueError("No column combination found. Please make sure "
                             "that the specified columns and covar exist in "
                             "the dataframe, are numeric, and contain at "
                             "least two unique values.")

    # Compute pairwise correlations and fill dataframe
    dvs = ['n', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'power']
    dvs_out = dvs + ['outliers']
    dvs_bf10 = dvs + ['BF10']
    for i in range(stats.shape[0]):
        col1, col2 = stats.loc[i, 'X'], stats.loc[i, 'Y']
        if covar is None:
            cor_st = corr(data[col1].values, data[col2].values, tail=tail,
                          method=method)
        else:
            cor_st = partial_corr(data=data, x=col1, y=col2, covar=covar,
                                  tail=tail, method=method)
        cor_st_keys = cor_st.columns.tolist()
        if 'BF10' in cor_st_keys:
            stats.loc[i, dvs_bf10] = cor_st[dvs_bf10].values
        elif 'outliers' in cor_st_keys:
            stats.loc[i, dvs_out] = cor_st[dvs_out].values
        else:
            stats.loc[i, dvs] = cor_st[dvs].values

    # Force conversion to numeric
    stats = stats.astype({'r': float, 'r2': float, 'adj_r2': float,
                          'n': int, 'p-val': float, 'outliers': float,
                          'power': float})

    # Multiple comparisons
    stats = stats.rename(columns={'p-val': 'p-unc'})
    padjust = None if stats['p-unc'].size <= 1 else padjust
    if padjust is not None:
        if padjust.lower() != 'none':
            reject, stats['p-corr'] = multicomp(stats['p-unc'].values,
                                                method=padjust)
            stats['p-adjust'] = padjust
    else:
        stats['p-corr'] = None
        stats['p-adjust'] = None

    # Standardize correlation coefficients (Fisher z-transformation)
    stats['z'] = np.round(np.arctanh(stats['r'].values), 3)

    col_order = ['X', 'Y', 'method', 'tail', 'n', 'outliers', 'r', 'CI95%',
                 'r2', 'adj_r2', 'z', 'p-unc', 'p-corr', 'p-adjust', 'BF10',
                 'power']

    # Reorder columns and remove empty ones
    stats = stats.reindex(columns=col_order)
    stats = stats.dropna(how='all', axis=1)

    # Add covariates names if present
    if covar is not None:
        stats.insert(loc=3, column='covar', value=str(covar))

    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
def test_corr(self):
    """Test function corr"""
    np.random.seed(123)
    mean, cov = [4, 6], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    x[3], y[5] = 12, -8
    corr(x, y, method='pearson', tail='one-sided')
    corr(x, y, method='spearman', tail='two-sided')
    corr(x, y, method='kendall')
    corr(x, y, method='shepherd', tail='two-sided')
    # Compare with robust corr toolbox
    stats = corr(x, y, method='skipped')
    assert stats['r'].values == 0.512
    assert stats['outliers'].values == 2
    stats = corr(x, y, method='shepherd')
    assert stats['outliers'].values == 2
    _, _, outliers = skipped(x, y, method='pearson')
    assert outliers.size == x.size
    assert stats['n'].values == 30
    stats = corr(x, y, method='percbend')
    assert stats['r'].values == 0.484
    # Not normally distributed
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')
    # With NaN values
    x[3] = np.nan
    corr(x, y)
    # With the same array
    # Disabled because of AppVeyor failure
    # assert corr(x, x).loc['pearson', 'BF10'] == str(np.inf)
    # Wrong arguments
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    with pytest.raises(ValueError):
        corr(x, y[:-10])
    # Compare with JASP
    df = read_dataset('pairwise_corr')
    stats = corr(df['Neuroticism'], df['Extraversion'])
    assert np.isclose(1 / float(stats['BF10'].values), 1.478e-13)
def test_corr(self):
    """Test function corr.

    Compare to R `correlation` package. See test_correlation.R file.
    """
    np.random.seed(123)
    mean, cov = [4, 6], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    x2, y2 = x.copy(), y.copy()
    x[3], y[5] = 12, -8
    x2[3], y2[5] = 7, 2.6

    # Pearson correlation
    stats = corr(x, y, method='pearson')
    assert np.isclose(stats.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(stats.loc['pearson', 'p-val'], 0.3518659)
    assert stats.loc['pearson', 'CI95%'][0] == round(-0.1966232, 2)
    assert stats.loc['pearson', 'CI95%'][1] == round(0.5043872, 2)
    # - One-sided: greater
    stats = corr(x, y, method='pearson', alternative='greater')
    assert np.isclose(stats.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(stats.loc['pearson', 'p-val'], 0.175933)
    assert stats.loc['pearson', 'CI95%'][0] == round(-0.1376942, 2)
    assert stats.loc['pearson', 'CI95%'][1] == 1
    # - One-sided: less
    stats = corr(x, y, method='pearson', alternative='less')
    assert np.isclose(stats.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(stats.loc['pearson', 'p-val'], 0.824067)
    assert stats.loc['pearson', 'CI95%'][0] == -1
    assert stats.loc['pearson', 'CI95%'][1] == round(0.4578044, 2)

    # Spearman correlation
    stats = corr(x, y, method='spearman')
    assert np.isclose(stats.loc['spearman', 'r'], 0.4740823)
    assert np.isclose(stats.loc['spearman', 'p-val'], 0.008129768)
    # CI are calculated using a different formula for Spearman in R
    # assert stats.loc['spearman', 'CI95%'][0] == round(0.1262988, 2)
    # assert stats.loc['spearman', 'CI95%'][1] == round(0.7180799, 2)

    # Kendall correlation
    # R uses a different estimation method than scipy for the p-value
    stats = corr(x, y, method='kendall')
    assert np.isclose(stats.loc['kendall', 'r'], 0.3517241)

    # Skipped correlation -- compare with robust corr toolbox
    # https://sourceforge.net/projects/robustcorrtool/
    stats = corr(x, y, method='skipped')
    assert round(stats.loc['skipped', 'r'], 4) == 0.5123
    assert stats.loc['skipped', 'outliers'] == 2
    sk_sp = corr(x2, y2, method='skipped')
    assert round(sk_sp.loc['skipped', 'r'], 4) == 0.5123
    assert sk_sp.loc['skipped', 'outliers'] == 2
    # Pearson skipped correlation
    sk_pe = corr(x2, y2, method='skipped', corr_type='pearson')
    assert np.round(sk_pe.loc['skipped', 'r'], 4) == 0.5254
    assert sk_pe.loc['skipped', 'outliers'] == 2
    assert not sk_sp.equals(sk_pe)

    # Shepherd
    stats = corr(x, y, method='shepherd')
    assert np.isclose(stats.loc['shepherd', 'r'], 0.5123153)
    assert np.isclose(stats.loc['shepherd', 'p-val'], 0.005316)
    assert stats.loc['shepherd', 'outliers'] == 2
    _, _, outliers = skipped(x, y, corr_type='pearson')
    assert outliers.size == x.size
    assert stats.loc['shepherd', 'n'] == 30

    # Percbend -- compare with robust corr toolbox
    stats = corr(x, y, method='percbend')
    assert round(stats.loc['percbend', 'r'], 4) == 0.4843
    assert np.isclose(stats.loc['percbend', 'r'], 0.4842686)
    assert np.isclose(stats.loc['percbend', 'p-val'], 0.006693313)
    stats = corr(x2, y2, method='percbend')
    assert round(stats.loc['percbend', 'r'], 4) == 0.4843
    stats = corr(x, y, method='percbend', beta=.5)
    assert round(stats.loc['percbend', 'r'], 4) == 0.4848

    # Compare biweight correlation to astropy
    stats = corr(x, y, method='bicor')
    assert np.isclose(stats.loc['bicor', 'r'], 0.4951418)
    assert np.isclose(stats.loc['bicor', 'p-val'], 0.005403701)
    assert stats.loc['bicor', 'CI95%'][0] == round(0.1641553, 2)
    assert stats.loc['bicor', 'CI95%'][1] == round(0.7259185, 2)
    stats = corr(x, y, method='bicor', c=5)
    assert np.isclose(stats.loc['bicor', 'r'], 0.4940706950017)

    # Not normally distributed
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')

    # With NaN values
    x[3] = np.nan
    corr(x, y)

    # With the same array
    # Disabled because of AppVeyor failure
    # assert corr(x, x).loc['pearson', 'BF10'] == str(np.inf)

    # Wrong arguments
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    with pytest.raises(ValueError):
        corr(x, y, tail='error')

    # Compare BF10 with JASP
    df = read_dataset('pairwise_corr')
    stats = corr(df['Neuroticism'], df['Extraversion'])
    assert np.isclose(1 / float(stats['BF10'].to_numpy()), 1.478e-13)

    # Perfect correlation, CI and power should be 1, BF should be Inf
    # https://github.com/raphaelvallat/pingouin/issues/195
    stats = corr(x, x)
    assert np.isclose(stats.at['pearson', 'r'], 1)
    assert np.isclose(stats.at['pearson', 'power'], 1)

    # When one column is a constant, the correlation is not defined
    # and Pingouin returns a DataFrame full of NaN, except for ``n``
    x, y = [1, 1, 1], [1, 2, 3]
    stats = corr(x, y)
    assert stats.at['pearson', 'n']
    assert np.isnan(stats.at['pearson', 'r'])

    # Biweight midcorrelation returns NaN when the MAD is not defined
    assert np.isnan(bicor(np.array([1, 1, 1, 1, 0, 1]), np.arange(6))[0])
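# The tests above all draw (x, y) from a bivariate normal with a true
# correlation of 0.6 and then inject two outliers (x[3], y[5] = 12, -8)
# to exercise the robust estimators. A quick sanity check of the
# generator with a large sample (independent of pingouin):
import numpy as np

np.random.seed(123)
mean, cov = [4, 6], [(1, .6), (.6, 1)]
xx, yy = np.random.multivariate_normal(mean, cov, 100000).T
assert abs(np.corrcoef(xx, yy)[0, 1] - 0.6) < 0.01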
def test_corr(self):
    """Test function corr"""
    np.random.seed(123)
    mean, cov = [4, 6], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    x2, y2 = x.copy(), y.copy()
    x[3], y[5] = 12, -8
    x2[3], y2[5] = 7, 2.6
    corr(x, y, method='pearson', tail='one-sided')
    corr(x, y, method='spearman', tail='two-sided')
    corr(x, y, method='kendall')
    corr(x, y, method='shepherd', tail='two-sided')
    # Skipped correlation -- compare with robust corr toolbox
    # https://sourceforge.net/projects/robustcorrtool/
    stats = corr(x, y, method='skipped')
    assert np.round(stats['r'].to_numpy(), 4) == 0.5123
    assert stats['outliers'].to_numpy() == 2
    sk_sp = corr(x2, y2, method='skipped')
    assert np.round(sk_sp['r'].to_numpy(), 4) == 0.5123
    assert sk_sp['outliers'].to_numpy() == 2
    # Pearson skipped correlation
    sk_pe = corr(x2, y2, method='skipped', corr_type='pearson')
    assert np.round(sk_pe['r'].to_numpy(), 4) == 0.5254
    assert sk_pe['outliers'].to_numpy() == 2
    assert not sk_sp.equals(sk_pe)
    # Shepherd -- the p-value cannot be compared directly because it is
    # based on a random bootstrap
    stats = corr(x, y, method='shepherd')
    assert stats['outliers'].to_numpy() == 2
    _, _, outliers = skipped(x, y, corr_type='pearson')
    assert outliers.size == x.size
    assert stats['n'].to_numpy() == 30
    # Percbend -- compare with robust corr toolbox
    stats = corr(x, y, method='percbend')
    assert np.round(stats['r'].to_numpy(), 4) == 0.4843
    stats = corr(x2, y2, method='percbend')
    assert np.round(stats['r'].to_numpy(), 4) == 0.4843
    stats = corr(x, y, method='percbend', beta=.5)
    assert np.round(stats['r'].to_numpy(), 4) == 0.4848
    # Compare biweight correlation to astropy
    stats = corr(x, y, method='bicor')
    assert np.isclose(stats['r'].to_numpy(), 0.4951417784979)
    stats = corr(x, y, method='bicor', c=5)
    assert np.isclose(stats['r'].to_numpy(), 0.4940706950017)
    # Not normally distributed
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')
    # With NaN values
    x[3] = np.nan
    corr(x, y)
    # With the same array
    # Disabled because of AppVeyor failure
    # assert corr(x, x).loc['pearson', 'BF10'] == str(np.inf)
    # Wrong argument
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    # Compare BF10 with JASP
    df = read_dataset('pairwise_corr')
    stats = corr(df['Neuroticism'], df['Extraversion'])
    assert np.isclose(1 / float(stats['BF10'].to_numpy()), 1.478e-13)
    # When one column is a constant, the correlation is not defined
    # and Pingouin returns a DataFrame full of NaN, except for ``n``
    x, y = [1, 1, 1], [1, 2, 3]
    stats = corr(x, y)
    assert stats.at['pearson', 'n']
    assert np.isnan(stats.at['pearson', 'r'])
    # Biweight midcorrelation returns NaN when the MAD is not defined
    assert np.isnan(bicor(np.array([1, 1, 1, 1, 0, 1]), np.arange(6))[0])
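# Every pairwise_corr version in this file routes p-value correction
# through multicomp(stats['p-unc'], method=padjust), including in the
# final version below. As a reference point, the one-step Bonferroni
# adjustment ('bonf') simply multiplies each p-value by the number of
# tests and clips the result at 1 (a minimal sketch of the standard
# definition, independent of pingouin):
import numpy as np

pvals = np.array([0.01, 0.04, 0.30])
p_bonf = np.clip(pvals * pvals.size, None, 1.0)
assert np.allclose(p_bonf, [0.03, 0.12, 0.90])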
def pairwise_corr(data, columns=None, covar=None, tail='two-sided',
                  method='pearson', padjust='none', nan_policy='pairwise'):
    """Pairwise (partial) correlations between columns of a pandas
    dataframe.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    columns : list or str
        Column names in data:

        * ``["a", "b", "c"]``: combination between columns a, b, and c.
        * ``["a"]``: product between a and all the other numeric columns.
        * ``[["a"], ["b", "c"]]``: product between ["a"] and ["b", "c"].
        * ``[["a", "d"], ["b", "c"]]``: product between ["a", "d"] and
          ["b", "c"].
        * ``[["a", "d"], None]``: product between ["a", "d"] and all other
          numeric columns in dataframe.

        If columns is None, the function will return the pairwise
        correlation between the combinations of all the numeric columns in
        data. See the examples section for more details on this.
    covar : None, string or list
        Covariate(s) for partial correlation. Must be one or more columns
        in data. Use a list if there is more than one covariate. If
        ``covar`` is not None, a partial correlation will be computed using
        the :py:func:`pingouin.partial_corr` function.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'``
        p-values. Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    padjust : string
        Method used for testing and adjustment of p-values.

        * ``'none'``: no correction
        * ``'bonf'``: one-step Bonferroni correction
        * ``'sidak'``: one-step Sidak correction
        * ``'holm'``: step-down method using Bonferroni adjustments
        * ``'fdr_bh'``: Benjamini/Hochberg FDR correction
        * ``'fdr_by'``: Benjamini/Yekutieli FDR correction
    nan_policy : string
        Can be ``'listwise'`` for listwise deletion of missing values
        (= complete-case analysis) or ``'pairwise'`` (default) for the
        more liberal pairwise deletion (= available-case analysis).

        .. versionadded:: 0.2.9

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'X'``: Name(s) of first columns.
        * ``'Y'``: Name(s) of second columns.
        * ``'method'``: Correlation type.
        * ``'covar'``: List of specified covariate(s), only when covariates
          are passed.
        * ``'tail'``: Tail of the test.
        * ``'n'``: Sample size (after removal of missing values).
        * ``'r'``: Correlation coefficients.
        * ``'CI95%'``: 95% parametric confidence intervals.
        * ``'r2'``: R-squared values.
        * ``'adj_r2'``: Adjusted R-squared values.
        * ``'z'``: Standardized correlation coefficients.
        * ``'p-unc'``: Uncorrected p-values.
        * ``'p-corr'``: Corrected p-values.
        * ``'p-adjust'``: P-values correction method.
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation).
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Notes
    -----
    Please refer to the :py:func:`pingouin.corr` function for a description
    of the different methods. NaN values are automatically removed from the
    data using a pairwise deletion.

    This function is more flexible and gives a much more detailed output
    than the :py:func:`pandas.DataFrame.corr` method (i.e. p-values,
    confidence intervals, Bayes Factor...). However, this comes at an
    increased computational cost. While this should not be discernible for
    dataframes with fewer than 10,000 rows and/or 20 columns, this function
    can be slow for very large datasets.

    A faster alternative to get the r-values and p-values in a matrix
    format is to use the :py:func:`pingouin.rcorr` function, which works
    directly as a :py:class:`pandas.DataFrame` method (see example below).

    This function also works with two-dimensional multi-index columns. In
    this case, columns must be list(s) of tuple(s). Please refer to this
    `example Jupyter notebook
    <https://github.com/raphaelvallat/pingouin/blob/master/notebooks/04_Correlations.ipynb>`_
    for more details.

    If ``covar`` is specified, this function will compute the pairwise
    partial correlation between the variables. If you are only interested
    in computing the partial correlation matrix (i.e. the raw pairwise
    partial correlation coefficient matrix, without the p-values, sample
    sizes, etc), a better alternative is to use the
    :py:func:`pingouin.pcorr` function (see example 7).

    Examples
    --------
    1. One-sided spearman correlation corrected for multiple comparisons

        >>> from pingouin import pairwise_corr, read_dataset
        >>> data = read_dataset('pairwise_corr').iloc[:, 1:]
        >>> pairwise_corr(data, method='spearman', tail='one-sided',
        ...               padjust='bonf')  # doctest: +SKIP

    2. Robust two-sided biweight midcorrelation with uncorrected p-values

        >>> pcor = pairwise_corr(data, columns=['Openness', 'Extraversion',
        ...                                     'Neuroticism'],
        ...                      method='bicor')

    3. One-versus-all pairwise correlations

        >>> pairwise_corr(data, columns=['Neuroticism'])  # doctest: +SKIP

    4. Pairwise correlations between two lists of columns (cartesian
       product)

        >>> columns = [['Neuroticism', 'Extraversion'], ['Openness']]
        >>> pairwise_corr(data, columns)  # doctest: +SKIP

    5. As a Pandas method

        >>> pcor = data.pairwise_corr(covar='Neuroticism',
        ...                           method='spearman')

    6. Pairwise partial correlation

        >>> pcor = pairwise_corr(data, covar='Neuroticism')  # One covariate
        >>> pcor = pairwise_corr(data, covar=['Neuroticism', 'Openness'])

    7. Pairwise partial correlation matrix using :py:func:`pingouin.pcorr`

        >>> data[['Neuroticism', 'Openness', 'Extraversion']].pcorr()
                      Neuroticism  Openness  Extraversion
        Neuroticism      1.000000  0.092097     -0.360421
        Openness         0.092097  1.000000      0.281312
        Extraversion    -0.360421  0.281312      1.000000

    8. Correlation matrix with p-values using :py:func:`pingouin.rcorr`

        >>> data[['Neuroticism', 'Openness', 'Extraversion']].rcorr()
                     Neuroticism Openness Extraversion
        Neuroticism            -                   ***
        Openness           -0.01        -          ***
        Extraversion       -0.35    0.267            -
    """
    from pingouin.correlation import corr, partial_corr

    # Check arguments
    assert tail in ['one-sided', 'two-sided']
    assert nan_policy in ['listwise', 'pairwise']

    # Keep only numeric columns
    data = data._get_numeric_data()
    # Remove columns with constant value and/or NaN
    data = data.loc[:, data.nunique(dropna=True) >= 2]
    # Extract columns names
    keys = data.columns.tolist()

    # First ensure that columns is a list
    if isinstance(columns, (str, tuple)):
        columns = [columns]

    def traverse(o, tree_types=(list, tuple)):
        """Helper function to flatten nested lists.
        From https://stackoverflow.com/a/6340578
        """
        if isinstance(o, tree_types):
            for value in o:
                for subvalue in traverse(value, tree_types):
                    yield subvalue
        else:
            yield o

    # Check if the columns index has multiple levels.
    # pd.MultiIndex is public API in all supported pandas versions, so no
    # version check is needed (the former pd.core.index.MultiIndex path is
    # private and was removed in recent pandas).
    if isinstance(data.columns, pd.MultiIndex):
        multi_index = True
        if columns is not None:
            # Simple list with one element: [('L0', 'L1')]
            # Simple list with >= 2 elements: [('L0', 'L1'), ('L0', 'L2')]
            # Nested lists: [[('L0', 'L1')], ...] or [..., [('L0', 'L1')]]
            col_flatten = list(traverse(columns, tree_types=list))
            assert all(isinstance(c, (tuple, type(None)))
                       for c in col_flatten)
    else:
        multi_index = False

    # Then define combinations / products between columns
    if columns is None:
        # Case A: column is not defined --> corr between all numeric columns
        combs = list(combinations(keys, 2))
    else:
        # Case B: column is specified
        if isinstance(columns[0], list):
            group1 = [e for e in columns[0] if e in keys]
            # Assert that column is two-dimensional
            if len(columns) == 1:
                columns.append(None)
            if isinstance(columns[1], list) and len(columns[1]):
                # B1: [['a', 'b'], ['c', 'd']]
                group2 = [e for e in columns[1] if e in keys]
            else:
                # B2: [['a', 'b']], [['a', 'b'], None] or [['a', 'b'], 'all']
                group2 = [e for e in keys if e not in group1]
            combs = list(product(group1, group2))
        else:
            # Column is a simple list
            if len(columns) == 1:
                # Case B3: one-versus-all, e.g. ['a'] or 'a'
                # Check that this column exists
                if columns[0] not in keys:
                    msg = ('"%s" is not in data or is not numeric.'
                           % columns[0])
                    raise ValueError(msg)
                others = [e for e in keys if e != columns[0]]
                combs = list(product(columns, others))
            else:
                # Combinations between all specified columns ['a', 'b', 'c']
                # Make sure that we keep numeric columns
                columns = [c for c in columns if c in keys]
                if len(columns) == 1:
                    # If only one column is left, equivalent to ['a']
                    others = [e for e in keys if e != columns[0]]
                    combs = list(product(columns, others))
                else:
                    # Combinations between ['a', 'b', 'c']
                    combs = list(combinations(columns, 2))

    combs = np.array(combs)
    if len(combs) == 0:
        raise ValueError("No column combination found. Please make sure "
                         "that the specified columns exist in the "
                         "dataframe, are numeric, and contain at least two "
                         "unique values.")

    # Initialize empty dataframe
    if multi_index:
        X = list(zip(combs[:, 0, 0], combs[:, 0, 1]))
        Y = list(zip(combs[:, 1, 0], combs[:, 1, 1]))
    else:
        X = combs[:, 0]
        Y = combs[:, 1]
    stats = pd.DataFrame({'X': X, 'Y': Y, 'method': method, 'tail': tail},
                         index=range(len(combs)),
                         columns=['X', 'Y', 'method', 'tail', 'n',
                                  'outliers', 'r', 'CI95%', 'r2', 'adj_r2',
                                  'p-val', 'BF10', 'power'])

    # Now we check if covariates are present
    if covar is not None:
        assert isinstance(covar, (str, list)), 'covar must be list or string.'
        if isinstance(covar, str):
            covar = [covar]
        # Check that columns exist and are numeric
        assert all([c in keys for c in covar]), 'covar not in data or not num.'
        # And we make sure that X or Y does not contain covar
        stats = stats[~stats[['X', 'Y']].isin(covar).any(axis=1)]
        stats = stats.reset_index(drop=True)
        if stats.shape[0] == 0:
            raise ValueError("No column combination found. Please make sure "
                             "that the specified columns and covar exist in "
                             "the dataframe, are numeric, and contain at "
                             "least two unique values.")

    # Listwise deletion of missing values
    if nan_policy == 'listwise':
        all_cols = np.unique(stats[['X', 'Y']].to_numpy()).tolist()
        if covar is not None:
            all_cols.extend(covar)
        data = data[all_cols].dropna()

    # Compute pairwise correlations and fill dataframe
    dvs = ['n', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'power']
    dvs_out = dvs + ['outliers']
    dvs_bf10 = dvs + ['BF10']
    for i in range(stats.shape[0]):
        col1, col2 = stats.at[i, 'X'], stats.at[i, 'Y']
        if covar is None:
            cor_st = corr(data[col1].to_numpy(), data[col2].to_numpy(),
                          tail=tail, method=method)
        else:
            cor_st = partial_corr(data=data, x=col1, y=col2, covar=covar,
                                  tail=tail, method=method)
        cor_st_keys = cor_st.columns.tolist()
        if 'BF10' in cor_st_keys:
            stats.loc[i, dvs_bf10] = cor_st[dvs_bf10].to_numpy()
        elif 'outliers' in cor_st_keys:
            stats.loc[i, dvs_out] = cor_st[dvs_out].to_numpy()
        else:
            stats.loc[i, dvs] = cor_st[dvs].to_numpy()

    # Force conversion to numeric
    stats = stats.astype({'r': float, 'r2': float, 'adj_r2': float,
                          'n': int, 'p-val': float, 'outliers': float,
                          'power': float})

    # Multiple comparisons
    stats = stats.rename(columns={'p-val': 'p-unc'})
    padjust = None if stats['p-unc'].size <= 1 else padjust
    if padjust is not None:
        if padjust.lower() != 'none':
            reject, stats['p-corr'] = multicomp(stats['p-unc'].to_numpy(),
                                                method=padjust)
            stats['p-adjust'] = padjust
    else:
        stats['p-corr'] = None
        stats['p-adjust'] = None

    # Standardize correlation coefficients (Fisher z-transformation)
    stats['z'] = np.arctanh(stats['r'].to_numpy())

    col_order = ['X', 'Y', 'method', 'tail', 'n', 'outliers', 'r', 'CI95%',
                 'r2', 'adj_r2', 'z', 'p-unc', 'p-corr', 'p-adjust', 'BF10',
                 'power']

    # Reorder columns and remove empty ones
    stats = stats.reindex(columns=col_order).dropna(how='all', axis=1)

    # Add covariates names if present
    if covar is not None:
        stats.insert(loc=3, column='covar', value=str(covar))

    return stats