def test_linear_regression(self):
    """Test function linear_regression.

    Compare against JASP and R lm() function.
    """
    # Simple regression (compare to R lm())
    lm = linear_regression(df['X'], df['Y'])  # Pingouin
    sc = linregress(df['X'], df['Y'])  # SciPy
    # When using assert_equal, we need to use .to_numpy()
    assert_equal(lm['names'].to_numpy(), ['Intercept', 'X'])
    assert_almost_equal(lm['coef'][1], sc.slope)
    assert_almost_equal(lm['coef'][0], sc.intercept)
    assert_almost_equal(lm['se'][1], sc.stderr)
    assert_almost_equal(lm['pval'][1], sc.pvalue)
    assert_almost_equal(np.sqrt(lm['r2'][0]), sc.rvalue)
    assert lm.residuals_.size == df['Y'].size
    assert_equal(lm['CI[2.5%]'].round(5).to_numpy(), [1.48155, 0.17553])
    assert_equal(lm['CI[97.5%]'].round(5).to_numpy(), [4.23286, 0.61672])
    assert round(lm['r2'].iloc[0], 4) == 0.1147
    assert round(lm['adj_r2'].iloc[0], 4) == 0.1057
    assert lm.df_model_ == 1
    assert lm.df_resid_ == 98

    # Multiple regression with intercept (compare to JASP)
    X = df[['X', 'M']].to_numpy()
    y = df['Y'].to_numpy()
    lm = linear_regression(X, y, as_dataframe=False)  # Pingouin
    sk = LinearRegression(fit_intercept=True).fit(X, y)  # SkLearn
    assert_equal(lm['names'], ['Intercept', 'x1', 'x2'])
    assert_almost_equal(lm['coef'][1:], sk.coef_)
    assert_almost_equal(lm['coef'][0], sk.intercept_)
    assert_almost_equal(sk.score(X, y), lm['r2'])
    assert lm['residuals'].size == y.size
    # No need for .to_numpy here because we're using a dict and not pandas
    assert_equal([.605, .110, .101], np.round(lm['se'], 3))
    assert_equal([3.145, 0.361, 6.321], np.round(lm['T'], 3))
    assert_equal([0.002, 0.719, 0.000], np.round(lm['pval'], 3))
    assert_equal([.703, -.178, .436], np.round(lm['CI[2.5%]'], 3))
    assert_equal([3.106, .257, .835], np.round(lm['CI[97.5%]'], 3))

    # No intercept
    lm = linear_regression(X, y, add_intercept=False, as_dataframe=False)
    sk = LinearRegression(fit_intercept=False).fit(X, y)
    assert_almost_equal(lm['coef'], sk.coef_)
    # Scikit-learn gives wrong R^2 score when no intercept present because
    # sklearn.metrics.r2_score always assumes that an intercept is present
    # https://stackoverflow.com/questions/54614157/scikit-learn-statsmodels-which-r-squared-is-correct
    # assert_almost_equal(sk.score(X, y), lm['r2'])
    # Instead, we compare to R lm() function:
    assert round(lm['r2'], 4) == 0.9096
    assert round(lm['adj_r2'], 4) == 0.9078
    assert lm['df_model'] == 2
    assert lm['df_resid'] == 98
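    # [Added sketch, not part of the original assertions] For an intercept-free
    # model, R's summary.lm() reports the uncentered R^2 = 1 - RSS / sum(y**2).
    # Assuming Pingouin follows the same convention (the 0.9096 value above
    # matches R), the reported r2 can be reproduced by hand; the loose
    # tolerance hedges against small implementation differences.
    ss_res = np.sum(np.asarray(lm['residuals']) ** 2)
    assert np.isclose(1 - ss_res / np.sum(y ** 2), lm['r2'], atol=1e-4)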
    # Test other arguments
    linear_regression(df[['X', 'M']], df['Y'], coef_only=True)
    linear_regression(df[['X', 'M']], df['Y'], alpha=0.01)
    linear_regression(df[['X', 'M']], df['Y'], alpha=0.10)

    # With missing values
    linear_regression(df_nan[['X', 'M']], df_nan['Y'], remove_na=True)

    # With columns that have only one unique value
    lm1 = linear_regression(df[['X', 'M', 'One']], df['Y'])
    lm2 = linear_regression(df[['X', 'M', 'One']], df['Y'],
                            add_intercept=False)
    assert lm1.shape[0] == 3
    assert lm2.shape[0] == 3
    assert np.isclose(lm1.at[0, 'r2'], lm2.at[0, 'r2'])

    # With a zero-only column
    lm1 = linear_regression(df[['X', 'M', 'Zero', 'One']], df['Y'])
    lm2 = linear_regression(df[['X', 'M', 'Zero', 'One']], df['Y'].to_numpy(),
                            add_intercept=False)
    lm3 = linear_regression(df[['X', 'Zero', 'M', 'Zero']].to_numpy(),
                            df['Y'], add_intercept=False)
    assert_equal(lm1.loc[:, 'names'].to_numpy(), ['Intercept', 'X', 'M'])
    assert_equal(lm2.loc[:, 'names'].to_numpy(), ['X', 'M', 'One'])
    assert_equal(lm3.loc[:, 'names'].to_numpy(), ['x1', 'x3'])

    # With duplicate columns
    lm1 = linear_regression(df[['X', 'One', 'Zero', 'M', 'M', 'X']], df['Y'])
    lm2 = linear_regression(df[['X', 'One', 'Zero', 'M', 'M', 'X']].to_numpy(),
                            df['Y'], add_intercept=False)
    assert_equal(lm1.loc[:, 'names'].to_numpy(), ['Intercept', 'X', 'M'])
    assert_equal(lm2.loc[:, 'names'].to_numpy(), ['x1', 'x2', 'x4'])

    # Relative importance
    # Compare to the R package relaimpo:
    # >>> data <- read.csv('mediation.csv')
    # >>> lm1 <- lm(Y ~ X + M, data = data)
    # >>> calc.relimp(lm1, type=c("lmg"))
    lm = linear_regression(df[['X', 'M']], df['Y'], relimp=True)
    assert_almost_equal(lm.loc[[1, 2], 'relimp'], [0.05778011, 0.31521913])
    assert_almost_equal(lm.loc[[1, 2], 'relimp_perc'], [15.49068, 84.50932],
                        decimal=4)
    # Make sure that relimp_perc sums to 100% and relimp sums to r2
    assert np.isclose(lm['relimp_perc'].sum(), 100.)
    assert np.isclose(lm['relimp'].sum(), lm.at[0, 'r2'])

    # 2 predictors, no intercept
    # Careful here: the sum of relimp is always the R^2 of the model
    # INCLUDING the intercept. Therefore, if the data are not standardized
    # and add_intercept is set to False, the sum of relimp will be higher
    # than the R^2 of the intercept-free model. A workaround is to
    # standardize the data beforehand:
    df_z = df[['X', 'M', 'Y']].apply(zscore)
    lm = linear_regression(df_z[['X', 'M']], df_z['Y'], add_intercept=False,
                           as_dataframe=False, relimp=True)
    assert_almost_equal(lm['relimp'], [0.05778011, 0.31521913], decimal=4)
    assert_almost_equal(lm['relimp_perc'], [15.49068, 84.50932], decimal=4)
    assert np.isclose(np.sum(lm['relimp']), lm['r2'])

    # 3 predictors + intercept
    lm = linear_regression(df[['X', 'M', 'Ybin']], df['Y'], relimp=True)
    assert_almost_equal(lm.loc[[1, 2, 3], 'relimp'],
                        [0.06010737, 0.31724368, 0.01217479])
    assert_almost_equal(lm.loc[[1, 2, 3], 'relimp_perc'],
                        [15.43091, 81.44355, 3.12554], decimal=4)
    assert np.isclose(lm['relimp'].sum(), lm.at[0, 'r2'])
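    # [Added sketch, not part of the original assertions] relimp_perc is
    # assumed to be relimp expressed as a percentage of the full-model R^2,
    # which is consistent with the two sums checked above; verify this on the
    # 3-predictor fit.
    assert_almost_equal(lm.loc[[1, 2, 3], 'relimp_perc'],
                        100 * lm.loc[[1, 2, 3], 'relimp'] / lm.at[0, 'r2'],
                        decimal=4)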
    ######################################################################
    # WEIGHTED REGRESSION - compare against the R lm() function
    # Note that R's summary() sometimes rounds to 4 decimals, sometimes to
    # 5, etc.
    lm = linear_regression(df[['X', 'M']], df['Y'], weights=df['W2'])
    assert_equal(lm['coef'].round(5).to_numpy(), [1.89530, 0.03905, 0.63912])
    assert_equal(lm['se'].round(5).to_numpy(), [0.60498, 0.10984, 0.10096])
    assert_equal(lm['T'].round(3).to_numpy(), [3.133, 0.356, 6.331])  # R rounds to 3
    assert_equal(lm['pval'].round(5).to_numpy(), [0.00229, 0.72296, 0.00000])
    assert_equal(lm['CI[2.5%]'].round(5).to_numpy(),
                 [0.69459, -0.17896, 0.43874])
    assert_equal(lm['CI[97.5%]'].round(5).to_numpy(),
                 [3.09602, 0.25706, 0.83949])
    assert round(lm['r2'].iloc[0], 4) == 0.3742
    assert round(lm['adj_r2'].iloc[0], 4) == 0.3613
    assert lm.df_model_ == 2
    assert lm.df_resid_ == 97

    # No intercept
    lm = linear_regression(df[['X', 'M']], df['Y'], add_intercept=False,
                           weights=df['W2'])
    assert_equal(lm['coef'].round(5).to_numpy(), [0.26924, 0.71733])
    assert_equal(lm['se'].round(5).to_numpy(), [0.08525, 0.10213])
    assert_equal(lm['T'].round(3).to_numpy(), [3.158, 7.024])
    assert_equal(lm['pval'].round(5).to_numpy(), [0.00211, 0.00000])
    assert_equal(lm['CI[2.5%]'].round(5).to_numpy(), [0.10007, 0.51466])
    assert_equal(lm['CI[97.5%]'].round(4).to_numpy(), [0.4384, 0.9200])
    assert round(lm['r2'].iloc[0], 4) == 0.9090
    assert round(lm['adj_r2'].iloc[0], 4) == 0.9072
    assert lm.df_model_ == 2
    assert lm.df_resid_ == 98

    # With some weights set to zero
    # Here, R gives slightly different results than statsmodels because
    # zero weights are not taken into account when calculating the degrees
    # of freedom. Pingouin behaves like R.
    lm = linear_regression(df[['X']], df['Y'], weights=df['W1'])
    assert_equal(lm['coef'].round(4).to_numpy(), [3.5597, 0.2820])
    assert_equal(lm['se'].round(4).to_numpy(), [0.7355, 0.1222])
    assert_equal(lm['pval'].round(4).to_numpy(), [0.0000, 0.0232])
    assert_equal(lm['CI[2.5%]'].round(5).to_numpy(), [2.09935, 0.03943])
    assert_equal(lm['CI[97.5%]'].round(5).to_numpy(), [5.02015, 0.52453])
    assert round(lm['r2'].iloc[0], 5) == 0.05364
    assert round(lm['adj_r2'].iloc[0], 5) == 0.04358
    assert lm.df_model_ == 1
    assert lm.df_resid_ == 94

    # No intercept
    lm = linear_regression(df[['X']], df['Y'], add_intercept=False,
                           weights=df['W1'])
    assert_equal(lm['coef'].round(5).to_numpy(), [0.85060])
    assert_equal(lm['se'].round(5).to_numpy(), [0.03719])
    assert_equal(lm['pval'].round(5).to_numpy(), [0.0000])
    assert_equal(lm['CI[2.5%]'].round(5).to_numpy(), [0.77678])
    assert_equal(lm['CI[97.5%]'].round(5).to_numpy(), [0.92443])
    assert round(lm['r2'].iloc[0], 4) == 0.8463
    assert round(lm['adj_r2'].iloc[0], 4) == 0.8447
    assert lm.df_model_ == 1
    assert lm.df_resid_ == 95

    # With all weights set to one, the result should be identical to OLS
    assert_frame_equal(
        linear_regression(df[['X', 'M']], df['Y']),
        linear_regression(df[['X', 'M']], df['Y'], weights=df['One']))

    # Output as a dictionary
    linear_regression(df[['X', 'M']], df['Y'], weights=df['W2'],
                      as_dataframe=False)

    # Passing weights together with relimp should raise a ValueError
    with pytest.raises(ValueError):
        linear_regression(df[['X']], df['Y'], weights=df['W1'], relimp=True)
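    # [Added sketch, not part of the original assertions] Weighted least
    # squares is equivalent to ordinary least squares on rows rescaled by
    # sqrt(w), so the weighted coefficients checked above can also be
    # reproduced with a plain least-squares solve (illustrative only;
    # assumes the W2 weights are non-negative, and the intermediate names
    # below are ad hoc).
    w = df['W2'].to_numpy()
    Xw = np.column_stack((np.ones(w.size), df[['X', 'M']].to_numpy()))
    beta = np.linalg.lstsq(np.sqrt(w)[:, None] * Xw,
                           np.sqrt(w) * df['Y'].to_numpy(), rcond=None)[0]
    lm_w = linear_regression(df[['X', 'M']], df['Y'], weights=df['W2'])
    assert_almost_equal(lm_w['coef'].to_numpy(), beta)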