def _regress_(self, kind, obs, values, min_return=False, **kw):
    """Fit the ``sm`` estimator named by ``kind`` with an intercept term.

    Parameters
    ----------
    kind : str
        Attribute name looked up on ``sm`` (e.g. ``"OLS"``); the attribute
        is called as ``estimator(values, design, **kw)``.
    obs : array-like
        Explanatory data. A constant column is added via
        ``sm.add_constant`` before fitting.
    values : array-like
        Response data.
    min_return : bool, optional
        Forwarded unchanged to ``self._format_results_``.
    **kw
        Extra keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    Whatever ``self._format_results_`` returns for the fitted model.
    """
    # Anything that is not already an ndarray (list, tuple, ...) is coerced.
    predictors = obs if isinstance(obs, ndarray) else array(obs)
    response = values if isinstance(values, ndarray) else array(values)

    design = sm.add_constant(predictors)
    estimator = getattr(sm, kind)
    fitted = estimator(response, design, **kw).fit()
    return self._format_results_(fitted, min_return)
def GLMResults(df, outcome, predictors, adj=None, logistic=True):
    """Fit one GLM per predictor and tabulate that predictor's association.

    For each entry of ``predictors`` a separate ``sm.GLM`` is fitted with
    ``outcome`` as the response and the predictor, every ``adj`` column and
    an intercept as regressors. Rows with a missing value in any column used
    by that particular model are dropped before fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``outcome``, all ``predictors`` and all
        ``adj`` columns. Columns are cast to float for the fit.
    outcome : str
        Name of the response column.
    predictors : list of str
        Columns tested one at a time.
    adj : list of str, optional
        Adjustment columns included in every model. Defaults to none.
    logistic : bool, optional
        If True use a Binomial family and report exponentiated
        coefficients ("OR"); otherwise use a Gaussian family and report raw
        coefficients ("Coef").

    Returns
    -------
    pandas.DataFrame
        Indexed by ``predictors`` with columns:

        * ``OR`` / ``Coef`` -- (transformed) coefficient of the predictor
        * ``LL``, ``UL`` -- (transformed) bounds from ``res.conf_int()``
          at statsmodels' default confidence level
        * ``pvalue`` -- p-value of the predictor
        * ``Diff`` -- mean of the predictor where ``outcome == 1`` minus
          its mean where ``outcome == 0``
        * ``N`` -- number of rows used after dropping missing values
        * ``params``, ``pvalues`` -- dicts of all coefficients / p-values
        * ``res`` -- the fitted results object, or None on failure

    Notes
    -----
    When statsmodels raises ``PerfectSeparationError`` the coefficient and
    its bounds are NaN, the row's ``res`` is None and a message is printed.
    NOTE(review): ``pvalue`` is set to 0 in that case (kept from the
    original implementation) -- confirm that downstream code expects this.
    """
    # Avoid a shared mutable default; also accept tuples or other iterables.
    adj = [] if adj is None else list(adj)

    if logistic:
        family = sm.families.Binomial()
        coefFunc = np.exp
        cols = ['OR', 'LL', 'UL', 'pvalue', 'Diff', 'N']
    else:
        family = sm.families.Gaussian()
        coefFunc = lambda x: x
        cols = ['Coef', 'LL', 'UL', 'pvalue', 'Diff', 'N']

    assoc = np.zeros((len(predictors), len(cols)))
    params = []
    pvalues = []
    resObj = []
    for i, predc in enumerate(predictors):
        # De-duplicate (the predictor may also appear in adj) while keeping
        # a deterministic column order; set() ordering varies between runs.
        exogVars = list(dict.fromkeys([predc] + adj))
        tmp = df[[outcome] + exogVars].dropna()
        model = sm.GLM(endog=tmp[outcome].astype(float),
                       exog=sm.add_constant(tmp[exogVars].astype(float)),
                       family=family)

        # These two columns do not depend on whether the fit succeeds.
        assoc[i, 4] = (tmp[predc].loc[tmp[outcome] == 1].mean()
                       - tmp[predc].loc[tmp[outcome] == 0].mean())
        assoc[i, 5] = tmp.shape[0]

        try:
            res = model.fit()
        except sm.tools.sm_exceptions.PerfectSeparationError:
            assoc[i, 0] = np.nan
            assoc[i, 3] = 0
            assoc[i, 1:3] = [np.nan, np.nan]
            params.append({name: np.nan for name in exogVars})
            pvalues.append({name: np.nan for name in exogVars})
            resObj.append(None)
            print('PerfectSeparationError: %s with %s' % (predc, outcome))
        else:
            assoc[i, 0] = coefFunc(res.params[predc])
            assoc[i, 3] = res.pvalues[predc]
            assoc[i, 1:3] = coefFunc(res.conf_int().loc[predc])
            params.append(res.params.to_dict())
            pvalues.append(res.pvalues.to_dict())
            resObj.append(res)

    outDf = pd.DataFrame(assoc[:, :6], index=predictors, columns=cols)
    outDf['params'] = params
    outDf['pvalues'] = pvalues
    outDf['res'] = resObj
    return outDf
# Imports are hoisted above their first use: the original imported pandas
# only after ``pd.DataFrame`` had already been called.
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

mytime = MyPackage.MyClass_Time.MyClass_Time()  # time helper class
myDA = MyPackage.MyClass_DataAnalysis.MyClass_DataAnalysis()  # data-analysis helper class
# ------------------------------------------------------------
Path = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\数据及源代码"
Path2 = "C:\\Users\\i2011\\OneDrive\\Book_Code&Data\\量化投资以python为工具\\习题解答"

# 1. Simple linear regression of y on x.
# x: every fourth year from 1952 through 2012 (16 values); y: one
# observation per year in x.
x = list(range(1952, 2016, 4))
y = (29.3, 28.8, 28.5, 28.4, 29.4, 27.6, 27.7, 27.7,
     27.8, 27.4, 27.8, 27.1, 27.3, 27.1, 27.0, 27.5)
plt.plot(x, y)
plt.show()

# OLS with an explicit intercept column, fitted directly with statsmodels.
model = sm.OLS(y, sm.add_constant(x)).fit()
print(model.summary())

# The same data through the project helper, using a formula string.
# NOTE(review): the meaning of the third argument (True) is defined in
# MyClass_DataAnalysis.ols, which is not visible here.
data = pd.DataFrame({"x": x, "y": y})
myDA.ols("y~x", data, True)

# 2. Scatter plot of FTSE against DAX from the EuStockMarkets data set.
EU = pd.read_csv(Path2 + '/Part2/005/EuStockMarkets.csv')
plt.plot(EU.DAX, EU.FTSE, '.')
plt.xlabel('DAX')
plt.ylabel('FTSE')
# Raw string: "\w" is not a valid escape sequence and triggers a warning in
# recent Python versions. The resulting path is unchanged.
df = pd.read_csv(r"data\world-happiness-report-2021.csv")

y_feature = ["Ladder score"] * 6
features = [
    "Logged GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
]

dependent = df["Ladder score"]
independent = df[features]

# Multiple linear regression with scikit-learn (intercept fitted by default).
model = LinearRegression()
model.fit(independent, dependent)

intercept = model.intercept_
coefficients = model.coef_
print("R2: ", model.score(independent, dependent))
print("Intercept: ", intercept)
print("coefficients: ", coefficients)

# Same regression via OLS for the full summary table. The design matrix
# with the added constant must be the one passed to OLS; otherwise the
# model is fitted without an intercept and disagrees with the fit above.
x = ssm.add_constant(independent)
model = ssm.OLS(dependent, x).fit()
predictions = model.summary()
print(predictions)
def diagnostic_plots(X, y, model_fit=None):
    """
    Reproduce the 4 base diagnostic plots of an OLS model in R.

    https://robert-alvarez.github.io/2018-06-04-diagnostic_plots/

    Four figures are created on the current matplotlib backend: Residuals
    vs Fitted, Normal Q-Q, Scale-Location and Residuals vs Leverage (with
    Cook's distance contours at 0.5 and 1). In each figure the three most
    extreme observations are annotated. Nothing is returned.

    ---
    Inputs:

    X: A pandas DataFrame of the features used in the linear regression
       model. (``pd.concat`` and ``Series.sort_values`` are used below, so
       plain numpy arrays are not supported.)

    y: A pandas Series/DataFrame holding the target variable.

    model_fit [optional]: a fitted statsmodels OLS results object from
       regressing y on X. If not provided, it is fitted here from X and y
       with an added constant.
    """

    def _graph(formula, x_range, label=None):
        """Plot ``formula(x_range)`` as a dashed red line (Cook's distance)."""
        plt.plot(x_range, formula(x_range), label=label,
                 lw=1, ls='--', color='red')

    # Compare against None explicitly instead of relying on the truthiness
    # of a results object.
    if model_fit is None:
        model_fit = sm.OLS(y, sm.add_constant(X)).fit()

    # Combined frame; its last column (the target) is plotted by residplot.
    dataframe = pd.concat([X, y], axis=1)
    smoother_style = {'color': 'red', 'lw': 1, 'alpha': 0.8}

    model_fitted_y = model_fit.fittedvalues
    model_residuals = model_fit.resid
    # Influence measures are computed once and reused.
    influence = model_fit.get_influence()
    # Internally studentized ("standardized") residuals.
    model_norm_residuals = influence.resid_studentized_internal
    # Square root of the absolute standardized residuals.
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
    model_abs_resid = np.abs(model_residuals)
    model_leverage = influence.hat_matrix_diag
    model_cooks = influence.cooks_distance[0]

    # Positional views: np.argsort yields positions, which must not be used
    # as labels on a pandas Series with a non-default index.
    fitted_by_pos = np.asarray(model_fitted_y)
    max_leverage = max(model_leverage)

    # ---- 1. Residuals vs Fitted -------------------------------------
    plt.figure()
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    ax_1 = sns.residplot(x=model_fitted_y, y=dataframe.columns[-1],
                         data=dataframe, lowess=True,
                         scatter_kws={'alpha': 0.5},
                         line_kws=dict(smoother_style))
    ax_1.set_title('Residuals vs Fitted')
    ax_1.set_xlabel('Fitted values')
    ax_1.set_ylabel('Residuals')

    # Annotate the three largest absolute residuals (index labels).
    abs_resid_top_3 = model_abs_resid.sort_values(ascending=False)[:3]
    for i in abs_resid_top_3.index:
        ax_1.annotate(i, xy=(model_fitted_y.loc[i], model_residuals.loc[i]))

    # ---- 2. Normal Q-Q ----------------------------------------------
    QQ = ProbPlot(model_norm_residuals)
    plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)
    ax_2 = plot_lm_2.axes[0]
    ax_2.set_title('Normal Q-Q')
    ax_2.set_xlabel('Theoretical Quantiles')
    ax_2.set_ylabel('Standardized Residuals')

    # Annotate the three largest |standardized residuals|. A point's x
    # coordinate is the theoretical quantile at the residual's rank in
    # ascending order, so large negative residuals are labelled at the
    # lower tail rather than at the upper one.
    abs_norm_resid_top_3 = np.flip(
        np.argsort(np.abs(model_norm_residuals)), 0)[:3]
    resid_rank = np.argsort(np.argsort(model_norm_residuals))
    for i in abs_norm_resid_top_3:
        ax_2.annotate(i, xy=(QQ.theoretical_quantiles[resid_rank[i]],
                             model_norm_residuals[i]))

    # ---- 3. Scale-Location ------------------------------------------
    plot_lm_3 = plt.figure()
    plt.scatter(model_fitted_y, model_norm_residuals_abs_sqrt, alpha=0.5)
    sns.regplot(x=model_fitted_y, y=model_norm_residuals_abs_sqrt,
                scatter=False, ci=False, lowess=True,
                line_kws=dict(smoother_style))
    ax_3 = plot_lm_3.axes[0]
    ax_3.set_title('Scale-Location')
    ax_3.set_xlabel('Fitted values')
    # Raw string: "\s" is not a valid escape sequence.
    ax_3.set_ylabel(r'$\sqrt{|Standardized Residuals|}$')

    abs_sq_norm_resid_top_3 = np.flip(
        np.argsort(model_norm_residuals_abs_sqrt), 0)[:3]
    for i in abs_sq_norm_resid_top_3:
        ax_3.annotate(i, xy=(fitted_by_pos[i],
                             model_norm_residuals_abs_sqrt[i]))

    # ---- 4. Residuals vs Leverage -----------------------------------
    plot_lm_4 = plt.figure()
    plt.scatter(model_leverage, model_norm_residuals, alpha=0.5)
    sns.regplot(x=model_leverage, y=model_norm_residuals,
                scatter=False, ci=False, lowess=True,
                line_kws=dict(smoother_style))
    ax_4 = plot_lm_4.axes[0]
    ax_4.set_xlim(0, max_leverage + 0.01)
    ax_4.set_ylim(-3, 5)
    ax_4.set_title('Residuals vs Leverage')
    ax_4.set_xlabel('Leverage')
    ax_4.set_ylabel('Standardized Residuals')

    # Annotate the three observations with the largest Cook's distance.
    leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
    for i in leverage_top_3:
        ax_4.annotate(i, xy=(model_leverage[i], model_norm_residuals[i]))

    p = len(model_fit.params)  # number of model parameters
    leverage_grid = np.linspace(0.001, max_leverage, 50)
    _graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
           leverage_grid, 'Cook\'s distance')  # 0.5 line
    _graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
           leverage_grid)  # 1 line
    plot_lm_4.legend(loc='upper right')