def whites_test(results):
    """Run White's test for heteroscedasticity and print the outcome.

    Args:
        results: Fitted regression results exposing ``resid`` and
            ``model.exog``.

    Returns:
        The two-row DataFrame that is printed: the labels in the first row
        and the values returned by ``sms.het_white`` in the second.
    """
    names = ['White Statistic', 'p-value', 'f-value', 'f p-value']
    test = sms.het_white(results.resid, results.model.exog)
    white_results = pd.DataFrame([names, test])
    print(white_results)
    # Hand the table back so callers can reuse it, as breusch_pagan_test does.
    return white_results
def breusch_pagan_test(results):
    """Run the Breusch-Pagan test for heteroscedasticity and print the outcome.

    Args:
        results: Fitted regression results exposing ``resid`` and
            ``model.exog``.

    Returns:
        A two-row DataFrame: the labels in the first row and the values
        returned by ``sms.het_breuschpagan`` in the second.
    """
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")  # blank separator line before the table
    names = ['Breusch Pagan Statistics', 'p-value', 'f-value', 'f p-value']
    bp_results = pd.DataFrame([names, test])
    print(bp_results)
    return bp_results
def ARCH(x, y):
    """Fit ``ols(x, y)`` and run the ARCH test on its residuals with one lag.

    Returns:
        A list of ``(label, value)`` pairs built from the ``sms.het_arch``
        output.
    """
    labels = [
        'LM statistic',
        'p-value of LM test',
        'f-statistic of the hypothesis',
        'f p-value',
    ]
    fitted = ols(x, y)
    arch_stats = sms.het_arch(fitted.resid, maxlag=1)
    return lzip(labels, arch_stats)
def Breusch_Goldfrey(x, y):
    """Fit ``ols(x, y)`` and run the Breusch-Godfrey autocorrelation test on it.

    Returns:
        A list of ``(label, value)`` pairs built from the
        ``sms.acorr_breusch_godfrey`` output.
    """
    labels = [
        'LM statistic',
        'p-value of LM test',
        'f-statistic of the hypothesis',
        'f p-value',
    ]
    fitted = ols(x, y)
    bg_stats = sms.acorr_breusch_godfrey(fitted)
    return lzip(labels, bg_stats)
def regress_bp(SP5002):
    """Regress SP500 on four predictors, print the summary and a Breusch-Pagan test.

    Args:
        SP5002: Table with the columns "SP500", "Dividend", "Earnings",
            "Consumer Price Index" and "Long Interest Rate".

    Returns:
        The fitted regression results.
    """
    # The formula below refers to these local names rather than to column
    # names (presumably resolved from this function's scope by the formula
    # engine), so they must not be renamed.
    Y = SP5002["SP500"]
    BetaHAT1 = SP5002["Dividend"]
    BetaHAT2 = SP5002["Earnings"]
    BetaHAT3 = SP5002["Consumer Price Index"]
    BetaHAT4 = SP5002["Long Interest Rate"]

    results = sm.ols(
        formula="Y ~ BetaHAT1 + BetaHAT2 + BetaHAT3 + BetaHAT4",
        data=SP5002,
    ).fit()
    print(results.summary())

    names = [
        'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    test = sms.het_breuschpagan(results.resid, results.model.exog)
    print("")  # blank separator line before the table
    bp_results = pd.DataFrame([names, test])
    print(bp_results)
    return results
def load():
    """
    Load the data and return a Dataset class instance.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.
    """
    frame = _get_data()
    names = frame.columns.tolist()

    # Four string fields followed by 54 '<f8' fields, paired with the
    # column names in order.
    field_types = ['a45', 'a3', 'a40', 'a14'] + ['<f8'] * 54
    dtype = lzip(names, field_types)

    rows = lmap(tuple, frame.values.tolist())
    records = np.array(rows, dtype=dtype).view(np.recarray)
    return du.Dataset(data=records, names=names)
def residuals_vs_fitted(predictions, residuals, out_path=None):
    """Plot fitted values against residuals, next to the residual distribution.

    The left panel is a scatter of predictions vs. residuals; the right panel
    is a horizontal histogram of the residuals overlaid with a zero-mean
    normal density whose spread is taken from the residuals.

    Args:
        predictions: The predictions from a regression
        residuals: The residuals from a regression
        out_path: An optional path to save the graph to

    Returns:
        The residuals vs. fitted graph
    """
    # Jarque-Bera test of normality; only the p-value entry is extracted.
    jb_labels = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    jb_values = sm.stats.jarque_bera(residuals)
    # NOTE(review): p_value is computed but not used anywhere below.
    p_value = lzip(jb_labels, jb_values)[1][1]

    # Grid spanning four standard deviations either side of zero.
    mu = 0
    sigma = math.sqrt(stats.variance(residuals))
    x = np.linspace(mu - 4 * sigma, mu + 4 * sigma, 100)

    fig, ax = plt.subplots(nrows=1, ncols=2,
                           gridspec_kw={'width_ratios': [3, 1]})
    scatter_ax = ax[0]
    hist_ax = ax[1]

    scatter_ax.scatter(predictions, residuals)
    scatter_ax.set_title("Residuals vs. Fitted Values")
    scatter_ax.set_xlabel("Fitted Values")
    scatter_ax.set_ylabel("Residuals")
    scatter_ax.axhline(0, c="k", linewidth=0.5)

    hist_ax.hist(residuals, bins=30, orientation="horizontal")
    # The density goes on a twin x-axis so it keeps its own scale.
    density_ax = hist_ax.twiny()
    density_ax.plot(sci_stats.norm.pdf(x, mu, sigma), x, color="red")
    hist_ax.set_xlabel("Frequency")
    hist_ax.set_title("Residual Distribution")

    fig.tight_layout()
    align_xaxis(hist_ax, 0, density_ax, 0)

    if out_path:
        fig.savefig(out_path)
    return fig
def actual_test(dict_split, params, definition, reg_type):
    """Fit a regression on one split and gather residual diagnostics.

    Args:
        dict_split: Indexable holding the splits; items 0 and 2 are used to
            fit, items 1 and 3 to score (presumably x_train, x_test, y_train,
            y_test - confirm against the caller).
        params: Forwarded to ``ols_reg`` as ``p_params``.
        definition: Returned unchanged.
        reg_type: Forwarded to ``ols_reg`` as ``p_model``.

    Returns:
        Tuple ``(rss, data_reg, reg, definition, hetero, ljung, jarquebera)``.
    """
    data_reg = {'x_data': dict_split[0], 'y_data': dict_split[2]["co"]}
    reg = ols_reg(p_data=data_reg, p_params=params, p_model=reg_type,
                  p_iter=100000)
    pred_test = reg["model"].predict(dict_split[1])

    fit_results = reg['results']
    residuales = fit_results['y_data'] - fit_results['y_data_p']

    # Residual checks (plots)
    vis.residual(residuales=residuales)
    vis.histograma(residuales)

    # Heteroscedasticity
    hetero = check_hetero(residuales)

    # Ljung-Box autocorrelation test
    ljung = acorr_ljungbox(residuales, lags=7, return_df=True)

    # Normality
    jb_labels = ["Jarque-Bera", "Chi2 two tail prob", "Skew", "Kurtosis"]
    jarquebera = lzip(jb_labels, sms.jarque_bera(residuales))

    # Residual sum of squares on the scoring split.
    rss = sum((dict_split[3]["co"] - pred_test)**2)
    return rss, data_reg, reg, definition, hetero, ljung, jarquebera
# One figure per regressor: predictions as a teal line, observed values as
# black dots. To write the figures to disk instead, call
# plt.savefig("stats_gdp.png") / plt.savefig("stats_pop.png") before plt.show().
for regressor in ('GDP', 'Population'):
    plt.plot(df[regressor], predictions, color='teal')
    plt.title("Vehicles vs " + regressor)
    plt.xlabel(regressor)
    plt.plot(df[regressor], y, marker='o', linewidth=0, markersize=1.6,
             color='black')
    plt.show()

# Variance inflation factor for every column of X.
vif_df = pd.DataFrame()
vif_df["VIF Factor"] = [
    variance_inflation_factor(X.values, column_index)
    for column_index in range(X.shape[1])
]
vif_df["features"] = X.columns
print(vif_df)

# Breusch-Pagan test, shown as (label, value) pairs.
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = lzip(name, sm.stats.het_breuschpagan(model.resid, model.model.exog))
pprint(test)
# In[20]:

## Heteroskedasticity
x = reg01.model.data.orig_exog
print(x.head())
print('\n')
print(reg01.resid.head())

white = sm.stats.diagnostic.het_white(reg01.resid, x)
# The two p-values get distinct labels so the pairs are unambiguous.
ret = ['Test Statistic', 'p-Value', 'F Statistic', 'F p-Value']
print('\nWhites Test for Heteroskedasticity')
# Pair labels and values directly: passing an existing zip object to lzip
# would wrap every pair in a 1-tuple.
xzip01 = lzip(ret, white)
xzip01

# In[29]:

## VIF: diagonal of the inverse of corr_m, one entry per variable
indepvar = ['drugabuse', 'alcabuse', 'wage', 'mentalhealth', 'housing']
x = np.diag(np.linalg.inv(corr_m))
xzip = lzip(indepvar, x)
xzip
from statsmodels.graphics import utils

# Restrict to observations that kept a positive robust weight.
weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]

# Weights rescaled to mean one, applied to the diagonal of X @ pinv(X).
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)

# Squared residuals normalised to sum to one.
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()

nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    points=lzip(resid2[idx], hat_matrix_diag),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax,
)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")

# Reference lines at the two means, drawn across the current axis limits.
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)
# OLS fit of futureMargin on the customer-level predictors.
model = smf.ols(
    "futureMargin ~ daysSinceLastOrder + margin + returnRatio"
    " + shareOwnBrand + shareVoucher + shareSale + itemsPerOrder",
    data=Clv,
).fit()
model.summary()

# Probability plot of the residuals.
stats.probplot(model.resid, plot=plt)
plt.title("Model1 Residuals Probability Plot")
# Residuals are normally distributed, hence inference tests can be used.

# Homoscedasticity (constant variance of residuals): Breusch-Pagan test.
TestNames = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
lzip(TestNames, test)

# Hold out the last 20 rows of features and targets for testing.
clv_X_train, clv_X_test = Clv6[:-20], Clv6[-20:]
clv_y_train, clv_y_test = Clv.futureMargin[:-20], Clv.futureMargin[-20:]

# Create linear regression object
regr = sk.linear_model.LinearRegression()
from statsmodels.compat import lzip
from statsmodels.stats import diagnostic as diag
import statsmodels.stats.api as sms
from sklearn.model_selection import train_test_split

# Root mean squared residual.
np.sqrt(np.mean(model.resid**2))  # 244

# Normality of residuals (Anderson-Darling).
stats.anderson(model.resid)
# residuals are normal

# Autocorrelation of residuals (Ljung-Box, one lag).
diag.acorr_ljungbox(model.resid, lags=1)
# pvalue is <0.05, so autocorrelation is present

# Heteroscedasticity (Goldfeld-Quandt).
name = ['F-statistic', 'p-value']
gold_test = sms.het_goldfeldquandt(model.resid, model.model.exog)
lzip(name, gold_test)
# ('F-stat', 0.6722696289421596), ('p-value', 0.9999999999999999)],
# go with null, residuals are homoscedastic: constant variance

# Compare the first few predictions with the actual prices and residuals.
pred_price = model.predict(computer2.drop(['price'], axis=1))
pred_price[0:4]
computer2.price[0:4]
model.resid[0:4]
1499-1787  # scratch arithmetic kept from the interactive session
# except for autocorrelation all assumptions are satisfied

# Splitting data: 70 % train, 30 % test, fixed seed.
train, test = train_test_split(computer2, test_size=0.30, random_state=100)
1837/6122  # scratch arithmetic kept from the interactive session
def ols_test_breusch_pagan(self):
    """Run the Breusch-Pagan test on this object's residuals.

    Reads ``self.residuals`` and ``self.model.model.exog``.

    Returns:
        A list of ``(label, value)`` pairs built from the
        ``sms.het_breuschpagan`` output.
    """
    bp_values = sms.het_breuschpagan(self.residuals, self.model.model.exog)
    labels = [
        'Lagrange multiplier statistic',
        'p-value',
        'f-value',
        'f p-value',
    ]
    return lzip(labels, bp_values)
def harveyCollier(results):
    """Run the Harvey-Collier linearity test on fitted regression results.

    Args:
        results: Fitted regression results passed to
            ``sms.linear_harvey_collier``.

    Returns:
        A list of ``(label, value)`` pairs for the t value and the p value.
    """
    name = ['t value', 'p value']
    test = sms.linear_harvey_collier(results)
    # Return the pairs; without this the function always yields None.
    return lzip(name, test)
# There aren't yet any influence diagnostics as part of RLM, but we can recreate them. (This depends on the status of [issue #888](https://github.com/statsmodels/statsmodels/issues/808))
from statsmodels.graphics import utils

weights = rob_crime_model.weights
idx = weights > 0
# Index the exog array with the plain boolean values of the mask, matching
# the other copies of this snippet in the file.
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)

# Squared residuals normalised to sum to one.
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()

nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    points=lzip(resid2[idx], hat_matrix_diag),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax,
)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")

# Reference lines at the two means, drawn across the current axis limits.
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)
def omniTest(residuals):
    """Run the omnibus normality test on residuals.

    Args:
        residuals: Values passed to ``sms.omni_normtest``.

    Returns:
        A list of ``(label, value)`` pairs for the Chi^2 statistic and its
        two-tail probability.
    """
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(residuals)
    # Return the pairs; without this the function always yields None.
    return lzip(name, test)
def Goldfeld_Quant(X, y):
    """Fit ``ols(X, y)`` and run the Goldfeld-Quandt heteroscedasticity test.

    The test is run two-sided with ``split=0.25`` and ``drop=0.5`` on the
    fitted model's endog and exog arrays.

    Returns:
        A list of ``(label, value)`` pairs for the F statistic and p-value.
    """
    # Use one spelling for the fitted results; assigning to a different name
    # than the one read below raises NameError.
    ols_results = ols(X, y)
    name = ['F statistic', 'p-value']
    test = sms.HetGoldfeldQuandt().run(
        ols_results.model.endog,
        ols_results.model.exog,
        idx=None,
        split=0.25,
        drop=0.5,
        alternative='two-sided',
        attach=True,
    )
    return lzip(name, test)
def Breush_Pagan(X, y):
    """Fit ``ols(X, y)`` and run the Breusch-Pagan heteroscedasticity test.

    Returns:
        A list of ``(label, value)`` pairs built from the
        ``sms.het_breuschpagan`` output.
    """
    ols_results = ols(X, y)
    name = ['LM statistic', 'p-value of LM test',
            'f-statistic of the hypothesis', 'f p-value']
    # Correctly spelled API name, as used by the other Breusch-Pagan callers
    # in this file.
    test = sms.het_breuschpagan(ols_results.resid, ols_results.model.exog)
    return lzip(name, test)
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

plt.figure(figsize=(12, 5))
# Histogram of the residuals; 20 is passed as the second positional argument.
sns.distplot(res_2.resid, 20)
plt.title('Histogram of residuals')
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.grid(True)
plt.show()

# Breusch-Godfrey test for autocorrelation with 10 lags.
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
results1 = sms.acorr_breusch_godfrey(res_2, 10)
print(lzip(name, results1))

# Jarque-Bera normality test.
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
JB, JBpv, skw, kurt = sm.stats.stattools.jarque_bera(res_2.resid)
# Pair these labels with the Jarque-Bera values, not with results1.
print(lzip(name, (JB, JBpv, skw, kurt)))

print(res_2.expected_durations)
print(res_2.conf_int())

# Last 20 in-sample predictions next to the matching observed values.
predict = res_2.predict()
predict = pd.DataFrame(predict.tail(20))
predict.rename(columns={0: 'Predicted'}, inplace=True)
combine = pd.concat([predict, data['forecast_variable'].tail(20)], axis=1)
combine = combine.reset_index()
#%% DATA TRANSFORMATION - LOGARITHMIC
# Some predictors and the target variable have a very skewed distribution,
# so we apply a logarithmic transformation to bring the distributions closer
# to Gaussian. Both predictors and target are logged (a log-log
# transformation).
import numpy as np
from statsmodels.stats.stattools import jarque_bera as jb
from statsmodels.stats.stattools import omni_normtest as omb
from statsmodels.compat import lzip

# Jarque-Bera normality test before the transformation
name = ['Jarque-Bera', 'Chi^2 two-tail probability', 'Skewness', 'Kurtosis']
test_results = jb(data.duration)
lzip(name, test_results)

# The count and length columns are shifted by one before taking the log.
# vote_count
data.vote_count = np.log(data.vote_count + 1)
# comment_count
data.comment_count = np.log(data.comment_count + 1)
# description_length
data.description_length = np.log(data.description_length + 1)
# watch_count
data.watch_count = np.log(data.watch_count + 1)
# duration (no shift applied)
data.duration = np.log(data.duration)

# Run the test again on the transformed duration
test_results = jb(data.duration)
lzip(name, test_results)
# Much improved.
from statsmodels.stats.outliers_influence import OLSInfluence

dat = pd.read_csv(url)

# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

# Inspect the results
print(results.summary())

# ## Normality of the residuals

# Jarque-Bera test:
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)

# Omni test:
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
lzip(name, test)

# ## Influence tests
#
# Once created, an object of class ``OLSInfluence`` holds attributes and
# methods that allow users to assess the influence of each observation.
# For example, we can compute and extract the first few rows of DFbetas by:
test_class = OLSInfluence(results)
def goldfeldQuandtTest(residuals, exogVars):
    """Run the Goldfeld-Quandt heteroscedasticity test.

    Args:
        residuals: First argument passed to ``sms.het_goldfeldquandt``.
        exogVars: Second argument passed to ``sms.het_goldfeldquandt``.

    Returns:
        A list of ``(label, value)`` pairs for the F statistic and p-value.
    """
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(residuals, exogVars)
    # Return the pairs; without this the function always yields None.
    return lzip(name, test)
print(session.s12.describe())
print(session.s2.describe())
print(session.on.describe())

# Compute and compare various statistics
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

# --- s1 ---
print(sm.tsa.adfuller(session.s1, regression='nc')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s1, regression='c')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s1, regression='ct')[1])  # [1] is the p-value of the test
# mean / std * sqrt(count)
print(session.s1.mean() / session.s1.std() * np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ', lzip(estimator, test))

# --- s12 ---
print(sm.tsa.adfuller(session.s12, regression='nc')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s12, regression='c')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s12, regression='ct')[1])  # [1] is the p-value of the test
# Every term comes from s12, mirroring the s1 and s2 sections.
print(session.s12.mean() / session.s12.std() * np.sqrt(session.s12.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ', lzip(estimator, test))

# --- s2 ---
print(sm.tsa.adfuller(session.s2, regression='nc')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s2, regression='c')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s2, regression='ct')[1])  # [1] is the p-value of the test
print(session.s2.mean() / session.s2.std() * np.sqrt(session.s2.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s2)
def jarqueBeraTest(residuals):
    """Run the Jarque-Bera normality test on residuals.

    Args:
        residuals: Values passed to ``sms.jarque_bera``.

    Returns:
        A list of ``(label, value)`` pairs for the statistic, its
        probability, the skew and the kurtosis.
    """
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(residuals)
    # Return the pairs; without this the function always yields None.
    return lzip(name, test)
def linear_regression_analysis(linear_regression):
    """
    Compute and plot a complete analysis of a linear regression computed with Stats Models.

    Prints the Breusch-Pagan and Shapiro-Wilk results, then draws one figure
    each for: residual histogram, residuals vs. normal draws, fitted vs.
    residuals, actual vs. fitted, leverage, studentized residuals and Cook's
    distance. All figures are shown with a single ``plt.show()`` at the end.

    Args:
        linear_regression (Stats Models Results): the result obtained with Stats Models.
    """
    # Data
    resid = linear_regression.resid_pearson.copy()
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)
    p = exog.shape[1]  # Number of features
    n = len(resid)  # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"
    threshold_color = "#d35400"

    ##########################################################################
    # Statistical tests
    ##########################################################################

    # Homoscedasticity - Breusch-Pagan test
    # NOTE(review): the first label reads 'Lagrande'; kept as-is because it
    # is printed output.
    names = ['Lagrande multiplier statistic', 'p-value', 'f-value', 'f p-value']
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##########################################################################
    # Shape analyses
    ##########################################################################

    # Histogram of the residuals
    # Keep residuals strictly inside (-5, 5): the lower bound is applied to
    # the already upper-bounded data so both limits take effect.
    data = resid
    data_filter = data[data < 5]
    data_filter = data_filter[data_filter > -5]
    ratio = len(data_filter) / len(data)
    plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot): sorted residuals against
    # sorted random standard-normal draws of the same size.
    data = pd.Series(resid).sort_values()
    normal = pd.Series(np.random.normal(size=len(data))).sort_values()
    plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4, 4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    # Fitted vs residuals
    plt.subplots()
    plt.scatter(fitted_values, resid, alpha=0.5, c=color1)
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.title("Fitted vs Residuals")

    # Actual vs predicted
    plt.subplots()
    plt.scatter(endog, fitted_values, c=color1, alpha=0.5)
    plt.plot(endog, endog, c=color2)
    plt.xlabel("Actual values")
    plt.ylabel("Fitted values")
    plt.title("Acutal vs Predict")

    ##########################################################################
    # Outlier analysis
    ##########################################################################

    # Leverage (h_ii, diagonal of the hat matrix): atypical individuals.
    # ratio is the share of observations at or below the 2*p/n threshold.
    data = influences.hat_matrix_diag
    seuil = 2*p/n
    len_data = len(data)
    ratio = len(data[data <= seuil]) / len_data
    plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil), c=threshold_color)
    plt.ylabel("Leverage values (hii)")
    plt.title(f"Leviers avec seuil à 2*p/n ({ratio:.2%})")

    # Studentized residuals: individuals poorly represented by the model.
    # ratio is the share of observations inside [-2, 2].
    data = influences.resid_studentized_internal
    len_data = len(data)
    data_filter = data[data <= 2]
    data_filter = data_filter[data_filter >= -2]
    ratio = len(data_filter) / len_data
    plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (2, 2), c=threshold_color)
    plt.plot((0, len_data), (-2, -2), c=threshold_color)
    plt.ylabel("Studentized Residuals")
    plt.title(f"Résidus studentisés avec seuil à 2 et -2 ({ratio:.2%})")

    # Cook's distances: outliers whose removal strongly influences the model.
    # ratio is the share of observations at or below the 4/(n-p) threshold.
    data = influences.cooks_distance[0]
    seuil = 4/(n-p)
    len_data = len(data)
    ratio = len(data[data <= seuil]) / len_data
    plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil))
    plt.ylabel("Cook Distance")
    plt.title(f"Distances de Cook avec seuil à 4/(n-p) ({ratio:.2%})")

    # Show every figure created above
    plt.show()
#================================================
#================================================
#======================Normality of the residuals
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

sns.distplot(np.array(residual))
plt.show()
sns.distplot(residual_z)
plt.show()

#======================Jarque-Bera test:
name1 = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test1 = sms.jarque_bera(lr.resid)
lzip(name1, test1)
# null hypothesis: the data is normally distributed.

#======================Omni test:
name2 = ['Chi^2', 'Two-tail probability']
test2 = sms.omni_normtest(lr.resid)
lzip(name2, test2)

#================================================
#================================================
#=========================Heteroskedasticity test
# The Breusch-Pagan section below is disabled. It is kept as line comments
# because the triple-quoted string that used to disable it is never closed
# in this part of the file.
#======================Breusch-Pagan test:
# name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
# test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
# lzip(name3, test3)
def diagnostic_plots(self, linear_model):
    """
    Validate the assumptions of a fitted linear model.

    Calls the four ``check_*`` helpers on this object, then runs the
    numerical tests and collects their results in a dictionary.

    :param linear_model: Linear Model Fit on the Data
    :return: tuple ``(summary, json_result)``: the model summary and a JSON
        string holding the result of each diagnostic test
    """
    diagnostic_result = {}
    summary = linear_model.summary()

    # fitted values and raw residuals
    fitted_y = linear_model.fittedvalues
    residuals = linear_model.resid

    # One influence object supplies all three influence-based quantities.
    influence = linear_model.get_influence()
    # normalized residuals
    residuals_normalized = influence.resid_studentized_internal
    # square root of the absolute normalized residuals
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(residuals_normalized))
    # leverage
    leverage = influence.hat_matrix_diag
    # cook's distance
    cooks = influence.cooks_distance[0]

    self.check_linearity_assumption(fitted_y, residuals)
    self.check_residual_normality(residuals_normalized)
    self.check_homoscedacticity(fitted_y, model_norm_residuals_abs_sqrt)
    self.check_influcence(leverage, cooks, residuals_normalized)

    # 1. Non-Linearity Test
    # Best effort: if the test fails, its error message is stored instead.
    try:
        name = ['F value', 'p value']
        test = sms.linear_harvey_collier(linear_model)
        linear_test_result = lzip(name, test)
    except Exception as e:
        linear_test_result = str(e)
    diagnostic_result['Non_Linearity_Test'] = linear_test_result

    # 2. Heteroskedasticity Test
    name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    test = sms.het_breuschpagan(linear_model.resid, linear_model.model.exog)
    diagnostic_result['Hetroskedasticity_Test'] = lzip(name, test)

    # 3. Normality of Residuals
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(linear_model.resid)
    diagnostic_result['Residual_Normality_Test'] = lzip(name, test)

    # 4. Multicollinearity Test (condition number of the design matrix)
    test = np.linalg.cond(linear_model.model.exog)
    diagnostic_result['MultiCollnearity_Test'] = [('condition no', test)]

    # 5. Residuals Auto-Correlation Tests
    # NOTE(review): the stored label is 'p value' but the value comes from
    # sms.durbin_watson; label kept because it is part of the JSON output.
    test = sms.durbin_watson(linear_model.resid)
    diagnostic_result['Residual_AutoCorrelation_Test'] = [('p value', test)]

    json_result = json.dumps(diagnostic_result)
    return summary, json_result
# Correlation of every column with the sale price, strongest first
al_cor = sold_prediction.corr()
al_cor = al_cor.unstack()
al_cor["sale_price_raw_m"].sort_values(ascending=False)

# Assumption of Independent Errors
statsmodels.stats.stattools.durbin_watson(lm2.resid)

# Assumption of Normality of the Residuals
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(lm2.resid)
print(lzip(name, test))

# Distribution of the raw sale price; the title names the raw variable
# because the log column is only created further down.
sold_prediction['sale_price_raw_m'].plot(kind='hist',
                                         title='Sale Price Distribution')

# Distribution of the log of the sale price
sold_prediction['sale_price_raw_m_log'] = np.log(
    sold_prediction['sale_price_raw_m'])
sold_prediction['sale_price_raw_m_log'].plot(
    kind='hist', title='Log of Sale Price Distribution')
# In[115]:

# Compute and compare various statistics
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

print('adf nc', sm.tsa.adfuller(session.s1, regression='nc')[1])  # [1] is the p-value of the test
print('adf c', sm.tsa.adfuller(session.s1, regression='c')[1])  # [1] is the p-value of the test
print('adf ct', sm.tsa.adfuller(session.s1, regression='ct')[1])  # [1] is the p-value of the test
# mean / std * sqrt(count)
print(session.s1.mean() / session.s1.std() * np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ', lzip(estimator, test))

# In[116]:

print(sm.tsa.adfuller(session.s12, regression='nc')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s12, regression='c')[1])  # [1] is the p-value of the test
print(sm.tsa.adfuller(session.s12, regression='ct')[1])  # [1] is the p-value of the test
# Every term comes from s12, mirroring the s1 cell above.
print(session.s12.mean() / session.s12.std() * np.sqrt(session.s12.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ', lzip(estimator, test))

# In[117]:
# #888](https://github.com/statsmodels/statsmodels/issues/808))
from statsmodels.graphics import utils

# Restrict to observations that kept a positive robust weight.
weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]

# Weights rescaled to mean one, applied to the diagonal of X @ pinv(X).
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)

# Squared residuals normalised to sum to one.
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()

nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(
    range(nobs),
    labels=rob_crime_model.model.data.row_labels[idx],
    points=lzip(resid2[idx], hat_matrix_diag),
    offset_points=[(-5, 5)] * nobs,
    size="large",
    ax=ax,
)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")

# Reference lines at the two means, drawn across the current axis limits.
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0, 0)
for d in range(len(Direction)): print(files[i][:-4] + Direction[d]) sheet = files[i][:-4] + Direction[d] dat = pd.read_excel('TempVDistance 2.xlsx', sheetname=sheet) Dis = dat['Distance'].tolist() T = dat['Temp'].tolist() # Fit regression model (using the natural log of one of the regressaors) results = smf.ols('Temp ~ Distance', data=dat).fit() name = [ 'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value' ] #BP Test for HeteroScadicity test = sms.het_breushpagan(results.resid, results.model.exog) p_value = lzip(name, test)[1][1] #print(p_value) if p_value < .05: Dis = sm.add_constant(Dis) ols_resid = sm.OLS(T, Dis).fit().resid res_fit = sm.OLS(ols_resid[1:], ols_resid[:-1]).fit() rho = res_fit.params order = toeplitz(range(len(ols_resid))) sigma = rho**order gls_model = sm.GLS(T, Dis, sigma=sigma) gls_results = gls_model.fit() ws.cell(row=3 * i + 1, column=d + 2).value = gls_results.params[1] ws.cell(row=3 * i + 2, column=d + 2).value = gls_results.rsquared ws.cell(row=3 * i + 3, column=d + 2).value = gls_results.pvalues[1] else:
# %% [code]
# Residuals against predictions, with a horizontal reference line at zero.
p = sns.scatterplot(y_pred, residuals)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
p = sns.lineplot([0, 26], [0, 0], color='blue')
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

# %% [code]
# Goldfeld-Quandt test
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# %% [code]
# Bartlett test
from scipy.stats import bartlett

test = bartlett(X_train, residuals)
print(test)

# %% [markdown]
# ## <a id="normal">4. Check for Normality of error terms/residuals</a>

# %% [code]
p = sns.distplot(residuals, kde=True)
p = plt.title('Normality of error terms/residuals')

# %% [markdown]
# ## <a id="auto">5. No autocorrelation of residuals</a>
def linear_regression_analysis(linear_regression):
    """
    Compute and plot a complete analysis of a linear regression computed with Stats Models.

    Prints the Breusch-Pagan and Shapiro-Wilk results, then draws a histogram
    of the residuals and a plot of the residuals against normal draws, and
    shows both figures.

    Args:
        linear_regression (Stats Models Results): the result obtained with Stats Models.
    """
    # Data
    resid = linear_regression.resid_pearson.copy()
    exog = linear_regression.model.exog

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##########################################################################
    # Statistical tests
    ##########################################################################

    # Homoscedasticity - Breusch-Pagan test
    # NOTE(review): the first label reads 'Lagrande'; kept as-is because it
    # is printed output.
    names = [
        'Lagrande multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##########################################################################
    # Shape analyses
    ##########################################################################

    # Histogram of the residuals
    # Keep residuals strictly inside (-5, 5): the lower bound is applied to
    # the already upper-bounded data so both limits take effect.
    data = resid
    data_filter = data[data < 5]
    data_filter = data_filter[data_filter > -5]
    ratio = len(data_filter) / len(data)
    plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot): sorted residuals against
    # sorted random standard-normal draws of the same size.
    data = pd.Series(resid).sort_values()
    normal = pd.Series(np.random.normal(size=len(data))).sort_values()
    plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4, 4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    # Show every figure created above
    plt.show()