def _detect_outliers(self, max_prob):
    """Detects outlier time points.

    Find the dates in the data set that are recommended to be removed as
    outliers.

    Args:
      max_prob: (float between 0 and 1) Maximum acceptable probability of
        having observed percentile of maximum studentized residual be
        greater than the reference distribution.

    Returns:
      A list of dates (in the data set) that were detected to be outliers.
    """
    excluded_dates = []
    while True:
        data_subset = self._analysis_data.drop(excluded_dates)
        if data_subset.shape[0] == 0:
            break
        reg_fit = smf.ols('y ~ x', data=data_subset).fit()
        absresid = abs(
            OLSInfluence(reg_fit).get_resid_studentized_external())
        pretest_len = data_subset.shape[0] - len(excluded_dates)
        beta_quantile = stats.beta.ppf(1 - max_prob, pretest_len, 1)
        threshold = stats.t.ppf((1 + beta_quantile) / 2, df=pretest_len - 3)
        max_resid = max(absresid)
        if max_resid < threshold:
            break
        exclude_date = list(data_subset.index[absresid == max_resid])
        excluded_dates.extend(exclude_date)
    return excluded_dates
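# A minimal, self-contained sketch of the same iterative rule outside the
# class (simplified: the sample size is taken directly from the remaining
# subset, and the toy frame `df` is hypothetical). It plants one outlier
# and checks that the loop flags it.
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

rng = np.random.default_rng(0)
df = pd.DataFrame({'x': np.arange(50, dtype=float)})
df['y'] = 2.0 * df['x'] + rng.normal(scale=1.0, size=50)
df.loc[10, 'y'] += 25  # plant a single outlier

max_prob = 0.05
excluded = []
while True:
    subset = df.drop(excluded)
    fit = smf.ols('y ~ x', data=subset).fit()
    absresid = abs(OLSInfluence(fit).get_resid_studentized_external())
    n = subset.shape[0]
    beta_quantile = stats.beta.ppf(1 - max_prob, n, 1)
    threshold = stats.t.ppf((1 + beta_quantile) / 2, df=n - 3)
    if absresid.max() < threshold:
        break
    excluded.extend(subset.index[absresid == absresid.max()])
print(excluded)  # expected to flag index 10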
def _setup_cooks_distance(self, ols_results):
    ols_influence = OLSInfluence(ols_results)
    cooks_distance, _ = ols_influence.cooks_distance
    dates_index = self.input_data.regressors_df.index
    self.cooks_distance_tms = QFSeries(data=cooks_distance,
                                       index=dates_index.copy())
    self.ols_influence = ols_influence
def sm_lin_fit_diagnostics(model, max_leverage=0.3, cook_th=0.25,
                           cook_labels=None):
    """
    Make diagnostic plots of a linear regression.

    Parameters
    ----------
    model: statsmodels regression results
        E.g. returned by `sm_lin_fit`.
    max_leverage: float
        Upper limit of the leverage range over which the Cook's distance
        contour lines are drawn.
    cook_th: float
        Cook's distance above which points in the leverage panel are labeled.
    cook_labels: sequence of str or None
        Labels for the observations, used to annotate high-influence points.

    Returns
    -------
    pyplot.Axes
        Axes on which the diagnostics are drawn.
    np.ndarray
        Cook's distance of each observation.
    """
    def cook_distance(residuals, leverage, k_vars):
        return residuals**2 * leverage / (1 - leverage) / k_vars

    influence = OLSInfluence(model)
    k_vars = influence.k_vars
    standardized_resid = influence.resid_studentized_internal
    leverage = influence.hat_matrix_diag

    # Init plot
    gs_kw = {'hspace': 0.35, 'wspace': 0.35, 'right': 0.95, 'left': 0.1}
    _, ax = plt.subplots(2, 2, figsize=(8, 8), gridspec_kw=gs_kw)

    # Residuals vs. predicted panel
    ax[0, 0].plot(model.fittedvalues, model.resid, 'ko')
    ax[0, 0].set(xlabel='Fitted values', ylabel='Residuals')

    # Q-Q plot panel
    sm.qqplot(model.resid, ax=ax[0, 1], line='r')

    # Scale-location panel: square root of |standardized residuals|,
    # matching the axis label
    ax[1, 0].plot(model.fittedvalues, np.sqrt(np.abs(standardized_resid)), 'ko')
    ax[1, 0].set(xlabel='Fitted values',
                 ylabel=r'$|$Standardized residuals$|^{\frac{1}{2}}$')

    # Leverage panel
    ax[1, 1].plot(leverage, standardized_resid, 'ko')
    ax[1, 1].set(xlabel='Leverage', ylabel='Standardized residuals')
    if cook_labels is not None:
        for n_, l_, r_ in zip(cook_labels, leverage, standardized_resid):
            if cook_distance(r_, l_, k_vars) > cook_th:
                ax[1, 1].text(l_, r_, n_)

    # --- Cook's distance contour lines
    x = np.linspace(0, max_leverage, 100)
    y = np.linspace(-6, 6, 100)
    X, Y = np.meshgrid(x, y)
    C = cook_distance(Y, X, k_vars)
    ctr = ax[1, 1].contour(X, Y, C, [0.5, 1], colors='r', linestyles='--')
    plt.clabel(ctr, fmt='%.1f')

    return ax, cook_distance(standardized_resid, leverage, k_vars)
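# Usage sketch (the data is synthetic and the call illustrative): fit a
# one-regressor OLS model with an intercept, then draw the four diagnostic
# panels defined above.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
x = rng.normal(size=100)
y = 1.5 * x + rng.normal(size=100)
fit = sm.OLS(y, sm.add_constant(x)).fit()
ax, cooks_d = sm_lin_fit_diagnostics(fit, cook_labels=[str(i) for i in range(100)])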
def plot_influence(model):
    residuals = pd.Series(model.resid, name="Residuals")
    # hat_matrix_diag is the leverage (diagonal of the hat matrix);
    # OLSInfluence.influence is a different quantity
    leverage = pd.Series(OLSInfluence(model).hat_matrix_diag, name="Leverage")
    _ = sns.regplot(x=residuals, y=leverage, fit_reg=False)
    plt.show()
    sm.graphics.influence_plot(model, alpha=0.05, criterion="cooks")
    plt.show()
    return leverage
def conf_pred_band_ex(regress_ex, poly, model, alpha=0.05):
    # Re-use the regression formula and build the design matrix for the new
    # support points: every term corresponds to one column.
    poly_ex = ols(poly.formula, regress_ex)
    x0 = poly_ex.exog
    # Compute the confidence and prediction bands. Take the statistics from
    # `model`, because that is the fit of the original regression.
    infl = OLSInfluence(model)
    d = np.dot(x0, np.dot(infl.results.model.normalized_cov_params, x0.T))
    tppf = stats.t.isf(alpha / 2, model.df_resid)
    lconf_ex = tppf * np.sqrt(np.diag(d) * model.mse_resid)
    lprog_ex = tppf * np.sqrt((1 + np.diag(d)) * model.mse_resid)
    return lconf_ex, lprog_ex
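# Hypothetical usage sketch: `poly` is an unfitted formula-based OLS model
# (so that `poly.formula` exists), `model` is its fit, and `regress_ex`
# holds the new support points. The dummy 'y' column only exists so the
# formula can be evaluated on the new data.
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence

rng = np.random.default_rng(2)
data = pd.DataFrame({'x': np.linspace(0, 10, 30)})
data['y'] = 1.0 + 0.5 * data['x'] + rng.normal(scale=0.3, size=30)

poly = ols('y ~ x', data)
model = poly.fit()

regress_ex = pd.DataFrame({'x': np.linspace(0, 10, 100), 'y': 0.0})
lconf_ex, lprog_ex = conf_pred_band_ex(regress_ex, poly, model)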
def get_cooks_d(lm):
    vals = OLSInfluence(lm).summary_frame()
    cooks_d = vals['cooks_d'].values
    return cooks_d
def get_standard_residuals(lm):
    vals = OLSInfluence(lm).summary_frame()
    std_resid = vals['standard_resid'].values
    return std_resid
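# Usage sketch for the two helpers above, on a synthetic fit (the names
# `lm`, `x`, `y` are illustrative):
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
x = rng.normal(size=60)
y = 0.7 * x + rng.normal(size=60)
lm = sm.OLS(y, sm.add_constant(x)).fit()

cooks_d = get_cooks_d(lm)               # one Cook's distance per observation
std_resid = get_standard_residuals(lm)  # one standardized residual per observation
print(cooks_d.shape, std_resid.shape)   # (60,) (60,)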
if not coverage:
    if varflag:
        if 'VARTRUE' not in d['varflag']:
            # code here in case we decide to downweight differently later
            coverage = [0]
    else:
        coverage = [0]
ys.append(sum(coverage))

X['intercept'] = np.ones(len(ys))
if removesyn:
    X.pop('syn', None)
X = pd.DataFrame(X)
results = sm.OLS(ys, X, hasconst=True).fit()
resid = OLSInfluence(results).get_resid_studentized_external()

# variables = {}
# variables['cpg'] = X['CpG']
# variables['cov'] = ys
# variables['resid'] = resid
# variables['rawresid'] = results.resid
# variables['genes'] = genes
# variables['gerp'] = gerp
# variables['intercept'] = results.params['intercept']
# variables['cpgcoef'] = results.params['CpG']
# pickle.dump(variables, open("var.pickle", "wb"))

lowestresidual = np.min(resid) - .001
# for i, row in enumerate(genes):
#     if "VARTRUE" in row[7] and varflag:  # row[7] is varflag
#         resid[i] = lowestresidual
ax.plot(x, predictions.obs_ci_lower, color='0.75', label="Prediction Interval")
ax.plot(x, predictions.obs_ci_upper, color='0.75', label="")

# plot the high and low mean confidence intervals
ax.plot(x, predictions.mean_ci_lower, color='r', label="Predicted Mean CI")
ax.plot(x, predictions.mean_ci_upper, color='r', label="")
ax.legend(loc='best')

plt.xlabel('LSTAT')
plt.ylabel('MEDV')
plt.savefig(PATH + 'medv.png', dpi=300)
plt.close()

# We need this for the leverage and studentized residuals calculations.
from statsmodels.stats.outliers_influence import OLSInfluence

influence = OLSInfluence(lm_fit)
leverage = influence.hat_matrix_diag
stud_res = influence.resid_studentized_external

# Create plots of residuals
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot the residual for each fitted value
ax1.scatter(lm_fit.fittedvalues, lm_fit.resid,
            facecolors='none', edgecolors='b')
ax1.set_xlabel('fitted values')
ax1.set_ylabel('residuals')

# The residual plot indicates significant nonlinearity (a U-shaped pattern is clear)
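# ax2 is created above but never drawn on in this excerpt; a plausible
# follow-up (mirroring the companion snippet later in this collection) plots
# the studentized residuals against the fitted values:
ax2.scatter(lm_fit.fittedvalues, stud_res, facecolors='none', edgecolors='b')
ax2.set_xlabel('fitted values')
ax2.set_ylabel('studentized residuals')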
# You can grab individual statistics from the model by accessing result
# attributes such as .resid and .fvalue

# Create a dataframe of values for prediction. We will predict medv at lstat
# values of 5, 10 and 15, as well as compute our confidence interval.
new = pd.DataFrame([[1, 5], [1, 10], [1, 15]], columns=['Intercept', 'lstat'])
print(predict(result, new))

# Plot the lstat and medv data in boston. regplot automatically produces an
# OLS fit; fit_reg=True adds a regression line estimated independently of our model.
# sns.regplot('lstat', 'medv', boston, line_kws={"color": 'r'}, ci=100, fit_reg=True)  # data plot with estimated regression line

# Pull out the fitted values and residuals
fitted_values = pd.Series(result.fittedvalues, name="Fitted Values")
residuals = pd.Series(result.resid, name="Residuals")
# sns.regplot(fitted_values, residuals, fit_reg=False)  # residuals plot

# Look for high leverage points
from statsmodels.stats.outliers_influence import OLSInfluence

# Normalized residuals can be retrieved with result.resid_pearson
s_residuals = pd.Series(result.resid_pearson, name="S. Residuals")
# hat_matrix_diag holds the leverage (diagonal of the hat matrix)
leverage = pd.Series(OLSInfluence(result).hat_matrix_diag, name="Leverage")
sns.regplot(x=leverage, y=s_residuals, fit_reg=False)
plt.show()
# plot the high and low prediction intervals
ax.plot(x, predictions.obs_ci_lower, color='0.75', label="Prediction Interval")
ax.plot(x, predictions.obs_ci_upper, color='0.75', label="")

# plot the high and low mean confidence intervals
ax.plot(x, predictions.mean_ci_lower, color='r', label="Predicted Mean CI")
ax.plot(x, predictions.mean_ci_upper, color='r', label="")
ax.legend(loc='best')
plt.xlabel('LSTAT')
plt.ylabel('MEDV')

# For checking linearity, homoskedasticity, and outliers
influence = OLSInfluence(fit_model)
leverage = influence.hat_matrix_diag
studentized_res = influence.resid_studentized_external

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plotting the residual for each fitted value
ax1.scatter(fit_model.fittedvalues, fit_model.resid,
            facecolors='none', edgecolors='b')
ax1.set_xlabel('fitted values')
ax1.set_ylabel('residuals')

# Plotting the studentized residuals
ax2.scatter(fit_model.fittedvalues, studentized_res,
            facecolors='none', edgecolors='b')
def get_cooks_d(self):
    vals = OLSInfluence(self.lm).summary_frame()
    cooks_d = vals["cooks_d"].values
    return cooks_d
def get_standard_residuals(self):
    vals = OLSInfluence(self.lm).summary_frame()
    std_resid = vals["standard_resid"].values
    return std_resid
residuals = df_flow.obs - obs_predicted

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import OLSInfluence

# Cook's distance threshold: a common rule of thumb is 4 / n
threshold = 4 / len(df_flow)
print("Cook's distance threshold:", round(threshold, 2))

# fit the regression model using the statsmodels library
f = 'obs ~ sim'
model = ols(formula=f, data=df_flow).fit()

# calculate the Cook's distance - the OLSInfluence object contains multiple influence measures
cook_distance = OLSInfluence(model).cooks_distance
(distance, p_value) = cook_distance

# Draw the graph
plt.figure(figsize=(7, 7), edgecolor='black')
# scatter plot - x axis: independent variable (sim), y axis: dependent variable (obs);
# size and color of the markers reflect each point's Cook's distance
sns.scatterplot(x=df_flow.sim, y=df_flow.obs, hue=distance, size=distance,
                sizes=(50, 200), edgecolor='black', linewidth=1)
# labels and title
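# Short follow-up sketch: list the observations whose Cook's distance
# exceeds the 4/n rule of thumb computed above.
influential = df_flow[distance > threshold]
print(influential)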
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

n = 200
x1 = np.random.uniform(-10, 10, n)
x2 = np.random.uniform(-4, 4, n)
x3 = np.random.uniform(-2, 8, n)
y = (2.89 * x1**2 + 4.33 * x2**2 + 6.1 * x1 * x2 + 5.9 * x2 * x3
     + np.random.normal(size=n))
# Build a named DataFrame so the formula below can refer to the columns;
# x3 must be included because the formula uses it.
data = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'y': y})


def build_formula(label: str, features: str) -> str:
    featlist = features.split(',')
    quads = ' + '.join(map(lambda feat: 'I(' + feat + ' ** 2)', featlist))
    ints = ' + '.join(
        map(lambda feat_pair: 'I(%s * %s)' % (feat_pair[0], feat_pair[1]),
            combinations(featlist, 2)))
    return "%s ~ %s + %s" % (label, quads, ints)


label = 'y'
features = 'x1,x2,x3'
formula = build_formula(label, features)
res = sm.OLS.from_formula(formula, data=data).fit()
print(res.params)
rst = OLSInfluence(res).summary_frame().student_resid
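# For reference, the helper expands the label and feature list into a patsy
# formula with all squares and pairwise interactions:
print(build_formula('y', 'x1,x2,x3'))
# y ~ I(x1 ** 2) + I(x2 ** 2) + I(x3 ** 2) + I(x1 * x2) + I(x1 * x3) + I(x2 * x3)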
temp_index = ft.studentized_residual(exp['Income_ln'], exp['avg_exp_ln'],
                                     ['Income_ln'], 'avg_exp_ln', num=2)
print(temp_index)
exp.loc[temp_index]

# In[]
# Outlier detection on the feature
ft.outlier_detection(exp, 'Income_ln', exp[['avg_exp_ln']], 'avg_exp_ln')

# In[23]:
# 7.3.2.2 The statsmodels package provides more indicators for influential points
from statsmodels.stats.outliers_influence import OLSInfluence

# Uses the same ln(exp) ~ ln(Income) model to compute the influence measures
OLSInfluence(ana3).summary_frame().head()

# 7.3.3 Multicollinearity analysis: variance inflation factor
# Adding variables: after the single-variable linear regressions we have a
# basic sense of the model, so we can now add more continuous explanatory
# variables. Before adding a variable, mind its functional form. For example,
# the local average house price and the local average income behave like
# personal income, so they also need a log transform.

# In[24]:
# exp2 is the data set with the two influential points already removed
exp2['dist_home_val_ln'] = np.log(exp2['dist_home_val'])      # average house price in the neighborhood (10k CNY)
exp2['dist_avg_income_ln'] = np.log(exp2['dist_avg_income'])  # local average income

# Fit the linear regression model with the ols class.
# First pass: Income_ln and dist_avg_income_ln are strongly correlated, so one
# of them must be dropped (based on the variance inflation factor). R-squared = 0.553
ana5 = ols(
    '''avg_exp_ln ~ Income_ln + dist_home_val_ln + dist_avg_income_ln''',
    exp2).fit()
# Second pass
print("Normality (Jarque-Bera P-value)", round(jarque_bera(reg.resid)[1], 3)) print("Homoscedasticity (Breusch-Pagan P-value)", round(het_breuschpagan(reg.resid, reg.model.exog)[3], 3)) print() outlier = pd.DataFrame(reg.outlier_test(method="bonf", alpha=0.05)) outlier = outlier.rename(columns={ "student_resid": "resid", "unadj_p": "unadj_p", "bonf(p)": "bonf_p" }) print(outlier[outlier.bonf_p < 0.05]) print() leverage = OLSInfluence(reg).summary_frame().loc[:, ["hat_diag"]] print(leverage[leverage.hat_diag > 0.2]) print() influence = OLSInfluence(reg).summary_frame().loc[:, ["cooks_d"]] print(influence[influence.cooks_d > (4 / (len(df) - len(df.columns) - 1))]) print() fig, ax = plt.subplots() ax.scatter(x=df["Price"], y=reg.predict(sm.add_constant(df.ix[:, :-1])), c="Black", s=9) ax.set(xlim=(-4.5, -0.5), ylim=(-4.5, -0.5)) ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=1, color="Red", linestyle="--") plt.xlabel("Actual Price", fontsize=10)
def residual_analysis(self):
    # Use a distinct attribute name: assigning to `self.residual_analysis`
    # would shadow this method on the instance after the first call.
    self.residual_summary = OLSInfluence(self.results).summary_frame()
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)

# Omni test:
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
lzip(name, test)

# ## Influence tests
#
# Once created, an object of class ``OLSInfluence`` holds attributes and
# methods that allow users to assess the influence of each observation.
# For example, we can compute and extract the first few rows of DFbetas by:

from statsmodels.stats.outliers_influence import OLSInfluence

test_class = OLSInfluence(results)
test_class.dfbetas[:5, :]

# Explore other options by typing ``dir(test_class)``
#
# Useful information on leverage can also be plotted:

from statsmodels.graphics.regressionplots import plot_leverage_resid2

print(plot_leverage_resid2(results))

# Other plotting options can be found on the
# [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:
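# The excerpt ends at the condition-number heading; the usual check (a
# sketch of the standard computation, not necessarily the original
# notebook's code) is:
np.linalg.cond(results.model.exog)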
def save_residuals(self,
                   unstandardized=True,
                   standardized=False,
                   studentized=False,
                   deleted=False,
                   studentized_deleted=False,
                   add_to_data=False):
    """
    Produce values of various residuals.
    Residuals are returned only for data used to fit a model.

    Parameters
    ----------
    unstandardized : bool
        Whether to save unstandardized (raw) residuals
    standardized : bool
        Whether to save standardized (z-scores) residuals
    studentized : bool
        Whether to save studentized residuals
    deleted : bool
        Whether to save deleted residuals
    studentized_deleted : bool
        Whether to save studentized deleted residuals
    add_to_data : bool
        Whether to merge new values with data.
        Currently, this option returns data with a sorted index

    Returns
    -------
    pd.DataFrame
        Requested residuals
    """
    # vars() in the iterable below is evaluated in the function scope, so it
    # picks up the boolean arguments and turns each requested kind into its
    # display column name (e.g. 'studentized_deleted' -> 'Student. del. res.')
    columns_to_show = [f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.'
                       for k, v in vars().items()
                       if v == True and k != 'add_to_data']

    infl = OLSInfluence(self._model)
    result = []

    res_unstand = infl.resid
    res_unstand.name = 'Unstandard. res.'
    res_stand = (res_unstand - res_unstand.mean()) / res_unstand.std()
    res_stand.name = 'Standard. res.'
    res_stud = infl.resid_studentized_internal
    res_stud.name = 'Student. res.'
    result.extend([res_unstand, res_stand, res_stud])

    if deleted:
        res_del = infl.resid_press
        res_del.name = 'Del. res.'
        result.append(res_del)

    if studentized_deleted:
        res_stud_del = infl.resid_studentized_external
        res_stud_del.name = 'Student. del. res.'
        result.append(res_stud_del)

    result = pd.concat(result, axis=1)
    result = result[columns_to_show].copy()

    if add_to_data:
        result = pd.concat([self._data, result], axis=1)

    return result
# The _statsmodels_ package has the most developed support for outlier analysis.

house_98105 = house.loc[house['ZipCode'] == 98105, ]

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'

house_outlier = sm.OLS(house_98105[outcome],
                       house_98105[predictors].assign(const=1))
result_98105 = house_outlier.fit()
print(result_98105.summary())

# The `OLSInfluence` class is initialized with the OLS regression results and
# gives access to a number of useful properties. Here we use the studentized
# residuals.

influence = OLSInfluence(result_98105)
sresiduals = influence.resid_studentized_internal

print(sresiduals.idxmin(), sresiduals.min())
print(result_98105.resid.loc[sresiduals.idxmin()])

outlier = house_98105.loc[sresiduals.idxmin(), :]
print('AdjSalePrice', outlier[outcome])
print(outlier[predictors])

### Influential values

from scipy.stats import linregress

np.random.seed(5)
def fit(self, data_frame: DataFrame):
    self.data_frame = data_frame
    t_ref = data_frame.reference_temperature
    t_ref_vector = t_ref * np.ones(len(data_frame.temp))
    c_0 = data_frame.reference_value
    c_1 = data_frame.reference_cvalue
    (self.aux_values, self.aux_weights) = auxiliary_function(data_frame)
    updated_experiment = data_frame.experiment - c_0 * np.ones(len(data_frame.temp)) - \
        c_1 * (data_frame.temp - t_ref_vector)
    self.updated_matrix = np.column_stack(
        [data_frame.temp ** i + (i - 1) * t_ref_vector ** i
         - i * data_frame.temp * t_ref_vector ** (i - 1)
         for i in range(self.min_power, self.max_power + 1) if i not in [0, 1]])
    ols_result = sm.OLS(updated_experiment, self.updated_matrix).fit()
    # cooks_distance_influential = 4 / (len(self.data_frame.temp) - (self.max_power - self.min_power) - 1)
    # ols_cooks_distance = OLSInfluence(ols_result).cooks_distance[1]
    ols_stud_residuals = OLSInfluence(ols_result).dfbetas
    # ols_influence = OLSInfluence(ols_result).influence

    # Downweight observations whose influence statistic exceeds 2; dfbetas is
    # two-dimensional (one column per fitted parameter), so an observation is
    # flagged if any of its dfbetas exceeds the cutoff.
    w = np.ones(len(data_frame.temp))
    for i, residual in enumerate(ols_stud_residuals):
        if np.any(np.abs(residual) > 2):
            w[i] = 0.1

    self.aux_fit = sm.WLS(updated_experiment, self.updated_matrix, weights=w).fit()
    self.aux_coefficients = self.aux_fit.params
    a_1 = c_1 - \
        sum([i * self.aux_coefficients[i - self.min_power] * t_ref ** (i - 1)
             for i in range(self.min_power, 0)]) - \
        sum([i * self.aux_coefficients[i - 2 - self.min_power] * t_ref ** (i - 1)
             for i in range(2, self.max_power + 1)])
    a_0 = c_0 - a_1 * t_ref - \
        sum([self.aux_coefficients[i - self.min_power] * t_ref ** i
             for i in range(self.min_power, 0)]) - \
        sum([self.aux_coefficients[i - 2 - self.min_power] * t_ref ** i
             for i in range(2, self.max_power + 1)])
    self.fit_coefficients = []
    self.fit_coefficients.extend(self.aux_coefficients[:-self.min_power])
    self.fit_coefficients.extend([a_0, a_1])
    self.fit_coefficients.extend(self.aux_coefficients[-self.min_power:])
    self.source_matrix = np.vstack([
        data_frame.temp**i if i != 0 else np.ones(len(data_frame.temp))
        for i in range(self.min_power, self.max_power + 1)
    ]).T
    self.fit = np.dot(self.source_matrix, self.fit_coefficients)
    self.heat_capacity_matrix = np.vstack([
        i * data_frame.temp**(i - 1)
        for i in range(self.min_power, self.max_power + 1)
    ]).T
    self.fit_heat_capacity = np.dot(self.heat_capacity_matrix, self.fit_coefficients)