def ols_plot(y, x, add_intercept=True, alpha=0.05, xlim=None, ax=None):
    """
    Generate a scatter plot with OLS prediction plus a prediction band.

    :param y: dependent variable, passed to ``sm.OLS`` as the endogenous data
    :param x: single regressor; cast to float when it supports ``astype``
    :param add_intercept: if True, a constant column is added to the design
    :param alpha: significance level forwarded to ``wls_prediction_std``
    :param xlim: optional ``(low, high)`` range over which the fitted line and
        band are drawn; defaults to the axes x-limits after the scatter plot
    :param ax: axes to draw on; a new figure/subplot is created when None
    :return: ``(res, ax)`` - the fitted results object and the axes used
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    # Best-effort float conversion: inputs without ``astype`` or with
    # non-numeric content are passed through unchanged.
    try:
        x = x.astype(float)
    except (AttributeError, TypeError, ValueError):
        pass

    X = sm.add_constant(x) if add_intercept else x
    res = sm.OLS(y, X).fit()

    # plot data
    ax.scatter(x, y, marker='o')

    if xlim is None:
        xlim = np.array(ax.get_xlim())
    xx = np.linspace(xlim[0], xlim[1], 100)

    # Work on a plain array so that indexing is positional even when the
    # parameters come back as a labelled pandas Series.
    params = np.asarray(res.params)

    # compute prediction and prediction intervals on the grid
    if add_intercept:
        b0, b1 = params
        exog_grid = np.column_stack((np.ones_like(xx), xx))
    else:
        b0, b1 = 0., params[0]
        # wls_prediction_std applies np.atleast_2d to exog; a 1-D grid would
        # become a single row of 100 columns and be rejected as wrong shape,
        # so pass it explicitly as a column.
        exog_grid = xx[:, np.newaxis]
    _, lower, upper = wls_prediction_std(res, exog_grid, alpha=alpha)

    ax.plot(xx, b0 + b1 * xx, 'k-', lw=1.5)
    ax.fill_between(xx, lower, upper, edgecolor='b', facecolor='b', alpha=0.4)
    ax.set_xlim(xlim)

    return res, ax
def plot_regression(_output):
    """Draw two OLS trend lines with 95% prediction bands and save the figure.

    Data are read from the module-level ``analysis`` table: columns 1 and 2
    provide the first (x, y) series and columns 4 and 5 the second; y values
    are multiplied by 100 before fitting. The figure is saved to
    ``_output + '.pdf'`` and ``_output + '.jpg'``.
    """

    def to_percent(x, position):
        # Tick formatter: value scaled by 100 followed by a percent sign,
        # escaped for LaTeX when matplotlib renders text through TeX.
        text = str(100 * x)
        if plt.rcParams['text.usetex'] is True:
            return text + r'$\%$'
        return text + '%'

    def draw_series(x_col, y_col, band_color, line_color):
        # Fit, shade the prediction band, then draw the degree-1 trend line.
        xs = np.array(analysis.iloc[:, x_col], dtype=float)
        ys = np.array(analysis.iloc[:, y_col], dtype=float) * 100
        fitted = sm.OLS(ys, sm.add_constant(xs)).fit()
        trend = np.poly1d(np.polyfit(xs, ys, 1))
        _, lower, upper = wls_prediction_std(fitted, alpha=0.05)
        plt.fill_between(xs, lower, upper, color=band_color, alpha=0.3)
        plt.plot(xs, trend(xs), color=line_color)

    mpl.style.use('classic')
    plt.figure()

    draw_series(1, 2, '#67e0d7', 'green')
    draw_series(4, 5, '#93e067', 'black')

    plt.gca().xaxis.set_major_formatter(FuncFormatter(to_percent))
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlim(0, 1)
    plt.ylim(0, 10)

    # Proxy artists for the legend entries.
    legend_handles = [
        mlines.Line2D([], [], color='green', markerfacecolor="green",
                      label='AEP Risk (CDC Definition)'),
        mlines.Line2D([], [], color='black', markerfacecolor="black",
                      label='AEP Risk (Actual)'),
    ]
    plt.legend(handles=legend_handles, numpoints=1, prop={'size': 15}, loc=2)
    plt.title('Compliance with CDC Recommendation (%)', fontsize=20)
    plt.tight_layout()
    plt.savefig(_output + '.pdf')
    plt.savefig(_output + '.jpg', bbox_inches='tight')
def _regional_prediction(X, X_pred, Y, i):
    """Fit OLS of ``log(Y[i])`` on ``X`` and evaluate it at ``X_pred``.

    Returns a 5-tuple: the prediction at ``X_pred``, the upper bound of the
    interval computed with ``alpha=1-0.6827`` (about one standard deviation),
    the lower and upper bounds of the 95% interval, and ``res.params[1]``.
    """
    fitted = sm.OLS(np.log(Y[i]), X).fit()
    prediction = fitted.predict(X_pred)
    one_sigma = wls_prediction_std(fitted, exog=X_pred, alpha=1-0.6827)  # 1 s.d.
    ci_95 = wls_prediction_std(fitted, exog=X_pred, alpha=1-0.95)  # 95% CI
    return prediction, one_sigma[2], ci_95[1], ci_95[2], fitted.params[1]
def predict(self, ID, ALPHA=0.5):
    """Predict a value for ``ID`` together with a prediction interval.

    The first element returned by ``get_data(ID)`` is vectorised with
    ``self.vectorizer`` and reduced with ``self.lsa``; elements 1..3 are
    scaled as ``sqrt(value**2 / self.sum)`` and appended. All pairwise
    products (including squares) of the resulting features are then added,
    the columns listed in ``self.dellist`` are removed via ``del_vector`` and
    the row is fed to ``self.results``.

    :param ID: identifier forwarded to ``get_data``
    :param ALPHA: significance level forwarded to ``wls_prediction_std``
    :return: ``(estimate, lower, upper)``, each squared; the lower bound is
        clipped at zero before squaring
    """
    list1 = get_data(ID)
    vector = self.vectorizer.transform([list1[0]])
    vector = self.lsa.transform(vector)
    array = np.array([list1[1:4]])**2.0 / self.sum
    array = array**0.5
    vector = np.hstack([vector, array])

    # Second-order terms x_i * x_j for i <= j. np.triu_indices enumerates the
    # pairs row by row, i.e. in the same order as a nested
    # ``for i ... for j in range(i, n)`` loop, but builds them in one step
    # instead of growing the row with one hstack per product.
    base = vector[0]
    first, second = np.triu_indices(vector.shape[1])
    products = base[first] * base[second]
    vector = np.hstack([vector, products[np.newaxis, :]])

    vector = del_vector(vector, self.dellist)
    estimated = self.results.predict(vector)
    prstdn, infa, supa = wls_prediction_std(self.results, vector, alpha=ALPHA)
    # A negative lower bound would turn positive when squared; clip it first.
    if infa[0] < 0:
        infa[0] = 0
    return estimated[0]**2.0, infa[0]**2.0, supa[0]**2.0
def visualize_linear_regression(data, fit, stock, benchmark, axis_low=-0.1, axis_high=0.1, show_std=False):
    """Scatter the stock returns against the benchmark returns and overlay the fit.

    Arguments:
    data -- a tidy DataFrame, with columns stock_ret, bench_ret, and const
    fit -- a linear regression result
    stock -- name of the stock (used as the y-axis label)
    benchmark -- name of the benchmark index (used as the x-axis label)
    axis_low -- lowest value for x and y axes (default -0.1)
    axis_high -- highest value for x and y axes (default 0.1)
    show_std -- whether to draw the lower and upper bands returned by
                wls_prediction_std around the line (default False)
    """
    limits = (axis_low, axis_high)
    ax = data.plot(kind='scatter', x='bench_ret', y='stock_ret',
                   title='1-Day Returns', xlim=limits, ylim=limits)

    # The fitted line only needs its two end points.
    X_new = pd.DataFrame({'bench_ret': [axis_low, axis_high]})
    X_new['const'] = 1
    preds = fit.predict(X_new)
    bench = X_new['bench_ret']
    plt.plot(bench, preds, 'r-')

    if show_std:
        band = wls_prediction_std(fit, X_new)
        plt.plot(bench, band[1], 'r--', bench, band[2], 'r--')

    ax.set_xlabel(benchmark)
    ax.set_ylabel(stock)
    ax.set_aspect(1)
def get_predicted_y_PI(self, x_pred, alpha=0.05):
    """
    :param x_pred: x values, converted to a design matrix by ``self.f.get_X``
    :param alpha: significance level forwarded to ``wls_prediction_std``
    :returns: (lower, upper) prediction interval of y at the provided x values
    """
    design = self.f.get_X(x_pred)
    bounds = wls_prediction_std(self.fitted, exog=design, alpha=alpha)
    return bounds[1], bounds[2]
def try_prod24h_before(
        columns=['Tout', 'vWind', 'vWindavg24', 'prod24h_before'],
        add_const=False,
        y=y):
    """Fit a linear model on the chosen columns of ``all_data`` and plot it.

    The upper panel shows the actual series, the fitted values and the
    interval returned by ``wls_prediction_std``; the lower panel shows the
    residuals. Error metrics and the model summary are printed.

    :param columns: column names selected from the module-level ``all_data``
        (the default list is only read, never modified)
    :param add_const: forwarded to ``mlin_regression``
    :param y: dependent variable; defaults to the module-level ``y`` as it
        was when the function was defined
    :return: the result object returned by ``mlin_regression``
    """
    plt.close('all')
    X = all_data[columns]
    res = mlin_regression(y, X, add_const=add_const)
    timesteps = ens.gen_hourly_timesteps(dt.datetime(2015, 12, 17, 1),
                                         dt.datetime(2016, 1, 15, 0))

    plt.subplot(2, 1, 1)
    plt.plot_date(timesteps, y, 'b', label='Actual prodution')
    plt.plot_date(timesteps, res.fittedvalues, 'r', label='Weather model')
    prstd, iv_l, iv_u = wls_prediction_std(res)
    plt.plot_date(timesteps, iv_u, 'r--', label='95% conf. int.')
    plt.plot_date(timesteps, iv_l, 'r--')
    plt.ylabel('MW')
    plt.legend(loc=2)

    plt.subplot(2, 1, 2)
    plt.plot_date(timesteps, res.resid, '-', label='Residual')
    plt.ylabel('MW')
    plt.legend()

    # Single-argument print calls produce the same output whether print is
    # a statement or a function.
    print("MAE = " + str(mae(res.resid)))
    print("MAPE = " + str(mape(res.resid, y)))
    print("RMSE = " + str(rmse(res.resid)))
    print(res.summary())

    return res
def plot_locality_regression(snps, cob, gene_limit=10):
    """Regress global on local degree for the candidate genes and save a plot.

    The figure shows the empirical points, the fitted values and a band of
    +/- 2.5 prediction standard errors, and is written to
    ``'<cob.name>_locality.png'``.
    """
    log('Fetching Empirical Degree')
    candidates = cob.refgen.candidate_genes(snps, gene_limit=gene_limit, chain=True)
    # NOTE(review): ``.sort('local')`` looks like the legacy pandas
    # DataFrame.sort API -- confirm against the pandas version in use.
    degree = cob.locality(candidates).sort('local')
    log('Fetching BS Degree')

    log('Fitting models')
    local_degree = degree.local
    global_degree = degree['global']
    res = sm.OLS(global_degree, local_degree).fit()
    std = wls_prediction_std(res)[0]

    fig, ax = pylab.subplots(figsize=(8, 6))
    # NOTE(review): Figure.hold is absent from recent matplotlib releases -- confirm.
    fig.hold(True)
    ax.set_xlim(0, max(local_degree))
    ax.set_ylim(0, max(global_degree))

    log('Plotting Empirical')
    ax.plot(local_degree, global_degree, 'o', label='Empirical')

    log('Plotting Residuals')
    fitted = res.fittedvalues
    ax.plot(local_degree, fitted, '--')
    ax.plot(local_degree, fitted + 2.5 * std, 'r--')
    ax.plot(local_degree, fitted - 2.5 * std, 'r--')
    ax.set_xlabel('Number Local Interactions')
    ax.set_ylabel('Number Global Interactions')

    log('Saving Figure')
    fig.savefig('{}_locality.png'.format(cob.name))
def linreg_stock(stock_ticker='AAPL', start_date = '2019-12-01', end_date = '2020-02-05', visualize=False):
    """
    Regress the opening price (relative to the first open) on the row number.

    Returns ``[lower, upper]``, the 95% confidence interval of the slope, or
    ``[nan, nan]`` when no price history is returned for the period. With
    ``visualize=True`` the data, fitted values and prediction bounds are also
    drawn on the current matplotlib figure.
    """
    history = Stock(stock_ticker).get_price_history(
        start_date=start_date, end_date=end_date)
    n_rows = history.shape[0]
    if n_rows == 0:
        return [np.nan, np.nan]

    history['x_val'] = list(range(n_rows))
    opens = history['Open'].values
    relative_open = opens / opens[0]

    design = sm.add_constant(history['x_val'])
    results = sm.OLS(relative_open, design).fit()

    # Column 0 holds the lower bounds, column 1 the upper bounds.
    bounds = results.conf_int(alpha=0.05, cols=None)
    slope_interval = [bounds[0]['x_val'], bounds[1]['x_val']]

    if visualize:
        _, iv_l, iv_u = wls_prediction_std(results)
        steps = history['x_val']
        plt.plot(steps, relative_open, 'ro')
        plt.plot(steps, results.fittedvalues, 'r--.', label="OLS")
        plt.plot(steps, iv_l, 'b--.')
        plt.plot(steps, iv_u, 'b--.')

    return slope_interval
def LR(df, options):
    """Fit a linear trend (constant + 0..n-1 index) to ``df`` or to its tail.

    When ``options`` contains ``'divisor'``, only the last
    ``int(len(df) / divisor)`` rows are fitted and the returned arrays are
    left-padded with NaN back to ``len(df)``. The model summary is printed.

    :return: ``(std, middle, lower, upper)`` - prediction standard error,
        fitted values, and lower/upper prediction bounds
    """
    total = len(df)
    if 'divisor' in options:
        window = df.tail(int(total / options['divisor']))
    else:
        window = df.copy()

    design = sm.add_constant(np.arange(len(window)))
    res = sm.OLS(window, design).fit()
    print(res.summary())

    std, lower, upper = wls_prediction_std(res)
    middle = res.predict()

    n_pad = total - len(window)
    if n_pad < total:
        padding = np.full(n_pad, np.nan, dtype=np.double)
        std, lower, upper, middle = (
            np.append(padding, values)
            for values in (std, lower, upper, middle)
        )

    return std, middle, lower, upper
def user_model(data):
    """
    Interactively fit a user-supplied regression formula and plot the result.

    The available columns are listed, a formula is read from standard input
    and fitted with ``smf.ols``; the summary is printed and the fitted values
    and observed data are plotted against the datetime column(s) of ``data``.
    """
    # Show which covariates can be used in the formula
    print('The data set contains the following covariates: \n')
    print(list(data.columns), '\n')

    # Read the model formula (R-style syntax) from the user
    formula = input('Enter your regression model formula, using syntax as shown: \n \n dependent_variable ~ covariate1 + covariate 2 + ... \n \n')

    # Fit the requested model
    fitted_model = smf.ols(formula=formula, data=data).fit()
    print('\n', fitted_model.summary(), '\n')

    # Dependent variable and datetime column(s) used for plotting
    response_name = fitted_model.model.endog_names
    response = data[response_name]
    time_columns = list(data.columns[data.dtypes == 'datetime64[ns]'])
    time_axis = data[time_columns]

    # The prediction interval is computed but not drawn.
    wls_prediction_std(fitted_model)

    plt.figure(figsize=(12,6))
    plt.plot(time_axis, fitted_model.fittedvalues, 'r.', alpha=0.2, label='Fitted Values')
    plt.plot(time_axis, response, 'b.', alpha=0.2, label='%s data' % response_name)
    plt.legend(loc='upper left')
    plt.title('%s actual data and model fitted values' % response_name, fontsize='x-large')
def get_prediction(res, x):
    """
    Return the model predictions together with lower and upper bounds.

    The result is a DataFrame with columns ``ci_low``, ``pred`` and ``ci_up``.
    """
    # NOTE(review): the bounds are computed without passing ``x``, i.e. for
    # the data the model was fitted on, while ``pred`` is evaluated at ``x``.
    # They only line up when ``x`` is that same data -- confirm with callers.
    bounds = wls_prediction_std(res, alpha=0.05)
    predicted = res.predict(x)
    return pd.DataFrame({"ci_low": bounds[1], "pred": predicted, "ci_up": bounds[2]})
def dataframe_ordinary_least_squares(dataframe_in, x_col_name, y_col_name, showplot=False):
    """Fit ``y_col_name ~ const + x_col_name`` and store the results in place.

    Adds the columns 'OLS Values', 'Confidence Upper' and 'Confidence Lower'
    to ``dataframe_in``. When ``showplot`` equals True the summary is printed
    and the data, fit and bounds are plotted. Nothing is returned.
    """
    x = dataframe_in[x_col_name].to_numpy()
    y = dataframe_in[y_col_name].to_numpy()
    design = np.array(sm.add_constant(x), dtype=float)

    results = sm.OLS(y, design).fit()
    _, lower, upper = wls_prediction_std(results)

    dataframe_in['OLS Values'] = results.fittedvalues
    dataframe_in['Confidence Upper'] = upper
    dataframe_in['Confidence Lower'] = lower

    if showplot == True:
        print(results.summary())
        fig, ax = plt.subplots()
        ax.scatter(x, y, color="#778899", label="Test Volume")
        ax.plot(x, dataframe_in['OLS Values'], ".--", color="#4682B4",
                label="Ordinary Least Squares Regression")
        for bound in (upper, lower):
            ax.plot(x, bound, color="#F08080", ls=":")
        plt.show()
def plot_locality_regression(snps,cob,gene_limit=10):
    """Plot global versus local degree with an OLS fit and save it to disk.

    Draws the empirical points, the fitted values and lines at +/- 2.5
    prediction standard errors, then saves ``'<cob.name>_locality.png'``.
    """
    log('Fetching Empirical Degree')
    genes = cob.refgen.candidate_genes(snps,gene_limit=gene_limit,chain=True)
    # NOTE(review): ``.sort('local')`` looks like the legacy pandas
    # DataFrame.sort API -- confirm against the pandas version in use.
    degree = cob.locality(genes).sort('local')
    log('Fetching BS Degree')

    log('Fitting models')
    xs = degree.local
    ys = degree['global']
    res = sm.OLS(ys,xs).fit()
    std, _lower, _upper = wls_prediction_std(res)

    fig,ax = pylab.subplots(figsize=(8,6))
    # NOTE(review): Figure.hold is absent from recent matplotlib releases -- confirm.
    fig.hold(True)
    ax.set_xlim(0,max(xs))
    ax.set_ylim(0,max(ys))

    log('Plotting Empirical')
    ax.plot(xs,ys,'o',label='Empirical')

    log('Plotting Residuals')
    ax.plot(xs,res.fittedvalues,'--')
    for sign in (1, -1):
        # sign = +1 draws the upper line first, then -1 the lower one
        ax.plot(xs,res.fittedvalues+sign*2.5*std,'r--')
    ax.set_xlabel('Number Local Interactions')
    ax.set_ylabel('Number Global Interactions')

    log('Saving Figure')
    fig.savefig('{}_locality.png'.format(cob.name))
def PlotFit(res, x):
    """Plot the data, the OLS fit with its bounds, and an extended prediction.

    The prediction is evaluated on a design of ``len(x) + 4`` rows
    (constant, 0..n+3) and drawn against the index of ``x`` joined with four
    dates spread evenly from today to one year (31536000 s) ahead. The plot
    is shown when the module-level ``draw`` flag is truthy.
    """
    _, lower, upper = wls_prediction_std(res)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(x, 'o', label='data')
    ax.plot(x.index, res.fittedvalues, 'r--.', label="OLS")
    ax.plot(x.index, upper, 'r--')
    ax.plot(x.index, lower, 'r--')

    # Four dates from now to one year into the future
    now = np.datetime64(datetime.today())
    one_year = np.timedelta64(31536000, 's')
    future_dates = pd.to_datetime(
        np.linspace(pd.Timestamp(now).value,
                    pd.Timestamp(now + one_year).value,
                    4))

    # Design matrix: constant column plus a running index
    n_total = x.shape[0] + 4
    design = np.ones((n_total, 2))
    design[:, 1] = np.arange(n_total)
    design = sm1.add_constant(design)
    extended_prediction = res.predict(design)

    ax.plot(x.index.union(future_dates), extended_prediction, 'r',
            label="OLS prediction")
    ax.legend(loc='best')
    if draw:
        plt.show()
def mcp_model(df_sector, sector, thres_v):
    """Plot fit, residual and Q-Q diagnostics for one sector.

    Rows of ``df_sector`` with ``speed_sit > thres_v`` are kept and passed to
    the ``descriptive`` plotting helpers together with the fitted values,
    residuals and prediction bounds of ``mcpmodel_v``.

    NOTE(review): ``mcpmodel_v`` and ``resid_norm`` are not assigned anywhere
    in this function; they must exist at module level or the calls below
    raise NameError -- confirm where they are defined.
    """
    # Disabled legacy code (correlation check and cross-validation), kept
    # verbatim:
    # print('Check the correlation between site and satellite:')
    # corr_sit_sat=pearsonr(df_regression['speed_sit'],df_regression['speed_sat'])
    # if corr_sit_sat[0]<threshold_satcorr or corr_sit_sat[1]>threshold_p:
    #     print("Pearson's test p-value is %f %s %f, correlation between valuables is %f %s %f, therefore this satellite data should be rejected."%(corr_sit_sat[1],'>' if corr_sit_sat[1]>threshold_p else '<=',threshold_p,corr_sit_sat[0],'<' if corr_sit_sat[0]<threshold_satcorr else '>=',threshold_satcorr))
    #     sys.exit()
    # else:
    #     print("Pearson's test p-value is %f <= %f, and the correlation between valuables is %f >= %f"%(corr_sit_sat[1],threshold_p,corr_sit_sat[0],threshold_satcorr))
    # corr3_sit_sat=pearsonr(df_regression['speed3_sit'],df_regression['speed3_sat'])
    # if corr3_sit_sat[0]<threshold_satcorr or corr3_sit_sat[1]>threshold_p:
    #     print("Pearson's test p-value is %f %s %f, correlation between valuables is %f %s %f, therefore this satellite data should be rejected."%(corr3_sit_sat[1],'>' if corr3_sit_sat[1]>threshold_p else '<=',threshold_p,corr3_sit_sat[0],'<' if corr3_sit_sat[0]<threshold_satcorr else '>=',threshold_satcorr))
    #     sys.exit()
    # else:
    #     print("Pearson's test p-value is %f <= %f, and the correlation between valuables is %f >= %f"%(corr3_sit_sat[1],threshold_p,corr3_sit_sat[0],threshold_satcorr))
    #
    # print('Linear regression (speed_sit, veer) ~ speed_sat:')
    # print('Cross validation process.')
    # df_train,df_test=mcpprocess.holdout(df_regression,0.75)
    # mcpmodel=lrm(df_train['speed_sat'],df_train['speed_sit'])
    # print(mcpmodel.summary2())

    # Keep only observations whose site speed exceeds the threshold.
    df_regression = df_sector[df_sector.speed_sit > thres_v].reset_index(
        drop=True)
    # train:test=4:1
    # The split is computed but neither part is used below.
    df_train, df_test = mcpprocess.holdout(df_regression, 0.2)
    # Prediction standard error and lower/upper bounds for the data the
    # model was fitted on (no exog is passed).
    pre_std, Y_l, Y_u = wls_prediction_std(mcpmodel_v)
    descriptive.rl_fit(df_regression.speed_sat, df_regression.speed_sit,
                       mcpmodel_v.fittedvalues, Y_l, Y_u, sector, True, False,
                       1)
    descriptive.rl_residu(df_regression.speed_sat, mcpmodel_v.resid, sector,
                          True, False, 1)
    descriptive.rl_qqplot(resid_norm, sector, True, False, 1)
def try_prod24h_before(columns=['Tout', 'vWind', 'vWindavg24', 'prod24h_before'], add_const=False, y=y):
    """Regress ``y`` on the selected ``all_data`` columns and plot the fit.

    Top panel: actual values, fitted values and the bounds returned by
    ``wls_prediction_std``. Bottom panel: residuals. MAE, MAPE, RMSE and the
    model summary are printed.

    :param columns: column names selected from the module-level ``all_data``
        (the default list is only read, never modified)
    :param add_const: forwarded to ``mlin_regression``
    :param y: dependent variable; defaults to the module-level ``y`` as it
        was when the function was defined
    :return: the result object returned by ``mlin_regression``
    """
    plt.close('all')
    X = all_data[columns]
    res = mlin_regression(y, X, add_const=add_const)
    timesteps = ens.gen_hourly_timesteps(dt.datetime(2015,12,17,1), dt.datetime(2016,1,15,0))

    plt.subplot(2,1,1)
    plt.plot_date(timesteps, y, 'b', label='Actual prodution')
    plt.plot_date(timesteps, res.fittedvalues, 'r', label='Weather model')
    prstd, iv_l, iv_u = wls_prediction_std(res)
    plt.plot_date(timesteps, iv_u, 'r--', label='95% conf. int.')
    plt.plot_date(timesteps, iv_l, 'r--')
    plt.ylabel('MW')
    plt.legend(loc=2)

    plt.subplot(2,1,2)
    plt.plot_date(timesteps, res.resid, '-', label='Residual')
    plt.ylabel('MW')
    plt.legend()

    # print() with a single argument behaves the same as the former print
    # statements and is valid Python 3.
    print("MAE = " + str(mae(res.resid)))
    print("MAPE = " + str(mape(res.resid, y)))
    print("RMSE = " + str(rmse(res.resid)))
    print(res.summary())

    return res
def plot_best_model():
    """Fit the weather model on four ``all_data`` columns and plot it.

    Top panel: actual values, fitted values, the bounds returned by
    ``wls_prediction_std`` and the fit corrected by the mean residual of each
    hour of the day. Bottom panel: residuals and the tiled hourly mean
    residual. Prints the MAPE of the corrected fit, the MAPE of the plain fit
    and the MAE of the corrected fit.

    :return: the result object returned by ``mlin_regression``
    """
    plt.close('all')
    columns = ['Tout', 'Toutavg24', 'vWind', 'vWindavg24']
    X = all_data[columns]
    res = mlin_regression(y, X)
    timesteps = ens.gen_hourly_timesteps(dt.datetime(2015,12,17,1), dt.datetime(2016,1,15,0))

    plt.subplot(2,1,1)
    plt.plot_date(timesteps, y, 'b', label='Actual prodution')
    plt.plot_date(timesteps, res.fittedvalues, 'r', label='Weather model')
    prstd, iv_l, iv_u = wls_prediction_std(res)
    plt.plot_date(timesteps, iv_u, 'r--', label='95% conf. int.')
    plt.plot_date(timesteps, iv_l, 'r--')

    # Mean residual for each of the 24 hourly slots, repeated 29 times so it
    # can be added to the fitted values.
    mean_day_resid = [res.resid[i::24].mean() for i in range(24)]
    mean_resid_series = np.tile(mean_day_resid, 29)
    corrected = res.fittedvalues + mean_resid_series
    plt.plot_date(timesteps, corrected, 'g', label='Weather model + avg daily profile')
    plt.ylabel('MW')
    plt.legend(loc=2)

    plt.subplot(2,1,2)
    plt.plot_date(timesteps, res.resid, '-', label='Residual')
    plt.plot_date(timesteps, mean_resid_series)
    plt.ylabel('MW')
    plt.legend()

    mape_corrected = np.mean(np.abs((corrected - y)/y))
    mape_plain = np.mean(np.abs((res.resid)/y))
    mae_corrected = np.mean(np.abs((corrected - y)))
    # print() with several arguments gives the same space-separated output
    # as the former Python 2 print statement.
    print(mape_corrected, mape_plain, mae_corrected)

    # NOTE(review): the summary is built but its value is discarded here.
    res.summary()

    return res
def _predict(self, fit, df, **kwargs):
    """
    Return a df with predictions and confidence interval

    The df will contain the following columns:
    - 'predicted': the model output
    - 'interval_u', 'interval_l': upper and lower confidence bounds.

    Parameters
    ----------
    fit : Statsmodels fit
    df : pandas DataFrame
        Data to predict for. It is modified in place: an 'Intercept' column
        is added when the model has one, plus the result columns.
    confint : float, optional keyword
        Significance level for the two-sided interval. If given, overrides
        ``self.confint``. It is forwarded to ``wls_prediction_std`` as
        ``alpha``.

    Returns
    -------
    df : pandas DataFrame
        same as df with additional columns 'predicted', 'interval_u' and
        'interval_l'
    """
    confint = kwargs.get('confint', self.confint)

    # Add model results to data as column 'predicted'
    if 'Intercept' in fit.model.exog_names:
        df['Intercept'] = 1.0
    df['predicted'] = fit.predict(df)
    if not self.allow_negative_predictions:
        df.loc[df['predicted'] < 0, 'predicted'] = 0

    # Pass the requested level on; without it the interval would always use
    # the library default regardless of ``confint``.
    # NOTE(review): assumes ``confint`` is a significance level (e.g. 0.05),
    # not a coverage such as 0.95 -- confirm against self.confint.
    prstd, interval_l, interval_u = wls_prediction_std(
        fit, df[fit.model.exog_names], alpha=confint)
    df['interval_l'] = interval_l
    df['interval_u'] = interval_u

    return df
def plot_vol_surface(self, x, y, contract, put_call):
    """Plot surface using Ordinary Least Square Fit.

    A degree-3 polynomial is fitted to (x, y) with ``np.polyfit``; ``y`` is
    then regressed on that polynomial evaluated at ``x`` and the data, fitted
    values and prediction bounds are plotted and shown. The prediction
    standard errors are printed afterwards.

    ``contract`` and ``put_call`` are only used in the title and legend
    label. A ValueError raised anywhere in the fitting/plotting section is
    logged at INFO level and otherwise ignored.
    """
    df = pd.DataFrame(columns=['x', 'y'])
    df['x'] = x
    df['y'] = y
    degree = 3
    try:
        weights = np.polyfit(x, y, degree)
        # NOTE: the formula below refers to this local by name ('model'),
        # so the variable must keep this name.
        model = np.poly1d(weights)
        results = smf.ols(formula='y ~ model(x)', data=df).fit()
        prstd, iv_l, iv_u = wls_prediction_std(results)
        fig, ax = plt.subplots(figsize=(8, 6))
        plt.title(
            "Implied Vol for NG European Options = {0}, moneyness plot= log(K/F): tradeDate: {1}"
            .format(contract, self.tradeDate))
        ax.plot(x, y, 'o', label="{0} Implied Vol".format(put_call))
        ax.plot(x, results.fittedvalues, 'r--.', label="OLS")
        ax.plot(x, iv_u, 'r--')
        ax.plot(x, iv_l, 'r--')
        ax.legend(loc='best')
        plt.xlabel("Moneyness: log(K/F)")
        plt.ylabel("Implied Vol.")
        # Vertical reference line at x = 0.
        plt.axvline(0, color='k')
        plt.show()
        print(prstd)
    except ValueError:
        logging.info("ValueError!")
def visualize_linear_regression(data, fit, response_name, predictor_name, show_std=False):
    """Scatter the response (column 1) against the predictor (column 2) and
    overlay the fitted line.

    Arguments:
    data -- a tidy DataFrame, with response in column 1, predictor in column 2,
            and constant in column 3
    fit -- a linear regression result
    response_name -- name of the response variable (y-axis label)
    predictor_name -- name of the predictor variable (x-axis label)
    show_std -- whether to draw the lower and upper bands returned by
                wls_prediction_std around the line (default False)
    """
    heading = 'Relationship of %s vs. %s' % (response_name, predictor_name)
    ax = data.plot(kind='scatter', x=1, y=0, title=heading)

    # Evaluate the fit at the two ends of the current x range.
    X_new = pd.DataFrame({'predictor': ax.get_xlim()})
    X_new['const'] = 1
    preds = fit.predict(X_new)
    xs = X_new['predictor']
    plt.plot(xs, preds, 'r-')

    if show_std:
        band = wls_prediction_std(fit, X_new)
        plt.plot(xs, band[1], 'r--', xs, band[2], 'r--')

    ax.set_xlabel(predictor_name)
    ax.set_ylabel(response_name)
def test_ci(self):
    """Check both prediction entry points against ``wls_prediction_std``.

    The function ``get_prediction(res)`` and the method
    ``res.get_prediction()`` must reproduce the standard errors and
    observation intervals, and expose the expected summary-frame columns.
    """
    res_wls = self.res_wls
    prstd, iv_l, iv_u = wls_prediction_std(res_wls)
    expected_ci = np.column_stack((iv_l, iv_u))
    col_names = [
        'mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper', 'obs_ci_lower',
        'obs_ci_upper'
    ]

    # Built lazily so the second prediction is only created after the first
    # one has been checked.
    builders = (lambda: get_prediction(res_wls), res_wls.get_prediction)
    for build in builders:
        pred = build()
        ci = pred.conf_int(obs=True)
        assert_allclose(pred.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci, expected_ci, rtol=1e-13)
        frame = pred.summary_frame()
        assert_equal(frame.columns.tolist(), col_names)
def visualizeModel(re, data, features, labels):
    """
    Visualise a fitted linear model.

    Draws the raw data as blue points, the model prediction as a red line
    and the 95% prediction bounds as red dashed lines, then shows the figure.

    :param re: fitted regression results (note: this parameter name shadows
        the standard ``re`` module inside the function)
    :param data: table holding the feature and label columns
    :param features: key of the feature column in ``data``; also used to
        look up the coefficient in ``re.params``
    :param labels: key of the label column in ``data``
    """
    # Prediction standard error, lower bound and upper bound
    prstd, preLow, preUp = wls_prediction_std(re, alpha=0.05)

    # Use a font that can render the Chinese labels in Matplotlib
    plt.rcParams['font.sans-serif'] = ['SimHei']

    # One figure containing a single plot
    fig = plt.figure(figsize=(6, 6), dpi=80)
    ax = fig.add_subplot(111)
    ax.set_title("线性回归统计分析示例")

    # Scatter plot: blue dots are the original data.
    # Raw string so that ``\epsilon`` is not read as an escape sequence.
    ax.scatter(data[features], data[labels], color='b',
               label=r'%s: $y = x + \epsilon$' % "真实值")

    # Line plots: red dashed lines mark the 95% interval
    ax.plot(data[features], preUp, "r--", label="95%置信区间")
    ax.plot(data[features], re.predict(data[features]), color='r',
            label='%s: $y = %.3fx$' % ("预测值", re.params[features]))
    ax.plot(data[features], preLow, "r--")

    legend = plt.legend(shadow=True)
    legend.get_frame().set_facecolor('#6F93AE')
    plt.show()
def loess_normative_model(self):
    """ Compute classical normative model.

    For every bin centre a local linear model of the score on the confound
    (relative to the bin centre) is fitted on the control rows that fall
    inside ``self.bin_width``. The per-bin mean, standard deviation and
    confidence interval are stored in ``self.zm``, ``self.zstd`` and
    ``self.zci``. Each row of ``self.data`` is then compared with its nearest
    bin and the columns 'LOESS_pred' and 'LOESS_residuals' are written,
    ``self.SMSE_LOESS`` is set and ``self._loess_rank()`` is called.
    """
    # Bins are created on demand.
    if self.bins is None:
        self._create_bins()

    # format data
    data = self.data[[self.conf, self.score]].to_numpy(dtype=np.float64)

    # take the controls
    ctr_mask, _ = self._get_masks()
    ctr = data[ctr_mask]

    self.zm = np.zeros(self.bins.shape[0])  # mean
    self.zstd = np.zeros(self.bins.shape[0])  # standard deviation
    self.zci = np.zeros([self.bins.shape[0], 2])  # confidence interval

    for i, bin_center in enumerate(self.bins):
        mu = np.array(bin_center)  # bin_center value (age or conf)
        # 1.0 where the control's confound lies within bin_width of the
        # centre, 0.0 elsewhere (kept as a column because of ctr[:, :1]).
        bin_mask = (abs(ctr[:, :1] - mu) < self.bin_width) * 1.
        # Row indices of the controls inside the bin.
        idx = [u for (u, v) in np.argwhere(bin_mask)]

        scores = ctr[idx, 1]
        adj_conf = ctr[idx, 0] - mu  # confound relative to bin center

        # if more than 2 non NaN values do the model
        if (~np.isnan(scores)).sum() > 2:
            # NOTE(review): the keyword is spelled ``weight``; sm.WLS names
            # its parameter ``weights`` -- confirm this is intended.
            mod = sm.WLS(scores,
                         sm.tools.add_constant(adj_conf,
                                               has_constant='add'),
                         missing='drop',
                         weight=bin_mask.flatten()[idx],
                         hasconst=True).fit()
            # Intercept = fitted value at the bin centre (adj_conf == 0).
            self.zm[i] = mod.params[0]  # mean

            # std and confidence intervals
            # NOTE(review): exog is [0, 0], i.e. the constant term is 0 as
            # well -- confirm this is the intended evaluation point.
            prstd, iv_l, iv_u = wls_prediction_std(mod, [0, 0])
            self.zstd[i] = prstd
            # Confidence interval of the intercept parameter.
            self.zci[i, :] = mod.conf_int()[0, :]  # [iv_l, iv_u]
        else:
            self.zm[i] = np.nan
            self.zci[i] = np.nan
            self.zstd[i] = np.nan

    # Assign every row to the bin whose centre is closest to its confound.
    dists = [np.abs(conf - self.bins) for conf in self.data[self.conf]]
    idx = [np.argmin(d) for d in dists]
    m = np.array([self.zm[i] for i in idx])
    std = np.array([self.zstd[i] for i in idx])
    # Deviation from the bin mean in units of the bin standard deviation.
    nmodel = (self.data[self.score] - m) / std
    self.data['LOESS_pred'] = nmodel
    self.data['LOESS_residuals'] = self.data[
        self.score] - self.data['LOESS_pred']

    score = self._get_score()
    res = self.data['LOESS_residuals'].to_numpy(dtype=np.float64)
    # Root mean squared residual of the controls divided by the standard
    # deviation of their scores.
    self.SMSE_LOESS = (np.mean(res[ctr_mask]**2)**0.5) / np.std(
        score[ctr_mask])
    self._loess_rank()
def summary_obs(res, alpha=0.05):
    """Build a per-observation summary table for a fitted regression.

    Parameters
    ----------
    res : results instance
        Fitted regression results.
    alpha : float
        Significance level used for both the mean interval and the
        prediction (observation) interval.

    Returns
    -------
    st : SimpleTable
        Formatted table.
    data : ndarray
        The table values, rounded to 4 decimals.
    ss2 : list of str
        Column headers. The interval headers say '95%' regardless of
        ``alpha``.
    """
    from copy import deepcopy

    from scipy import stats
    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    infl = Influence(res)

    # standard error for predicted mean
    # Note: using hat_matrix only works for fitted values
    predict_mean_se = np.sqrt(infl.hat_matrix_diag*res.mse_resid)

    tppf = stats.t.isf(alpha/2., res.df_resid)
    predict_mean_ci = np.column_stack([
        res.fittedvalues - tppf * predict_mean_se,
        res.fittedvalues + tppf * predict_mean_se])

    # standard error for predicted observation; alpha is passed on so that
    # this interval uses the same level as the mean interval above
    predict_se, predict_ci_low, predict_ci_upp = wls_prediction_std(
        res, alpha=alpha)
    predict_ci = np.column_stack((predict_ci_low, predict_ci_upp))

    # standard deviation of residual
    resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag))

    table_sm = np.column_stack([
        np.arange(res.nobs) + 1,
        res.model.endog,
        res.fittedvalues,
        predict_mean_se,
        predict_mean_ci[:,0],
        predict_mean_ci[:,1],
        predict_ci[:,0],
        predict_ci[:,1],
        res.resid,
        resid_se,
        infl.resid_studentized_internal,
        infl.cooks_distance()[0]
        ])

    ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue',
           'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp',
           'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual',
           'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]
    colnames = ss2
    data = np.round(table_sm, 4)

    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    # First column is the observation number, the rest are floats.
    fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1)

    st = SimpleTable(data, headers=colnames, txt_fmt=fmt, html_fmt=fmt_html)
    return st, data, ss2
def predict_y_with_model(model, intercept_included: bool, independent_variables: List[List[float]], iv_names: List[str], significance_level=0.1):
    """
    Predict y for the given inputs and print the result table.

    Format of independent_variables (one inner list per variable):
    x1 [[1, 2, 3, 4, 5],
    x2  [5, 4, 3, 2, 1],
    x3  [6, 6, 6, 6, 6]]

    When ``intercept_included`` is True the table also gets the columns
    'Prediction Std', 'Lowers' and 'Uppers' from ``wls_prediction_std``;
    otherwise a notice is printed instead. Nothing is returned.
    """
    ivs = independent_variables
    assert significance_level < 0.2
    print('Predicting y with model (significance_level == {}):'.format(
        significance_level))

    # Columns handed to model.predict, keyed by variable name
    model_parameters = {}
    if intercept_included:
        model_parameters['intercept'] = 1
    for position, values in enumerate(ivs):
        model_parameters[iv_names[position]] = values
    print(model_parameters)

    predicted_values = model.predict(pd.DataFrame(model_parameters))

    n_obs = len(independent_variables[0])
    n_names = len(iv_names)
    # One row per observation: the variable values followed by the prediction
    rows = [
        [independent_variables[j][i] for j in range(n_names)]
        + [predicted_values[i]]
        for i in range(n_obs)
    ]
    results_pd = pd.DataFrame(rows)
    results_pd.columns = list(iv_names) + ['Predicted Value']

    if not intercept_included:
        print(
            'Interval prediction not available if intercept is not included.')
    else:
        # Design rows: leading 1 for the intercept, then each variable
        exogenous_parameters = [
            [1] + [column[i] for column in ivs]
            for i in range(len(ivs[0]))
        ]
        (results_pd['Prediction Std'],
         results_pd['Lowers'],
         results_pd['Uppers']) = wls_prediction_std(
            model, exog=exogenous_parameters, weights=1,
            alpha=significance_level)
    print(results_pd)
def fit(self, x, y):
    """Fit a degree-2 polynomial OLS of ``y`` on ``x`` and evaluate it on AGES.

    Stores the prediction at the module-level ``AGES`` in ``self.m`` and the
    prediction standard error in ``self.s``; returns ``self``.
    """
    column = array(x).reshape(-1, 1)
    model = OLS(y, PolynomialFeatures(2).fit_transform(column)).fit()

    age_design = PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1))
    self.m = model.predict(age_design)
    self.s = wls_prediction_std(model, age_design)[0]
    return self
def plot(ts, trend=True, interval=False, outliers=False, ax=None, **kwargs):
    '''
    Plot a timeseries with optional linear trend, prediction interval and
    outliers

    Parameters
    ----------
    ts : A DataFrame or Series with a timeseries index with all numeric columns
    trend : overlay a linear trend?
    interval : overlay the interval returned by wls_prediction_std?
    outliers : overlay outliers?
    kwargs : arguments passed to isoutlier
    ax : axes to draw on (optional)

    Returns
    -------
    axes object
    '''
    if not ax:
        ax = gca()

    # Handle Series and DataFrame the same way
    df = ts.to_frame() if isinstance(ts, pd.Series) else ts.copy()
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    # ols won't accept a date, so use the time elapsed since the first date
    # as the independent variable
    df['__Seconds'] = (ts.index - ts.index.min()).astype('timedelta64[s]')

    for col in numeric_columns:
        res = smf.ols(formula=col + ' ~ __Seconds', data=df).fit()

        if trend:
            series_label = "{} (r^2 = {:2.2})".format(col, res.rsquared)
        else:
            series_label = col
        # Drawn first so pandas sets up the date axis
        df[col].plot(ax=ax, label=series_label)

        if trend:
            res.fittedvalues.plot(ax=ax, style='--g', label="")

        if interval:
            _, iv_l, iv_u = wls_prediction_std(res)
            ax.fill_between(iv_l.index, iv_l, iv_u, color='#888888', alpha=0.25)

        if outliers:
            df_outliers = df[col][isoutlier(df[col], **kwargs)]
            if len(df_outliers) > 0:
                df_outliers.plot(ax=ax, style='r*', label="")

    return ax
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    Draws the observed values, the fitted values and vertical bars spanning
    the prediction interval, all against the chosen regressor (sorted).

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like (optional)
        If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    # Sort everything by the regressor so the x axis is increasing
    regressor = results.model.exog[:, exog_idx]
    order = np.argsort(regressor)
    x1 = regressor[order]
    y = results.model.endog[order]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[order], 'b-', label='True values')

    _, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[order], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[order], iv_u[order], linewidth=1, color='k', alpha=.7)

    ax.set_title('Fitted values versus %s' % exog_name)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best')

    return fig
def calculate_wls_prediction_std(result):
    """
    Thin wrapper around ``wls_prediction_std`` called with default arguments.

    :return:
    predstd : array_like, standard error of prediction, same length as rows of exog
    iv_l : array_like, lower confidence bound
    iv_u : array_like, upper confidence bound
    """
    predstd, iv_l, iv_u = wls_prediction_std(result)
    return predstd, iv_l, iv_u
def plot_fit(res, exog_idx, exog_name='', y_true=None, ax=None, fontsize='small'):
    """Plot fit against one regressor.

    Draws the observed values, the fitted line and a shaded prediction band
    against the chosen regressor (sorted).

    Parameters
    ----------
    res : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int
        index of regressor in exog matrix
    exog_name : str, optional
        label used in the title; defaults to 'variable <exog_idx>'
    y_true : array_like (optional)
        If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    fontsize : font size of the title

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Notes
    -----
    This is currently very simple, no options or varnames yet.
    """
    fig, ax = utils.create_mpl_ax(ax)

    if exog_name == '':
        exog_name = 'variable %d' % exog_idx

    # Sort by the regressor so lines and band are drawn left to right
    regressor = res.model.exog[:, exog_idx]
    order = np.argsort(regressor)
    x1 = regressor[order]
    y = res.model.endog[order]

    ax.plot(x1, y, 'bo', label='observed')
    if y_true is not None:
        ax.plot(x1, y_true[order], 'b-', label='true')
    title = 'fitted versus regressor %s' % exog_name

    _, iv_l, iv_u = wls_prediction_std(res)
    ax.plot(x1, res.fittedvalues[order], 'k-', label='fitted')
    ax.fill_between(x1, iv_l[order], iv_u[order], alpha=0.1, color='k')

    ax.set_title(title, fontsize=fontsize)

    return fig
def plot_fit(res, exog_idx, exog_name='', y_true=None, ax=None,
             fontsize='small'):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared
    to fitted values, with a shaded band between the bounds returned by
    ``wls_prediction_std``.

    Parameters
    ----------
    res : result instance
        result instance with model.endog, model.exog and fittedvalues as
        attributes
    exog_idx : int
        index of regressor in exog matrix
    exog_name : str, optional
        Label used in the title; defaults to ``'variable <exog_idx>'``.
    y_true : array_like (optional)
        If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    fontsize : optional
        Passed to ``ax.set_title``.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    Notes
    -----
    This is currently very simple, no options or varnames yet.
    """
    fig, ax = utils.create_mpl_ax(ax)

    if exog_name == '':
        exog_name = 'variable %d' % exog_idx

    # maybe add option for wendog, wexog
    y = res.model.endog
    x1 = res.model.exog[:, exog_idx]
    # Sort once by the regressor so lines and the band are drawn left to right.
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label='observed')
    if y_true is not None:
        # Positional indexing on an ndarray works for lists and for pandas
        # objects whose index is not 0..n-1.
        ax.plot(x1, np.asarray(y_true)[x1_argsort], 'b-', label='true')
    # Same title in either case, so it is assigned once.
    title = 'fitted versus regressor %s' % exog_name

    _, iv_l, iv_u = wls_prediction_std(res)
    ax.plot(x1, res.fittedvalues[x1_argsort], 'k-', label='fitted')
    ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
                    color='k')
    ax.set_title(title, fontsize=fontsize)

    return fig
def odds_hat_l_u(self: LassoICSelector):
    """Return exponentiated OLS predictions with a +/- 2 std band.

    The prediction for ``self.X`` (after ``transform_to_ols``) is
    exponentiated, as are the prediction minus and plus twice the standard
    error reported by ``wls_prediction_std``.

    Returns
    -------
    tuple
        ``(odds_hat, oddshat_l, oddshat_u)``
    """
    design = self.transform_to_ols(self.X)
    log_odds = self.ols.predict(self.ols_results.params, design)
    # Only the standard error is used; the interval bounds that
    # wls_prediction_std also returns are discarded.
    log_odds_std, _, _ = wls_prediction_std(self.ols_results, design)
    half_width = 2 * log_odds_std
    return (
        np.exp(log_odds),  # from equation 5
        np.exp(log_odds - half_width),
        np.exp(log_odds + half_width),
    )
def plot_OLS_CI(self, model, x, y, y_true):
    """Plot data, true curve, OLS fit and its prediction-interval bounds.

    ``model`` is a fitted result passed to ``wls_prediction_std``; the
    upper and lower bounds are drawn as dashed red lines on a new
    8x6 figure. Nothing is returned.
    """
    _, lower, upper = wls_prediction_std(model)

    _, axes = plt.subplots(figsize=(8, 6))
    axes.plot(x, y, 'o', label="data")
    axes.plot(x, y_true, 'b-', label="True")
    axes.plot(x, model.fittedvalues, 'r--.', label="OLS")
    for bound in (upper, lower):
        axes.plot(x, bound, 'r--')
    axes.legend(loc='best')
def lm(x, y):
    "Fit y on x plus a constant with statsmodels OLS; return (predicted, lower, upper)."
    x = _plot_friendly(x)
    y = _plot_friendly(y)
    if _isdate(x[0]):
        # dates are regressed on their proleptic ordinal
        x = np.array([day.toordinal() for day in x])

    frame = pd.DataFrame({'x': x, 'y': y})
    frame['const'] = 1.
    ols_fit = sm.OLS(frame['y'], frame[['x', 'const']]).fit()

    frame['predicted_y'] = ols_fit.fittedvalues
    predstd, lower, upper = wls_prediction_std(ols_fit)
    frame['predstd'] = predstd
    frame['interval_l'] = lower
    frame['interval_u'] = upper
    return (frame['predicted_y'], frame['interval_l'], frame['interval_u'])
def lm(x, y):
    "Regress y on x and a constant (statsmodels OLS); return fitted values and interval bounds."
    x = _plot_friendly(x)
    y = _plot_friendly(y)
    if _isdate(x[0]):
        # date-like x values are replaced by their ordinals before fitting
        x = np.array([stamp.toordinal() for stamp in x])

    table = pd.DataFrame({"x": x, "y": y})
    table["const"] = 1.0
    regression = sm.OLS(table["y"], table[["x", "const"]]).fit()

    table["predicted_y"] = regression.fittedvalues
    std_err, low, high = wls_prediction_std(regression)
    table["predstd"] = std_err
    table["interval_l"] = low
    table["interval_u"] = high
    return (table["predicted_y"], table["interval_l"], table["interval_u"])
def _predict(self, fit, df):
    """
    Return a copy of df with predictions and interval bounds added

    Notes
    -----
    The returned frame gains the columns
        - 'predicted': output of ``fit.predict``
        - 'interval_l', 'interval_u': lower and upper bounds from
          ``wls_prediction_std`` with ``alpha = 1 - self.confint``.

    Attributes of self that are read:
        confint : float
            Confidence level; converted to alpha as ``1 - confint``
        allow_negative_predictions : bool
            If False, negative predictions are replaced by zero
        quote : callable
            Applied to every column name except 'Intercept' before the
            exogenous columns are selected

    Parameters
    ----------
    fit : Statsmodels fit
    df : pandas DataFrame
        Required; it is copied, never modified.

    Returns
    -------
    df_res : pandas DataFrame
        Copy of df with additional columns 'predicted', 'interval_u' and
        'interval_l'
    """
    result = df.copy()
    exog_names = fit.model.exog_names

    # A temporary constant column is needed when the model has an intercept.
    if 'Intercept' in exog_names:
        result['Intercept'] = 1.0
    result['predicted'] = fit.predict(result)

    if not self.allow_negative_predictions:
        negative = result['predicted'] < 0
        result.loc[negative, 'predicted'] = 0

    quoted = result.rename(
        columns=lambda name: name if name == 'Intercept' else self.quote(name))
    _, lower, upper = wls_prediction_std(
        fit, quoted[exog_names], alpha=1 - self.confint)
    result['interval_l'] = lower
    result['interval_u'] = upper

    if 'Intercept' in result:
        result = result.drop(labels=['Intercept'], axis=1)
    return result
def _predict(self, fit, df):
    """
    Return a copy of df extended with model predictions and their bounds

    Notes
    -----
    Columns added to the copy:
        - 'predicted': output of ``fit.predict``
        - 'interval_l', 'interval_u': bounds returned by
          ``wls_prediction_std`` for ``alpha = 1 - self.confint``.

    The result depends on these attributes of self:
        confint : float
            Confidence level, turned into alpha as ``1 - confint``
        allow_negative_predictions : bool
            When False, predictions below zero are set to zero
        quote : callable
            Maps each non-'Intercept' column name to the name used in
            ``fit.model.exog_names``

    Parameters
    ----------
    fit : Statsmodels fit
    df : pandas DataFrame
        Required; the input frame is left untouched.

    Returns
    -------
    df_res : pandas DataFrame
        Copy of df with additional columns 'predicted', 'interval_u' and
        'interval_l'
    """
    out = df.copy()
    model_columns = fit.model.exog_names
    has_intercept = 'Intercept' in model_columns

    if has_intercept:
        # constant regressor, removed again before returning
        out['Intercept'] = 1.0
    out['predicted'] = fit.predict(out)

    if not self.allow_negative_predictions:
        out.loc[out['predicted'] < 0, 'predicted'] = 0

    def _model_name(column):
        return column if column == 'Intercept' else self.quote(column)

    exog = out.rename(columns=_model_name)[model_columns]
    _, bound_low, bound_high = wls_prediction_std(
        fit, exog, alpha=1 - self.confint)
    out['interval_l'] = bound_low
    out['interval_u'] = bound_high

    if 'Intercept' in out:
        out = out.drop(labels=['Intercept'], axis=1)
    return out
def plot_locality(self, gene_list, bootstraps=10, num_windows=100,
                  sd_thresh=2):
    '''
        Make a fancy locality plot.

        Plots local against global degree for ``gene_list``, optionally
        together with bootstrapped gene sets, OLS fits for both, the
        ``wls_prediction_std`` bounds of the bootstrap fit and a windowed
        standard-deviation envelope of the bootstrap residuals.

        Parameters
        ----------
        gene_list : passed to ``self.locality`` and
            ``self.refgen.bootstrap_candidate_genes``
        bootstraps : int
            Number of bootstrap replicates; 0 disables everything
            bootstrap related.
        num_windows : int
            Number of consecutive windows (along sorted global degree)
            used for the residual standard deviation.
        sd_thresh : number
            Width of the envelope in window standard deviations.

        Returns
        -------
        The ``matplotlib.pyplot`` module (as before).
    '''
    # Generate a blank fig
    fig, ax = plt.subplots(figsize=(8, 6))
    # NOTE(review): Figure.hold and DataFrame.sort are legacy matplotlib /
    # pandas APIs; kept because the targeted library versions are not
    # visible from here.
    fig.hold(True)
    # Y axis is local degree (what we are TRYING to predict)
    degree = self.locality(gene_list).sort('global')
    ax.set_ylim(0, max(degree['local']))
    ax.set_xlim(0, max(degree['global']))
    if bootstraps > 0:
        # Honour the requested number of replicates (was hard-coded to 10).
        bs = pd.concat(
            [self.locality(
                self.refgen.bootstrap_candidate_genes(gene_list)
            ) for _ in range(bootstraps)]
        ).sort('global')
        ax.set_ylim(0, max(bs['local']))
        ax.set_xlim(0, max(bs['global']))
        plt.plot(bs['global'], bs['local'], 'ro', alpha=0.05,
                 label='Bootstraps')
    # Plot the bootstraps and the empirical
    plt.plot(degree['global'], degree['local'], 'bo', label='Empirical')
    emp_ols = sm.OLS(degree['local'], degree['global']).fit()
    ax.plot(degree['global'], emp_ols.fittedvalues, 'k:',
            label='Empirical OLS')

    if bootstraps > 0:
        # Get the OLS
        bs_ols = sm.OLS(bs['local'], bs['global']).fit()
        bs['resid'] = bs_ols.resid
        bs['fitted'] = bs_ols.fittedvalues
        ax.plot(bs['global'], bs_ols.fittedvalues, 'g--',
                label='bootstrap OLS')
        # Assign each (sorted) bootstrap row to one of num_windows
        # consecutive windows and take the residual std per window.
        window_tick = len(bs)/num_windows
        bs['window'] = [int(x/window_tick) for x in range(len(bs))]
        win_std = bs.groupby('window').apply(
            lambda df: df['resid'].std()).to_dict()
        bs['std_envelope'] = [win_std[x] for x in bs.window.values]
        # Plot confidence intervals
        prstd, iv_l, iv_u = wls_prediction_std(bs_ols)
        ax.plot(bs['global'], iv_u, 'g--', label='conf int.')
        ax.plot(bs['global'], iv_l, 'g--')
        # Plot the windowed standard-deviation envelope around the fit
        ax.plot(
            bs['global'], bs['fitted']+(sd_thresh*bs['std_envelope']), 'r--',
            label='{} s.d. envelope'.format(sd_thresh)
        )
        ax.plot(bs['global'], bs['fitted']-(sd_thresh*bs['std_envelope']),
                'r--')
    ax.set_xlabel('Number Global Interactions')
    ax.set_ylabel('Number Local Interactions')
    ax.legend(loc='best')
    return plt
def predict(self, ID, ALPHA=0.5):
    """Predict for the record identified by ``ID``.

    The first element returned by ``get_data(ID)`` goes through
    ``self.vectorizer`` and ``self.lsa``; elements 1..3 are squared,
    divided by ``self.sum`` and square-rooted. Both parts are concatenated
    and filtered by ``del_vector`` before being passed to ``self.results``.

    Returns
    -------
    tuple
        Squared point estimate, squared lower bound (clipped at zero before
        squaring) and squared upper bound from ``wls_prediction_std`` with
        ``alpha=ALPHA``.
    """
    record = get_data(ID)
    text_part = self.lsa.transform(self.vectorizer.transform([record[0]]))
    numeric_part = (np.array([record[1:4]])**2.0 / self.sum)**0.5
    features = del_vector(np.hstack([text_part, numeric_part]), self.dellist)

    point = self.results.predict(features)
    _, lower, upper = wls_prediction_std(self.results, features, alpha = ALPHA)
    if lower[0] < 0:
        lower[0] = 0
    return point[0]**2.0, lower[0]**2.0, upper[0]**2.0
def run_ordinary_least_squares(ols_dates, ols_data, statsmodels_settings):
    """
    Fit an OLS of ``ols_data`` on a constant, ``ols_dates`` and
    ``ols_dates ** statsmodels_settings.exponent``, and print its summary.

    Returns the fitted result alone when ``statsmodels_settings.confidence``
    is falsy; otherwise returns ``(result, lower, upper)`` where the bounds
    come from ``wls_prediction_std``.
    """
    regressors = np.column_stack(
        (ols_dates, ols_dates ** statsmodels_settings.exponent))
    design = sm.add_constant(regressors)
    regression = sm.OLS(ols_data, design).fit()
    print(regression.summary())

    if not statsmodels_settings.confidence:
        return regression
    _, lower, upper = wls_prediction_std(regression)
    return regression, lower, upper
def main():
    """Regress tip percentage on median household income and plot the fit.

    Loads a pickled DataFrame from the file ``OLS_data`` in the working
    directory, fits ``'Tip Perc'`` on ``'Median household income'``,
    ``'Income2'`` and ``'const'``, then shows a scatter plot with the fitted
    line and the ``wls_prediction_std`` bounds.
    """
    # SECURITY: unpickling executes arbitrary code; only use trusted files.
    # NOTE(review): text mode 'r' is kept from the original; Python 3's
    # pickle needs bytes ('rb') - confirm the target interpreter.
    with open('OLS_data', 'r') as handle:  # closed deterministically
        df = pickle.loads(handle.read())
    # NOTE(review): DataFrame.sort(columns=...) is a legacy pandas API.
    df = df.sort(columns='Median household income')
    y = df['Tip Perc']
    X = df[['Median household income', 'Income2', 'const']]
    result = sm.OLS(y, X).fit()
    yhat = result.predict(X)
    prstd, iv_l, iv_u = wls_prediction_std(result)

    income = X['Median household income']
    plt.scatter(income, y, color = 'b', alpha = 0.9)
    plt.plot(income, yhat, color = 'r', alpha = 0.7)
    plt.plot(income, iv_u, '--', color ='r', alpha = 0.7, linewidth = 0.7)
    plt.plot(income, iv_l, '--', color ='r', alpha = 0.7, linewidth = 0.7)
    plt.text(125000, 24.5, '$R^2$=$%.3f$' % result.rsquared,
             ha='center', va='center')
    plt.xlabel('Median Household Income ($)')
    plt.ylabel('Average Tip Percentage')
    plt.title('Regress Tip Percentage on Median Household Income')
    plt.show()
def main():
    """Regress tip percentage on the 'White' rate and plot the fit.

    Loads a pickled DataFrame from the file ``OLS_data`` in the working
    directory, fits ``"Tip Perc"`` on ``"White"`` and ``"const"``, then shows
    a scatter plot with the fitted line and the ``wls_prediction_std``
    bounds.
    """
    # SECURITY: unpickling executes arbitrary code; only use trusted files.
    # NOTE(review): text mode "r" is kept from the original; Python 3's
    # pickle needs bytes ("rb") - confirm the target interpreter.
    with open("OLS_data", "r") as handle:  # closed deterministically
        df = pickle.loads(handle.read())
    # NOTE(review): DataFrame.sort(columns=...) is a legacy pandas API.
    df = df.sort(columns="White")
    y = df["Tip Perc"]
    X = df[["White", "const"]]
    result = sm.OLS(y, X).fit()
    yhat = result.predict(X)
    prstd, iv_l, iv_u = wls_prediction_std(result)

    white = X["White"]
    plt.scatter(white, y, color="b", alpha=0.9)
    plt.plot(white, yhat, color="r", alpha=0.7)
    plt.plot(white, iv_u, "--", color="r", alpha=0.7, linewidth=0.7)
    plt.plot(white, iv_l, "--", color="r", alpha=0.7, linewidth=0.7)
    plt.text(1.05, 25, "$R^2$=$%.3f$" % result.rsquared,
             ha="center", va="center")
    plt.xlabel("White Rate")
    plt.ylabel("Average Tip Percentage")
    plt.title("Regress Tip Percentage on White Rate")
    plt.show()
def lm(x, y, alpha=ALPHA):
    """Fit an OLS with statsmodels and return the fit with its mean CI.

    ``x`` and ``y`` are passed through ``plot_friendly``; date-like ``x``
    values are replaced by their ordinals. A constant is added to ``x``.

    Returns
    -------
    tuple
        ``(fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)`` taken
        from the ``summary_table`` columns 'predicted_value',
        'mean_ci_95%_low' and 'mean_ci_95%_upp' (the column labels contain
        "95%" regardless of ``alpha``).
    """
    x, y = map(plot_friendly, [x, y])
    if _isdate(x[0]):
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    # Materialise the names: on Python 3 map() returns a one-shot iterator.
    df = pd.DataFrame(summary_values,
                      columns=list(map(snakify, summary_names)))
    return (df['predicted_value'],
            df['mean_ci_95%_low'],
            df['mean_ci_95%_upp'])
def returnOutliers(results, x, y, alpha=0.05):
    """Return the observations lying outside the prediction interval.

    Parameters
    ----------
    results : fitted regression result passed to ``wls_prediction_std``
    x : 2-D indexable
        Exogenous rows used for the interval; column 1 of each row is what
        gets reported for an outlier.
    y : iterable
        Observed values, in the same order as the rows of ``x``.
    alpha : float
        Forwarded to ``wls_prediction_std``.

    Returns
    -------
    (o_x, o_y) : tuple of lists
        ``x[i][1]`` and ``y[i]`` for every ``i`` whose ``y[i]`` is above the
        upper or below the lower bound.
    """
    # The first return value is the prediction standard error (unused).
    _, iv_l, iv_u = wls_prediction_std(results, exog=x, weights=None,
                                       alpha=alpha)
    o_x = []
    o_y = []
    for i, val in enumerate(y):
        if val > iv_u[i] or val < iv_l[i]:
            o_x.append(x[i][1])
            o_y.append(val)
    return o_x, o_y
def test_ci(self):
    """Check get_prediction against wls_prediction_std and list inputs."""
    res_wls = self.res_wls
    prstd, iv_l, iv_u = wls_prediction_std(res_wls)
    expected_ci = np.column_stack((iv_l, iv_u))
    col_names = ['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper',
                 'obs_ci_lower', 'obs_ci_upper']

    def check_against_wls(pred):
        # se_obs, the observation interval and the frame layout must agree
        # with the wls_prediction_std reference.
        ci = pred.conf_int(obs=True)
        assert_allclose(pred.se_obs, prstd, rtol=1e-13)
        assert_allclose(ci, expected_ci, rtol=1e-13)
        assert_equal(pred.summary_frame().columns.tolist(), col_names)

    check_against_wls(get_prediction(res_wls))
    check_against_wls(res_wls.get_prediction())

    def check_list_input(exog, expected_index):
        # A plain list must give the same result as the ndarray.
        from_array = res_wls.get_prediction(exog)
        ci_array = from_array.conf_int(obs=True)
        from_list = res_wls.get_prediction(exog.tolist())
        ci_list = from_list.conf_int(obs=True)
        assert_allclose(from_list.se_obs, from_array.se_obs, rtol=1e-13)
        assert_allclose(ci_list, ci_array, rtol=1e-13)
        assert_equal(from_list.summary_frame().index.values, expected_index)

    # check that list works, issue 4437
    check_list_input(res_wls.model.exog.mean(0), [0])
    check_list_input(res_wls.model.exog[-2:], [0, 1])
def test_pred_interval(show_plot=False):
    """Compare LinModel's upper prediction bound with statsmodels'.

    Fits ``LinModel`` and ``statsmodels`` OLS on the same generated data and
    asserts that the summed absolute difference between the two upper
    prediction bounds is below 0.1 % of the summed statsmodels bound.
    With ``show_plot`` the intervals of both implementations are drawn.
    """
    from ml_ext import examples
    (coefs, df) = examples.gen_simplemodel_data(n=50, k=3)
    # NOTE(review): DataFrame.sort is a legacy pandas API.
    df.sort('X1', inplace=True)
    lr = LinModel()
    X = df[df.columns[df.columns != 'y']]
    y = df.y
    lr.fit(X=X, y=y)
    lr.summary()
    df_ci = lr.get_confidence_interval_for_mean(X)
    df_pi = lr.get_prediction_interval(X)

    # Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm
    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)

    if show_plot:
        (fig, ax) = plt.subplots(nrows=2, ncols=1, figsize=[14, 12])
        cols = sns.color_palette('husl', n_colors=4)
        ax[0].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[0].plot(X.X1, df_pi['upper_pred'], label='pred', color=cols[1],
                   alpha=0.5)
        ax[0].plot(X.X1, df_pi['lower_pred'], color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_ci['upper_mean'], color=cols[2], alpha=0.5)
        ax[0].plot(X.X1, df_ci['lower_mean'], label='mean_ci', color=cols[2],
                   alpha=0.5)
        ax[0].scatter(X.X1, df_pi['y_hat'], label='y_hat', color=cols[0],
                      alpha=0.5)
        ax[0].legend(loc='best')

        ax[1].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[1].scatter(X.X1, df_ci['y_hat'], label='y_hat', color=cols[0],
                      alpha=0.5)
        ax[1].plot(X.X1, iv_u, label='wls', color=cols[1], alpha=0.5)
        ax[1].plot(X.X1, iv_l, color=cols[1], alpha=0.5)
        ax[1].legend(loc='best')

    # Relative difference between the upper bounds, in percent. Absolute
    # values are used so that opposite-signed differences cannot cancel and
    # a large negative difference cannot pass the check.
    overall_diff = (100 * numpy.sum(numpy.abs(iv_u - df_pi['upper_pred']))
                    / abs(numpy.sum(iv_u)))
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    # must agree to within 0.1 %
    assert overall_diff < 0.1
def lm(x, y, alpha=ALPHA):
    """Fit an OLS with statsmodels and return x, the fit and its mean CI.

    ``x`` must support ``.iloc``; if its first element is date-like, the
    regression runs on ordinals and the returned ``x`` is converted back to
    ``Timestamp`` objects. A constant is added to ``x``.

    Returns
    -------
    tuple
        ``(x, fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)``;
        the last three are plain arrays taken from the ``summary_table``
        columns 'predicted_value', 'mean_ci_95%_low' and 'mean_ci_95%_upp'
        (the labels contain "95%" regardless of ``alpha``).
    """
    x_is_date = _isdate(x.iloc[0])
    if x_is_date:
        x = np.array([i.toordinal() for i in x])
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    _, summary_values, summary_names = summary_table(fit, alpha=alpha)
    # Materialise the names: on Python 3 map() returns a one-shot iterator.
    df = pd.DataFrame(summary_values,
                      columns=list(map(_snakify, summary_names)))
    # TODO: indexing w/ data frame is messing everything up
    # (hence plain arrays are returned instead of Series)
    fittedvalues = df['predicted_value'].values
    predict_mean_ci_low = df['mean_ci_95%_low'].values
    predict_mean_ci_upp = df['mean_ci_95%_upp'].values

    if x_is_date:
        x = [Timestamp.fromordinal(int(i)) for i in x]
    return (x, fittedvalues, predict_mean_ci_low, predict_mean_ci_upp)
def test_nonlinear():
    """Fit an OLS on simulated non-linear data and plot it.

    The design matrix has the columns ``x``, ``sin(x)``, ``(x - 5)**2`` and
    a constant. The summary and a few result attributes are printed, then
    data, true curve, fitted values and the ``wls_prediction_std`` bounds
    are plotted. Written so that it runs on both Python 2 and Python 3.
    """
    np.random.seed(111)
    n_sample = 50
    max_val = 30
    sig = 0.5

    x = np.linspace(0, max_val, n_sample)
    X = np.c_[x, np.sin(x), (x - 5)**2, np.ones(n_sample)]
    beta = np.array([0.5, 0.5, -0.02, 5.0])
    e = np.random.normal(size=n_sample)
    y_true = np.dot(X, beta)
    y = y_true + sig * e

    # Single-argument print(...) behaves identically on Python 2 and 3.
    for i in range(5):
        print('%3d: %s %s' % (i, X[i, :], y[i]))
    print('')
    print('')

    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    print('')
    print('')
    print(results.params)
    print(results.rsquared)
    print(results.bse)
    print(results.predict())

    plt.figure()
    plt.plot(x, y, 'o', x, y_true, 'b-')
    prstd, iv_l, iv_u = wls_prediction_std(results)
    plt.plot(x, results.fittedvalues, 'r--.')
    plt.plot(x, iv_u, 'r--')
    plt.plot(x, iv_l, 'r--')
    plt.title('blue: true, red: OLS')
    plt.show()
def predict(self, ID, ALPHA=0.5):
    """Predict for the record identified by ``ID`` using quadratic features.

    The first element returned by ``get_data(ID)`` goes through
    ``self.vectorizer`` and ``self.lsa``; elements 1..3 are squared,
    divided by ``self.sum`` and square-rooted. The concatenated row is
    extended with all pairwise products ``v[i] * v[j]`` for ``i <= j``
    and filtered by ``del_vector`` before being passed to ``self.results``.

    Returns
    -------
    tuple
        Squared point estimate, squared lower bound (clipped at zero before
        squaring) and squared upper bound from ``wls_prediction_std`` with
        ``alpha=ALPHA``.
    """
    list1 = get_data(ID)
    vector = self.vectorizer.transform([list1[0]])
    vector = self.lsa.transform(vector)
    array = np.array([list1[1:4]])**2.0 / self.sum
    array = array**0.5
    vector = np.hstack([vector, array])

    # Append the products for every pair i <= j in one step.
    # triu_indices enumerates (0,0), (0,1), ..., (1,1), ... which is the
    # order the former nested loop appended them in; a single hstack avoids
    # reallocating the row once per product.
    length = vector.shape[1]
    row = vector[0]
    i_idx, j_idx = np.triu_indices(length)
    products = row[i_idx] * row[j_idx]
    vector = np.hstack([vector, products[np.newaxis, :]])

    vector = del_vector(vector, self.dellist)
    estimated = self.results.predict(vector)
    prstdn, infa, supa = wls_prediction_std(self.results, vector,
                                            alpha = ALPHA)
    if infa[0] < 0:
        infa[0] = 0
    return estimated[0]**2.0, infa[0]**2.0, supa[0]**2.0
def test_ci(self):
    """Check both get_prediction entry points against wls_prediction_std."""
    res_wls = self.res_wls
    prstd, iv_l, iv_u = wls_prediction_std(res_wls)
    reference_ci = np.column_stack((iv_l, iv_u))
    col_names = ['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper',
                 'obs_ci_lower', 'obs_ci_upper']

    def verify(prediction):
        # Standard error, observation interval and summary-frame layout
        # must all match the reference values.
        interval = prediction.conf_int(obs=True)
        assert_allclose(prediction.se_obs, prstd, rtol=1e-13)
        assert_allclose(interval, reference_ci, rtol=1e-13)
        frame = prediction.summary_frame()
        assert_equal(frame.columns.tolist(), col_names)

    # module-level function first, then the results method
    verify(get_prediction(res_wls))
    verify(res_wls.get_prediction())
def linear(data):
    """Fit a quadratic trend to one column of ``data`` and save a plot.

    The column ``data[headers[1]]`` is regressed on a constant, the row
    position ``0..len(data)-1`` and its square. The summary and a few result
    attributes are printed; data, fitted values and the
    ``wls_prediction_std`` bounds are written to ``linear.png``.
    """
    # Regression
    x = np.arange(len(data)).reshape(-1, 1)
    y = np.array(list(data[headers[1]])).reshape(-1, 1)

    x_line = np.column_stack((x, x**2))
    x_cons = sm.add_constant(x_line)
    model = sm.OLS(y, x_cons)
    results = model.fit()
    print (results.summary())
    print ('Coefficients: ', results.params)
    print ('Standard errors: ', results.bse)
    print ('R2: ', results.rsquared)

    # Plot
    prstd, iv_l, iv_u = wls_prediction_std(results)
    fig, ax = plt.subplots()
    # Build one string; a stray comma previously made the title a tuple,
    # which matplotlib rendered as its repr.
    title = "Linear Regression {}".format(headers[1])
    plt.title(title)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    # Booleans instead of "on"/"off": every non-empty string is truthy, so
    # "off" no longer disables anything on current matplotlib.
    plt.tick_params(axis="both", which="both", bottom=True, top=False,
                    labelbottom=True, left=False, right=False,
                    labelleft=True)
    ax.plot(x, y, label="data")
    ax.plot(x, results.fittedvalues, 'r--.', label="OLS")
    ax.plot(x, iv_u, 'c--')
    ax.plot(x, iv_l, 'c--')
    ax.legend(loc='best')
    plt.savefig('linear.png', bbox_inches="tight")
# Fit and summary: res = sm.OLS(y, X).fit() print(res.summary()) # Extract other quantities of interest: print('Parameters: ', res.params) print('Standard errors: ', res.bse) print('Predicted values: ', res.predict()) # Draw a plot to compare the true relationship to OLS predictions. Confidence intervals around the predictions are built using the ``wls_prediction_std`` command. prstd, iv_l, iv_u = wls_prediction_std(res) fig, ax = plt.subplots() ax.plot(x, y, 'o', label="data") ax.plot(x, y_true, 'b-', label="True") ax.plot(x, res.fittedvalues, 'r--.', label="OLS") ax.plot(x, iv_u, 'r--') ax.plot(x, iv_l, 'r--') ax.legend(loc='best'); # ## OLS with dummy variables # # We generate some artificial data. There are 3 groups which will be modelled using dummy variables. Group 0 is the omitted/benchmark category.
# Tabulate the standard errors (rounded to 4 decimals), one row per estimator.
se = np.round(se,4)
colnames = ['x1', 'const']
# NOTE(review): 'OLS_HC3' appears twice; the fifth label is presumably meant
# to be 'OLS_HC2' - confirm against the rows stacked into `se`.
rownames = ['WLS', 'OLS', 'OLS_HC0', 'OLS_HC1', 'OLS_HC3', 'OLS_HC3']
tabl = SimpleTable(se, colnames, rownames, txt_fmt=default_txt_fmt)
print(tabl)

# Calculate OLS prediction interval:
# prediction variance = residual MSE + x' cov(b) x for each row of X
covb = res_ols.cov_params()
prediction_var = res_ols.mse_resid + (X * np.dot(covb,X.T).T).sum(1)
prediction_std = np.sqrt(prediction_var)
# t quantile for a two-sided 95% interval
tppf = stats.t.ppf(0.975, res_ols.df_resid)

prstd_ols, iv_l_ols, iv_u_ols = wls_prediction_std(res_ols)

# Draw a plot to compare predicted values in WLS and OLS:
prstd, iv_l, iv_u = wls_prediction_std(res_wls)

fig, ax = plt.subplots()
ax.plot(x, y, 'o', label="Data")
ax.plot(x, y_true, 'b-', label="True")
# OLS
ax.plot(x, res_ols.fittedvalues, 'r--')
ax.plot(x, iv_u_ols, 'r--', label="OLS")
ax.plot(x, iv_l_ols, 'r--')
# WLS
ax.plot(x, res_wls.fittedvalues, 'g--.')
def summary_table(res, alpha=0.05):
    '''generate summary table of outlier and influence similar to SAS

    Parameters
    ----------
    res : results instance
        fitted regression results; must be accepted by ``OLSInfluence`` and
        ``wls_prediction_std``
    alpha : float
       significance level for confidence interval; used for both the mean
       and the prediction interval

    Returns
    -------
    st : SimpleTable instance
       table with results that can be printed
    data : ndarray
       calculated measures and statistics for the table
    ss2 : list of strings
       column_names for table (Note: rows of table are observations).
       The labels contain "95%" regardless of ``alpha``; they are kept
       fixed because callers select columns by these names.
    '''
    from scipy import stats
    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    infl = OLSInfluence(res)

    # standard error for predicted mean
    # Note: using hat_matrix only works for fitted values
    predict_mean_se = np.sqrt(infl.hat_matrix_diag*res.mse_resid)

    tppf = stats.t.isf(alpha/2., res.df_resid)
    predict_mean_ci = np.column_stack([
        res.fittedvalues - tppf * predict_mean_se,
        res.fittedvalues + tppf * predict_mean_se])

    # standard error for predicted observation
    # alpha is forwarded so this interval uses the same level as the mean
    # interval above (it used to be fixed at the 0.05 default).
    _, predict_ci_low, predict_ci_upp = wls_prediction_std(res, alpha=alpha)
    predict_ci = np.column_stack((predict_ci_low, predict_ci_upp))

    # standard deviation of residual
    resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag))

    data = np.column_stack([
        np.arange(res.nobs) + 1,
        res.model.endog,
        res.fittedvalues,
        predict_mean_se,
        predict_mean_ci[:, 0],
        predict_mean_ci[:, 1],
        predict_ci[:, 0],
        predict_ci[:, 1],
        res.resid,
        resid_se,
        infl.resid_studentized_internal,
        infl.cooks_distance[0]
    ])

    ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue',
           'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp',
           'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual',
           'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]
    colnames = ss2

    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    from copy import deepcopy
    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    # first column is the observation number, the rest are floats
    fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1)
    st = SimpleTable(data, headers=colnames, txt_fmt=fmt,
                     html_fmt=fmt_html)

    return st, data, ss2
__author__ = 'Yas'
# Small demo: weighted least squares through the origin on seven points,
# with the observation at X=4 given zero weight, plotted together with the
# bounds from wls_prediction_std. Runs on Python 2 and Python 3.
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

np.random.seed(1024)

X = [1, 2, 3, 4, 5, 6, 7]
#X = range(1,8)
Y = [1, 7, 3, 20, 5, 6, 2]
#X = sm.add_constant(X)

wls_model = sm.WLS(Y, X, weights=[0.1, 0.1, 0.1, 0.0, 0.1, 0.1, 0.1])
res_wls = wls_model.fit()
# Single-argument print(...) behaves identically on Python 2 and 3.
print(res_wls.params)
print(res_wls.tvalues)
#print(results.t_test([1, 0]))

plt.plot(X, res_wls.fittedvalues, 'g--')
prstd, iv_l, iv_u = wls_prediction_std(res_wls)
#print(results.f_test([0, 1]))
plt.ylim(-50, 50)
plt.xlim(0, 30)
plt.plot(X, Y, 'o')
plt.plot(X, iv_u, 'b--')  # upper bound
plt.plot(X, iv_l, 'r--')  # lower bound
#plt.plot(X,res_wls.f, '.')
plt.show()
#Read data filename = 'griliches.dta' df = rd_stata(filename) x = [] #===========Least square regression #===================== #lw VS multivariates #===================== x= df[['rns','mrt','smsa','med','iq','kww','age','s','expr']] y = df.lw X = sm.add_constant(x) model = sm.OLS(y, X) results = model.fit() print(results.summary()) #plot the results plt.figure(); plt.plot(x, y, 'o'); prstd, iv_l, iv_u = wls_prediction_std(results) plt.plot(x, results.fittedvalues, 'g--.') plt.plot(x, iv_u, 'r--') plt.plot(x, iv_l, 'r--') plt.xlabel('multivariates') plt.ylabel('Log Wage') plt.title('Multivariates'); plt.savefig('1-multivariates.png') plt.show()
def regression_and_scatter(df_x_path, x_name, y_names,
                           df_y_path=None,
                           roi_normalize=True,
                           confidence_intervals=False,
                           prediction_intervals=False,
                           animals=None,
                           ):
    """Scatter-plot several columns against one and overlay OLS fits.

    Parameters
    ----------
    df_x_path : path of a CSV file read with its first column as index
    x_name : name of the column used as regressor
    y_names : iterable of column names, one regression and colour each
    df_y_path : optional second CSV, concatenated column-wise
    roi_normalize : if true, the columns named by ``x_cols`` are divided by
        their mean
    confidence_intervals : shade a confidence band for the regression line
    prediction_intervals : shade the band from ``wls_prediction_std``
        (``alpha=0.05``)
    animals : optional index labels used to select rows

    All drawing happens on a newly created figure; nothing is returned.
    """
    df = pd.read_csv(df_x_path, index_col=0)

    if df_y_path:
        dfy = pd.read_csv(df_y_path, index_col=0)
        df = pd.concat([df, dfy], axis=1)

    if animals:
        df = df.loc[animals]

    if roi_normalize:
        # NOTE(review): x_cols is not defined inside this function;
        # presumably a module-level name - confirm.
        df[x_cols] = df[x_cols].apply(lambda x: (x / x.mean()))

    # The normalisation, figure creation and loop header used to appear
    # twice; each is performed exactly once here.
    fig, ax = plt.subplots()
    ax.set_xmargin(0.1)
    ax.set_ymargin(0.11)

    for ix, y_name in enumerate(y_names):
        color = qualitative_colorset[ix]
        x = df[[x_name]].values
        y = df[[y_name]].values
        x_ = sm.add_constant(x)  # constant intercept term
        fitted = sm.OLS(y, x_).fit()

        x_pred = np.linspace(x.min(), x.max(), 50)
        x_pred2 = sm.add_constant(x_pred)
        y_pred = fitted.predict(x_pred2)

        if confidence_intervals:
            # y has shape (n, 1) while predict returns (n,); flatten first,
            # otherwise the subtraction broadcasts to an (n, n) matrix.
            y_err = y.ravel() - fitted.predict(x_)
            mean_x = x.mean()
            n = len(x)
            dof = n - fitted.df_model - 1
            # NOTE(review): ppf(0.05) yields a two-sided 90% band whereas
            # the prediction band below uses alpha=0.05 (95%) - confirm the
            # intended level.
            t = stats.t.ppf(0.05, df=dof)
            s_err = np.sum(np.power(y_err, 2))
            # Sum of squared deviations of the observed x (previously
            # computed from the prediction grid).
            sxx = np.sum(np.power(x, 2)) - n * np.power(mean_x, 2)
            conf = t * np.sqrt(
                (s_err / (n - 2))
                * (1.0 / n + np.power(x_pred - mean_x, 2) / sxx))
            upper_conf = y_pred + abs(conf)
            lower_conf = y_pred - abs(conf)
            ax.fill_between(x_pred, lower_conf, upper_conf, color=color,
                            alpha=0.3)

        if prediction_intervals:
            sdev_pred, lower_pred, upper_pred = wls_prediction_std(
                fitted, exog=x_pred2, alpha=0.05)
            ax.fill_between(x_pred, lower_pred, upper_pred, color=color,
                            alpha=0.08)

        ax.plot(x, y, 'o', color=color, markeredgecolor=color)
        # Booleans rather than "off": non-empty strings are truthy.
        ax.tick_params(axis="both", which="both", bottom=False, top=False,
                       length=0)
        ax.plot(x_pred, y_pred, '-', color=color, linewidth=2, label=y_name)
    plt.legend(loc="best")
def first_ens_prod_fig():
    """ This plot is based on a production model taking into account:
        Tout, vWind and the production 24 hours before

        The model is fitted on the first period (ts1) and applied to a
        second, validation period (ts2). The fitted coefficients are also
        applied to every ensemble member; the spread between members is
        combined with the model's prediction standard error into a joint
        95% band. Two PDF figures are written below ``figures/``, error
        measures are printed, and
        ``(res, all_ens_data, all_ts, fit_data['prod'], vali_data['prod'])``
        is returned.

        """
    z95 = 1.9599   # normal quantile used for all 95% bands
    n_members = 25  # number of ensemble members (columns TouN / vWindN)

    plt.close('all')
    cols = ['Tout', 'vWind', 'prod24h_before']

    ts1 = ens.gen_hourly_timesteps(dt.datetime(2015,12,17,1), dt.datetime(2016,1,15,0))
    ts2 = ens.gen_hourly_timesteps(dt.datetime(2016,1,20,1), dt.datetime(2016,1,28,0))

    #load the data
    fit_data = ens.repack_ens_mean_as_df()
    fit_data['prod24h_before'] = sq.fetch_production(dt.datetime(2015,12,16,1), dt.datetime(2016,1,14,0))

    vali_data = ens.repack_ens_mean_as_df(dt.datetime(2016,1,20,1), dt.datetime(2016,1,28,0))
    vali_data['prod24h_before'] = sq.fetch_production(dt.datetime(2016,1,19,1), dt.datetime(2016,1,27,0))

    # do the fit
    X = fit_data[cols]
    y = fit_data['prod']
    res = mlin_regression(y, X, add_const=True)

    fig, [ax1, ax2] = plt.subplots(2,1, figsize=(40,20))

    # load ensemble data
    ens_data1 = ens.load_ens_timeseries_as_df(ts_start=ts1[0], ts_end=ts1[-1])
    ens_data1['prod24h_before'] = fit_data['prod24h_before']
    ens_data2 = ens.load_ens_timeseries_as_df(ts_start=ts2[0], ts_end=ts2[-1])
    ens_data2['prod24h_before'] = vali_data['prod24h_before']

    all_ens_data = pd.concat([ens_data1, ens_data2])
    all_ts = ts1 + ts2

    # calculate production for each ensemble member
    ens_prods = np.zeros((len(all_ts), n_members))
    for i in range(n_members):
        ens_cols = ['Tout' + str(i), 'vWind' + str(i), 'prod24h_before']
        # same coefficients, keyed by this member's column names
        ens_params = pd.Series({'Tout' + str(i):res.params['Tout'],
                                'vWind' + str(i):res.params['vWind'],
                                'const':res.params['const'],
                                'prod24h_before':res.params['prod24h_before']})
        ens_prods[:,i] = linear_map(all_ens_data, ens_params, ens_cols)

    # calculate combined confint
    prstd, iv_l, iv_u = wls_prediction_std(res)
    mean_conf_int_spread = np.mean(res.fittedvalues - iv_l)
    # For the validation period the model std is approximated by the mean
    # half-width of the fitted-period interval divided by the 95% quantile.
    model_std = np.concatenate([prstd, (1./z95)*mean_conf_int_spread*np.ones(len(ts2))])
    ens_std = ens_prods.std(axis=1)
    combined_std = np.sqrt(model_std**2 + ens_std**2)
    all_prod_model = np.concatenate([res.fittedvalues, linear_map(vali_data, res.params, cols)])
    combined_ub95 = all_prod_model + z95*combined_std
    combined_lb95 = all_prod_model - z95*combined_std

    # plot confint
    ax1.fill_between(all_ts, combined_lb95, combined_ub95, label='Combined 95% conf. int.')
    ax1.fill_between(all_ts, all_prod_model - z95*ens_std, all_prod_model + z95*ens_std, facecolor='grey', label='Ensemble 95% conf. int.')

    # plot ensemble models
    ax1.plot_date(all_ts, ens_prods, '-', lw=0.5)

    ax1.plot_date(ts1, y, 'k-', lw=2, label='Actual production')
    ax1.plot_date(ts1, res.fittedvalues,'r-', lw=2, label='Model on ensemble mean')

    ax1.plot_date(ts2, vali_data['prod'], 'k-', lw=2, label='')
    ax1.plot_date(ts2, linear_map(vali_data, res.params, cols), 'r-', lw=2)
    ax1.set_ylabel('[MW]')
    ax1.legend(loc=2)

    vali_resid = linear_map(vali_data, res.params, cols) - vali_data['prod']
    ax2.plot_date(ts1, res.resid, '-', label='Residual, fitted data')
    ax2.plot_date(ts2, vali_resid, '-', label='Residual, validation data')
    ax2.set_ylabel('[MW]')
    ax2.legend(loc=2)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("MAE = " + str(mae(vali_resid)))
    print("MAPE = " + str(mape(vali_resid, vali_data['prod'])))
    print("RMSE = " + str(rmse(vali_resid)))
    print("ME = " + str(np.mean(vali_resid)))

    print("MAE (fit) = " + str(mae(res.resid)))
    print("MAPE (fit) = " + str(mape(res.resid, fit_data['prod'])))
    print("RMSE (fit)= " + str(rmse(res.resid)))
    print("ME (fit)= " + str(np.mean(res.resid)))

    plt.savefig('figures/ens_prod_models.pdf', dpi=600)
    plt.figure()
    plt.plot_date(all_ts, ens_std)
    plt.ylabel('Std. of ensemble production models [MW]')
    plt.savefig('figures/std_ens_prod_models.pdf', dpi=600)

    sns.jointplot(x=ens_std, y=np.concatenate([res.resid, vali_resid]))

    return res, all_ens_data, all_ts, fit_data['prod'], vali_data['prod']