import matplotlib.pyplot as plt
from yellowbrick.regressor import PredictionError


def prediction_error_plot(lin_model, x_train, y_train, x_test, y_test):
    fig = plt.figure(figsize=(16, 12))
    ax1 = fig.add_subplot(111)
    visualizer_pred_err = PredictionError(lin_model, ax=ax1)
    visualizer_pred_err.fit(x_train, y_train)    # Fit the training data to the visualizer
    visualizer_pred_err.score(x_test, y_test)    # Evaluate the model on the test data
    visualizer_pred_err.show()
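# A minimal, hypothetical usage sketch for the helper above (not from the original
# source): synthetic data and a plain LinearRegression stand in for the caller's
# model and train/test split.
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=5, noise=15.0, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
prediction_error_plot(LinearRegression(), x_train, y_train, x_test, y_test)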
def prediction_error_plot(self) -> None:
    """Plot the actual targets from the dataset against the predicted values
    generated by our model. This allows us to see how much variance is in the model.
    """
    visualizer = PredictionError(self.trained_model)
    visualizer.fit(self.X_train, self.y_train)    # Fit the training data to the visualizer
    visualizer.score(self.X_test, self.y_test)    # Evaluate the model on the test data
    save_dir = f"{self.plots_dir}/prediction_error_plot_{self.model_id}.png"
    visualizer.show(outpath=save_dir)
    if not LOCAL:
        upload_to_s3(save_dir, f'plots/prediction_error_plot_{self.model_id}.png', bucket=S3_BUCKET_NAME)
    plt.clf()
def lasso_regression(X_train, y_train, X_test, y_test, plot):
    """Perform a lasso regression with built-in CV and plot the feature importance."""
    # Fit the lasso regression (alpha selected by built-in cross-validation)
    reg = LassoCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    print(
        "Lasso picked "
        + str(sum(coef != 0))
        + " variables and eliminated the other "
        + str(sum(coef == 0))
        + " variables"
    )

    # Extract the feature importance
    imp_coef = coef.sort_values()

    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()

        # Plot the prediction error
        visualizer = PredictionError(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)    # Evaluate the model on the test data
        visualizer.show()                   # Finalize and render the figure

        # Visualize the residuals of the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)    # Evaluate the model on the test data
        visualizer.show()                   # Finalize and render the figure

    # Use the test data to calculate predictions and scores
    y_pred = reg.predict(X_test)

    # Return metrics
    return {
        "name": "Lasso Regression",
        "R squared": reg.score(X_test, y_test),
        "RMSE": rmse(y_test, y_pred),
        "R squared training": reg.score(X_train, y_train),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
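# The function above returns rmse(y_test, y_pred) but does not define rmse; a
# minimal sketch of the helper it presumably relies on (an assumption, not the
# original implementation):
import numpy as np
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))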
lm5 = LinearRegression().fit(x_train, y_train)
lm5_pred = lm5.predict(x_test)

print("RMSE = ", np.sqrt(mean_squared_error(y_test, lm5_pred)))
print("R^2 = ", r2_score(y_test, lm5_pred))


# In[30]:


from yellowbrick.regressor import PredictionError, ResidualsPlot

visualizer = PredictionError(lm5).fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()


# In[32]:


# TASK 7: INTERACTION EFFECT - SYNERGY
advert['interaction'] = advert['TV'] * advert['radio']

x = advert[['TV', 'radio', 'interaction']]
y = advert.sales

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

lm6 = LinearRegression().fit(x_train, y_train)
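# Hypothetical continuation (not one of the cells shown above): evaluating the
# interaction-effect model lm6 in the same way as lm5, including the same
# PredictionError plot.
lm6_pred = lm6.predict(x_test)
print("RMSE = ", np.sqrt(mean_squared_error(y_test, lm6_pred)))
print("R^2 = ", r2_score(y_test, lm6_pred))

visualizer = PredictionError(lm6).fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show()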
def scikit_learn_method(x, y, min_x, max_x, max_y, ln_bool, df=all_scopus, test_size=0.2, random_state=0):
    # https://stackoverflow.com/questions/42988348/typeerror-cannot-convert-the-series-to-class-float
    if ln_bool:
        y = np.log(y)

    # random_state defaults to 0 for a consistent seed
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)

    # reshape(-1, 1) - one feature per sample; no need to reshape y
    # https://datatofish.com/dropna/
    # https://stackoverflow.com/questions/18691084/what-does-1-mean-in-numpy-reshape
    # https://stackoverflow.com/questions/53723928/attributeerror-series-object-has-no-attribute-reshape
    # https://stackoverflow.com/questions/35082140/preprocessing-in-scikit-learn-single-sample-depreciation-warning
    x_train = x_train.values.reshape(-1, 1)
    x_test = x_test.values.reshape(-1, 1)

    model_withOutliers = LinearRegression()
    model_withOutliers = model_withOutliers.fit(x_train, y_train)
    print('y-hat = %sx + %s' % (model_withOutliers.coef_[0], model_withOutliers.intercept_))

    # https://stackoverflow.com/questions/41635448/how-can-i-draw-scatter-trend-line-on-matplot-python-pandas/41635626
    from sklearn.metrics import r2_score

    # Scatter plot with the fitted trend line, including outliers
    plt.scatter(x, y)
    plt.title('With outliers')
    m, b = model_withOutliers.coef_[0], model_withOutliers.intercept_
    plt.plot(x, m * x + b)
    text = f"$y={m:0.3f}\\;x{b:+0.3f}$\n$R^2 = {r2_score(y, m * x + b):0.3f}$"
    plt.gca().text(0.05, 0.95, text, transform=plt.gca().transAxes,
                   fontsize=14, verticalalignment='bottom')
    plt.show()

    # https://www.scikit-yb.org/en/latest/api/regressor/peplot.html
    from sklearn.linear_model import Lasso
    from yellowbrick.regressor import PredictionError

    lasso_model = Lasso()
    visualizer = PredictionError(lasso_model)
    visualizer.fit(x_train, y_train)    # Fit the training data to the visualizer
    visualizer.score(x_test, y_test)    # Evaluate the model on the test data
    visualizer.show()

    # https://stackoverflow.com/questions/28876243/how-to-delete-the-current-row-in-pandas-dataframe-during-df-iterrows
    # Same scatter, zoomed in so the outliers fall outside the axis limits
    plt.xlim(min_x, max_x)
    plt.ylim(0, max_y)
    plt.title('Without outliers')
    plt.scatter(x, y)
    text = f"$y={m:0.3f}\\;x{b:+0.3f}$\n$R^2 = {r2_score(y, m * x + b):0.3f}$"
    plt.gca().text(0.05, 0.95, text, transform=plt.gca().transAxes,
                   fontsize=14, verticalalignment='bottom')
    plt.show()

    # Root mean squared error of the with-outliers model on the test set
    y_pred_with_outliers = model_withOutliers.predict(x_test)
    rms_value = np.sqrt(np.mean((y_test - y_pred_with_outliers) ** 2))
    print('Root mean squared, with outliers:', rms_value)
imr = SimpleImputer(strategy='median')
X = imr.fit_transform(X)

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Instantiate the linear model and visualizer
model = Lasso()
visualizer = PredictionError(model, size=(1080, 720))

visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.show()                   # Draw the data


# In[88]:


from sklearn.linear_model import LinearRegression

regr_linear = LinearRegression()

# Load regression dataset
# Index(['confirmed', 'deaths', 'recovered',
#        'active', 'incident_rate', 'people_tested', 'people_hospitlized',
#        'mortality_rate', 'testing_rate', 'hospitalization_rate'],
#       dtype='object')
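# Hypothetical continuation (the original cell stops after instantiating regr_linear):
# a sketch of fitting the model on the columns listed above and drawing the same
# PredictionError plot. The file name and the choice of 'deaths' as the target are
# assumptions for illustration only.
import pandas as pd

covid_df = pd.read_csv('covid_data.csv')  # hypothetical file containing the columns above
features = ['confirmed', 'recovered', 'active', 'incident_rate', 'people_tested',
            'people_hospitlized', 'mortality_rate', 'testing_rate', 'hospitalization_rate']
X = covid_df[features]
y = covid_df['deaths']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

visualizer = PredictionError(regr_linear, size=(1080, 720))
visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.show()                   # Draw the data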
def prediction_error_plot(self):
    # self.pipe is assumed to be an already-fitted estimator or pipeline, since
    # score() is called without a preceding fit() on the visualizer.
    visualizer = PredictionError(self.pipe)
    visualizer.score(self.X_test, self.y_test)
    return visualizer.show()