def uniRegression(p, xLabel, yLabel):
    """Fit a one-feature linear regression and plot its fit and residuals.

    The rows of ``p`` are shuffled, the trailing ~20% are held out as the test
    split and a ``LinearRegression`` is fitted on the remainder.  The test
    points and the predicted line are plotted, the figure is written to
    ``image_path.format(image_num)`` and then displayed.  R2 and MSE on the
    test split are printed and a yellowbrick ``ResidualsPlot`` is shown.

    Args:
        p: DataFrame holding the ``xLabel`` and ``yLabel`` columns.
        xLabel: Name of the feature column; also used as the x-axis label.
        yLabel: Name of the target column; also used as the y-axis label.

    Side effects:
        Writes one image file and increments the module-level ``image_num``.
    """
    global image_num

    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)

    # Split train and test.  A positive split index is used instead of a
    # negative slice so that a test size of 0 does not silently turn into
    # ``[:0]`` (an empty training set).
    n_rows = p.shape[0]
    split = n_rows - round(n_rows * 0.2)
    xCol = p[xLabel].values.reshape(-1, 1)
    yCol = p[yLabel].values.reshape(-1, 1)
    X_train, X_test = xCol[:split], xCol[split:]
    y_train, y_test = yCol[:split], yCol[split:]

    # Fit linear regression model
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    # Make predictions
    predicted = lr.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)

    # Plot expected vs. predicted
    plt.scatter(X_test, y_test, color='black')
    plt.plot(X_test, predicted, color='blue', linewidth=2)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    # Save BEFORE showing: once show() returns the figure may already have
    # been released by the backend, which would produce an empty image.
    plt.savefig(image_path.format(image_num), bbox_inches='tight')
    image_num += 1
    plt.show()
    # Close the scatter figure so the residuals plot below starts on a fresh
    # figure instead of drawing on top of these axes.
    plt.close()

    print("R2 = ", r2)
    print("MSE = ", mse)

    # Plot residuals
    visualizer = ResidualsPlot(lr)
    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
def train_model(rf, healed_data, target_string):
    """Fit/score ``rf`` through a residuals plot, then save the plot and model.

    Args:
        rf: Estimator wrapped by the yellowbrick ``ResidualsPlot``.  The
            visualizer's ``fit`` is what trains it here.
            NOTE(review): confirm callers do not expect ``rf`` to be fitted
            beforehand.
        healed_data: Mapping with the keys ``"train_features"``,
            ``"train_target"``, ``"test_features"`` and ``"test_target"``.
        target_string: Target name embedded in both output file names.

    Side effects:
        Reports fit/score failures via ``st.error`` (best effort, execution
        continues), writes ``models/rf_reg_eval_<target>.png``, renders the
        figure in Streamlit and pickles ``rf`` to
        ``models/rf_reg_<target>_rf_reg_model.pkl``.
    """
    visualizer = ResidualsPlot(rf)
    try:
        visualizer.fit(healed_data["train_features"], healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))
    try:
        visualizer.score(healed_data["test_features"], healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))

    # Let the visualizer finalize and save the figure in one step, then hand
    # the actual Figure object to Streamlit.  (plt.savefig() returns None, so
    # passing its result to st.pyplot never forwarded a figure.)
    eval_plot_loc = "models/rf_reg_eval_" + target_string + ".png"
    visualizer.show(outpath=eval_plot_loc)
    st.pyplot(visualizer.fig)

    # save model output; the context manager closes the file even if
    # pickling raises.
    model_output_loc = "models/rf_reg_" + target_string + "_rf_reg_model.pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(rf, model_output)
    print("saving model to: " + model_output_loc)
def linregress(*args):
    """Fit a logistic regression, print its odds ratios and plot residuals.

    Despite the name, the fitted models are logistic: a scikit-learn
    ``LogisticRegression`` and a statsmodels ``Logit`` built from a patsy
    formula.  The statsmodels summary and the exponentiated coefficients are
    printed, then a yellowbrick ``ResidualsPlot`` of a ``Ridge`` model on the
    same data is shown.

    Args:
        *args: ``(dataframe, target, predictor_1, predictor_2, ...)`` where
            ``target`` and the predictors are column names of ``dataframe``.

    Returns:
        None.  All results are printed or plotted.
    """
    # import dependencies
    import numpy as np
    import statsmodels.api as sm
    from patsy import dmatrices
    from sklearn.linear_model import LogisticRegression, Ridge
    from yellowbrick.regressor import ResidualsPlot

    # define arguments
    dataframe = args[0]
    target = args[1]
    xvars = list(args[2:])
    x = dataframe[xvars]
    y = dataframe[target]

    # fit the model
    model = LogisticRegression()
    model.fit(x, y)

    # Generate Fit Statistics
    ## prep data for patsy: "<target> ~ <x1> + <x2> + ..."
    predictors = " + ".join(f"{item}" for item in xvars)
    ind = target.strip('"')

    ## Fit the Model
    Y, X = dmatrices(f"{ind} ~ {predictors}", data=dataframe, return_type="dataframe")
    logit = sm.Logit(Y, X)
    logit_result = logit.fit()

    # Print Log Odds
    print("LOG ODDS")
    print(logit_result.summary())
    print(np.exp(logit_result.params))

    # Plot the Residuals
    print("\n Residual Plot")
    model = Ridge()
    visualizer = ResidualsPlot(model, hist=True)
    y2 = y.values.reshape(-1, 1)
    visualizer.fit(x, y2)  # Fit the training data to the visualizer
    visualizer.score(x, y2)  # Scored on the same data that was used for fitting
    visualizer.show()  # Finalize and render the figure
def residual_plot(lin_model, x_train, y_train, x_test, y_test):
    """Show a 16x12 yellowbrick residuals plot for ``lin_model``.

    Args:
        lin_model: Estimator wrapped by the ``ResidualsPlot`` visualizer.
        x_train, y_train: Data the visualizer is fitted on.
        x_test, y_test: Data the visualizer is scored on.
    """
    # One figure only: the visualizer draws on ``ax``.  Creating a second
    # figure here would leave a blank one open and make it the current
    # figure when the plot is shown.
    fig = plt.figure(figsize=(16, 12))
    ax = fig.add_subplot(111)
    visualizer = ResidualsPlot(lin_model, ax=ax)
    visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(x_test, y_test)  # Evaluate the model on the test data
    visualizer.show()
def plotResidualsAgainstHoldout(df, holdOut_df, task, seed, schema):
    """Save a residuals plot of a linear model evaluated on a hold-out set.

    Feature and target columns are looked up per ``task`` in the module-level
    ``COLUMNS`` and ``TARGETS`` mappings.  A ``LinearRegression`` is fitted on
    ``df`` and scored on ``holdOut_df``; the figure is written to
    ``figs/residuals_<task>_seed<seed>_<schema>.png`` and then closed.
    """
    feature_cols = COLUMNS.get(task)
    X_train = df[feature_cols].values
    X_test = holdOut_df[feature_cols].values

    target_cols = TARGETS.get(task)
    y_train = df[target_cols].values
    y_test = holdOut_df[target_cols].values

    # Instantiate the linear model and visualizer
    plot_title = "Residuals for schema {}".format(schema)
    visualizer = ResidualsPlot(LinearRegression(), title=plot_title)

    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data

    out_file = "figs/residuals_{}_seed{}_{}.png".format(task, seed, schema)
    visualizer.show(outpath=out_file)
    plt.close()
def residuals_plot(self) -> None:
    """Save (and optionally upload) the residuals plot of the trained model.

    The plot shows the difference between the observed target value (y) and
    the predicted value (ŷ), i.e. the prediction error.  It is fitted on the
    training split, scored on the test split and written to
    ``<plots_dir>/residuals_plot_<model_id>.png``.  When ``LOCAL`` is falsy
    the file is also uploaded to ``S3_BUCKET_NAME``.
    """
    plot = ResidualsPlot(self.trained_model)
    plot.fit(self.X_train, self.y_train)  # Fit the training data to the visualizer
    plot.score(self.X_test, self.y_test)  # Evaluate the model on the test data

    local_path = f"{self.plots_dir}/residuals_plot_{self.model_id}.png"
    plot.show(outpath=local_path)

    if not LOCAL:
        s3_key = f'plots/residuals_plot_{self.model_id}.png'
        upload_to_s3(local_path, s3_key, bucket=S3_BUCKET_NAME)

    # Reset the current figure so later plots start clean.
    plt.clf()
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """Fit a ridge regression with built-in CV and report on it.

    Prints the selected alpha, the training score and how many coefficients
    are non-zero / zero.  When ``plot`` is truthy, the sorted coefficients are
    shown as a horizontal bar chart.  A residuals plot (fitted on the training
    data, scored on the test data) is always shown.

    Returns:
        dict with the keys ``"name"``, ``"R squared"``,
        ``"R squared training"``, ``"RMSE"`` and ``"MAE"``.
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))

    coef = pd.Series(reg.coef_, index=X_train.columns)
    n_nonzero = sum(coef != 0)
    n_zero = sum(coef == 0)
    print(
        "Ridge picked "
        + str(n_nonzero)
        + " variables and eliminated the other "
        + str(n_zero)
        + " variables"
    )

    # Extract the feature importance
    imp_coef = coef.sort_values()

    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()

    # Visualizing the regression
    residuals = ResidualsPlot(reg, size=(1080, 720))
    residuals.fit(X_train, y_train)  # Fit the training data to the visualizer
    residuals.score(X_test, y_test)  # Evaluate the model on the test data
    residuals.show()  # Finalize and render the figure

    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)

    # Return metrics
    metrics = {"name": "Ridge Regression"}
    metrics["R squared"] = reg.score(X_test, y_test)
    metrics["R squared training"] = reg.score(X_train, y_train)
    metrics["RMSE"] = rmse(y_test, y_pred)
    metrics["MAE"] = mean_absolute_error(y_test, y_pred)
    return metrics
# Predicting the price of a 150m2 living area
model.predict([[150]])

# Predicting the price of a 200m2 living area
model.predict([[200]])

from sklearn.linear_model import Ridge
from yellowbrick.regressor import ResidualsPlot

# Create a new model and plot the residuals for the regression
model = Ridge()
visualizer = ResidualsPlot(model)

# Fit the training data to the visualizer
visualizer.fit(LIVING_AREA, Selling_price)
visualizer.show()

# Creating a multiple variable regression:
# keep only the rows where both Land_size and Rooms are present.
df1 = df[df['Land_size'].notna() & df['Rooms'].notna()]

X = df1[['Living_area', 'Rooms', 'Land_size', 'Age']]
y = df1['Selling_price']

regr = LinearRegression().fit(X, y)

# The intercept and coefficients of the new model
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# Plot the residuals for the model based on multiple variables
model = Ridge()
# Load data
visualizations = load_dataset(file_name=config.TRAINING_DATA_FILE)

# Set X and y.
# Adjust X based on the feature set to use from config.py
# (TOP5_FEATURES or FEATURES).
X = visualizations[config.TOP5_FEATURES]
y = visualizations[config.TARGET]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Yellowbrick residuals plot: saved as PDF and PNG, then displayed
visualizer = ResidualsPlot(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
for outpath in (
    "visualizations/ResidualsPlotVisualization.pdf",
    "visualizations/ResidualsPlotVisualization.png",
):
    visualizer.show(outpath=outpath)
visualizer.show()

# Yellowbrick prediction error plot: saved as PDF and PNG, then displayed
visualizer = PredictionError(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
for outpath in (
    "visualizations/PredictionErrorVisualization.pdf",
    "visualizations/PredictionErrorVisualization.png",
):
    visualizer.show(outpath=outpath)
visualizer.show()
def show_residuals_plot(model, X_train, y_train, X_valid, y_valid):
    """Display a yellowbrick residuals plot for ``model``.

    The visualizer is fitted on the training data and scored on the
    validation data before the figure is shown.
    """
    viz = ResidualsPlot(model)
    viz.fit(X_train, y_train)    # training residuals
    viz.score(X_valid, y_valid)  # validation residuals
    viz.show()
# Fitting lasso regression and making predictions
lasso = Lasso(alpha=14)
lasso.fit(X_train, y_train)
predictions3 = lasso.predict(X_test)
score3 = lasso.score(X_test, y_test)

# Assessing performance of lasso
mae3 = MAE(y_test, predictions3)
mse3 = MSE(y_test, predictions3)
rmse3 = mse3**(1 / 2)

# Feature importance.
# The coefficients are still estimated on the full data set, but with a
# separate estimator: refitting ``lasso`` itself on (X, y) would overwrite
# the model evaluated above with one that has seen the test rows, and that
# leaked model would then be the one handed to the visualizers below.
lasso_coef = Lasso(alpha=lasso.alpha).fit(X, y).coef_

# Visualizing regression model
visualizer = PredictionError(lasso)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Visualizing regression residuals plot
visualizer2 = ResidualsPlot(lasso)
visualizer2.fit(X_train, y_train)
visualizer2.score(X_test, y_test)
visualizer2.show()

print('Linear Regression Score: ', score1)
print('Ridge Regression Score: ', score2)
print('Lasso Regression Score: ', score3)
# The residuals plot shows how the model is injecting error, the bold \ # horizontal line at residuals = 0 is no error, and any point above or below \ # that line, indicates the magnitude of error. # (https://www.scikit-yb.org/en/latest/quickstart.html#installation) # Load a regression dataset X, y = load_concrete() # Create training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) visualizer = ResidualsPlot(LinearRegression()) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data visualizer.show() # Finalize and render the figure # Xenia: Saving my plots plt.show() fig.set_size_inches(7, 5) plt.savefig("6._Residuals_Plot.png") fig.savefig("6._Residuals_Plot.png") # %% # New Plots with Temperature & Precipitation # Time series of flow values with the x axis range limited fig, ax = plt.subplots() ax.plot(flow_weekly['flow'], label='Streamflow', color='black', linewidth=1) ax.plot(data_weekly['Precipitation'], 'r:',
# NOTE(review): the next two statements look like the tail of a loop whose
# header is outside this view (``i`` is not defined here) -- confirm the
# intended indentation against the full file.
x = models(i[0], i[1])
results.append(x)

#%%
# Collect the per-model results into a table
results = pd.DataFrame(results)

#%%
# MSE per model
sns.barplot(x="Model", y="MSE", data=results)

#%%
from yellowbrick.regressor import ResidualsPlot

# Residuals (with histogram) for the linear regression model
visual_LR = ResidualsPlot(LR, hist=True)
visual_LR.fit(x_train, y_train)
visual_LR.score(x_test, y_test)
visual_LR.show()

#%%
# Residuals (with histogram) for the baseline random forest
visual_RF = ResidualsPlot(RF_Base, hist=True)
visual_RF.fit(x_train, y_train)
visual_RF.score(x_test, y_test)
visual_RF.show()

#%%
# Residuals (with histogram) for the baseline SGD model
visual_SGD = ResidualsPlot(SGD_Base, hist=True)
visual_SGD.fit(x_train, y_train)
visual_SGD.score(x_test, y_test)
visual_SGD.show()

#%%
# Hc. What is the regression equation as per the model?
print("Hc. Regression equation: y=",linReg.coef_,"x + ", linReg.intercept_)
print("Hc. Regression equation: y = -0.06x + 35.17")

# Hd. For your model, does the predicted value for mpg increase or decrease
#     as the displacement increases?
print("Hd. The predicted value for mpg decreases as the displacement increases")

# He. Given a car with a displacement value of 220, what would your model
#     predict its mpg to be?
newframe = pd.DataFrame([[220]])  # Single-row frame holding the value to predict
pred = linReg.predict(newframe)
print("He. Displacement is 220, MPG predicted: ", pred)

# Hf. Display a scatterplot of the actual mpg vs displacement and superimpose
#     the linear regression line.
plt.scatter(X, y, marker=".")  # Actual mpg vs displacement
plt.plot(X, y_pred)            # Regression line
# Formatting
plt.xlabel("Displacement")
plt.ylabel("Actual MPG")
plt.suptitle("Hf. Scatterplot and Regression of Actual MPG vs Displacement\nKuehler_Danielle_FinalProject_Q4")
plt.show()  # Display

# Hg. Plot the residuals
ridge = Ridge()  # From sklearn
visualizer = ResidualsPlot(ridge)
visualizer.fit(X, y)  # Fit to data
# Formatting
plt.suptitle("Hg. Plot of Residuals\nKuehler_Danielle_FinalProject_Q4")
visualizer.show()  # Display
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import LinearRegression

scaler = StandardScaler()
# neigh = KNeighborsRegressor(n_neighbors=5)
# regression_visualizers = [ResidualsPlot(neigh), PredictionError(neigh)]

# Input columns fed to the model
features = [
    "longitude",
    "latitude",
    "peak_load",
    "off-grid",
    "avg_peak_winter",
    "avg_peak_spring",
    "avg_peak_summer",
    "avg_peak_autumn",
    "avg_base_winter",
    "avg_base_spring",
    "avg_base_summer",
    "avg_base_autumn",
]

case_name = "mg_sizing_dataset_with_loc"
# The separator is a regex accepting either ';' or ',', which is why the
# python parsing engine is requested.
df = pd.read_csv(
    "results/" + case_name + ".csv",
    sep=";|,",
    engine="python",
    index_col='index',
)

# Standardize the features (fit and transform in a single step)
X = scaler.fit_transform(df[features])

targets = ["PV", "BAT", "RBAT", "INV", "GEN", "NPV"]
y = df[targets[0]]  # Only the first target is modelled here

# Residuals of a plain linear regression on the training data
model = LinearRegression()
visualizer_residuals = ResidualsPlot(model)
visualizer_residuals.fit(X, y)
visualizer_residuals.show()
def show_residusal(model, train_tup, test_tup):
    """Display a yellowbrick residuals plot for ``model``.

    ``train_tup`` and ``test_tup`` are unpacked as the positional arguments of
    the visualizer's ``fit`` and ``score`` calls respectively.
    """
    plot = ResidualsPlot(model)
    plot.fit(*train_tup)   # training residuals
    plot.score(*test_tup)  # test residuals
    plot.show()