def train_model(rf, healed_data, target_string):
    """Fit and score ``rf`` through a residuals plot, then pickle it.

    Args:
        rf: Estimator wrapped by ``ResidualsPlot``; the visualizer's ``fit``
            call is what trains it (there is no separate ``rf.fit``).
        healed_data: Mapping with the keys ``"train_features"``,
            ``"train_target"``, ``"test_features"`` and ``"test_target"``.
        target_string: Target name, embedded in the output file names.

    Returns:
        None. Side effects: renders the plot in Streamlit, writes
        ``models/rf_reg_eval_<target>.png`` and
        ``models/rf_reg_<target>_rf_reg_model.pkl``.
    """
    visualizer = ResidualsPlot(rf)

    # Fit and score failures are reported in the UI but are not fatal; the
    # function still goes on to save the figure and the model.
    try:
        visualizer.fit(healed_data["train_features"], healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))
    try:
        visualizer.score(healed_data["test_features"], healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))
    visualizer.show()

    # Save first, then hand Streamlit an actual figure. ``plt.savefig``
    # returns None, so passing its result relied on the deprecated
    # "global figure" fallback. ``clear_figure=True`` keeps that fallback's
    # behaviour of clearing the figure after rendering.
    plt.savefig("models/rf_reg_eval_" + target_string + ".png")
    st.pyplot(plt.gcf(), clear_figure=True)

    # Save model output; the context manager closes the file even when
    # pickling fails.
    model_output_loc = "models/rf_reg_" + target_string + "_rf_reg_model.pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(rf, model_output)
    print("saving model to: " + model_output_loc)
    return
def uniRegression(p, xLabel, yLabel):
    """Fit a one-variable linear regression and plot fit and residuals.

    The rows of ``p`` are shuffled, the last ~20% are held out as the test
    set, and a ``LinearRegression`` is trained on the rest.

    Args:
        p: DataFrame containing both columns.
        xLabel: Name of the predictor column.
        yLabel: Name of the response column.

    Side effects:
        Saves the scatter/fit figure to ``image_path.format(image_num)``,
        increments the module-level ``image_num``, prints R2 and MSE, and
        shows a yellowbrick residuals plot.
    """
    global image_num

    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)

    # Split train and test. A positive split index is used on purpose:
    # slicing with the negated test size breaks when that size rounds to 0,
    # because ``[:-0]`` is an empty slice and would empty the training set.
    n_rows = p.shape[0]
    split = n_rows - round(n_rows * 0.2)
    xCol = p[xLabel].values.reshape(-1, 1)
    yCol = p[yLabel].values.reshape(-1, 1)
    X_train, X_test = xCol[:split], xCol[split:]
    y_train, y_test = yCol[:split], yCol[split:]

    # Fit linear regression model
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    # Make predictions
    predicted = lr.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)

    # Plot expected vs. predicted
    plt.scatter(X_test, y_test, color='black')
    plt.plot(X_test, predicted, color='blue', linewidth=2)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    # Save before showing: once ``plt.show()`` returns, the figure may
    # already be closed and ``savefig`` would write a blank image.
    plt.savefig(image_path.format(image_num), bbox_inches='tight')
    plt.show()
    image_num += 1

    print("R2 = ", r2)
    print("MSE = ", mse)

    # Plot residuals
    visualizer = ResidualsPlot(lr)
    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
def residuals():
    """Draw a Ridge residuals plot on the concrete data and save it.

    The data returned by ``load_concrete()`` is split 80/20, the visualizer
    is drawn on a fresh figure from ``newfig()``, and the result is handed
    to ``savefig`` under the name ``"residuals"``.
    """
    features, target = load_concrete()
    train_X, test_X, train_y, test_y = tts(features, target, test_size=0.2)

    plot = ResidualsPlot(Ridge(), ax=newfig())
    plot.fit(train_X, train_y)
    plot.score(test_X, test_y)

    savefig(plot, "residuals")
def residual_plot(model_properties=None, output_path=None):
    '''
    Save the residual plot of the trained model as ``residual_plot.png``.

    Args:
        model_properties: Mapping with the keys ``estimator``, ``X_train``,
            ``y_train``, ``X_validation``, ``y_validation``, ``config_map``,
            ``X_scaler`` and ``y_scaler``. ``config_map`` must provide
            ``scale_columns`` and ``label``.
        output_path: Directory in which the image is written.

    Returns:
        None.

    Raises:
        ValueError: If either argument is missing.
    '''
    if model_properties is None or output_path is None:
        raise ValueError('Need Model properties and Output path as arguments !')

    estimator = model_properties['estimator']
    # Work on copies: the scaling below assigns columns in place, and doing
    # that on the caller's frames would re-scale already scaled data on a
    # second call (and silently alter data other code may still be using).
    X_train = model_properties['X_train'].copy()
    y_train = model_properties['y_train'].copy()
    X_validation = model_properties['X_validation'].copy()
    y_validation = model_properties['y_validation'].copy()
    config_map = model_properties['config_map']
    X_scaler = model_properties['X_scaler']
    y_scaler = model_properties['y_scaler']

    scale_columns = config_map['scale_columns']
    label = config_map['label']

    # Apply the already-fitted scalers (transform only, no refit).
    X_train[scale_columns] = X_scaler.transform(X_train[scale_columns])
    y_train[label] = y_scaler.transform(y_train[label])
    X_validation[scale_columns] = X_scaler.transform(X_validation[scale_columns])
    y_validation[label] = y_scaler.transform(y_validation[label])

    visualizer = ResidualsPlot(estimator)
    visualizer.fit(X_train.values, y_train.values)
    visualizer.score(X_validation.values, y_validation.values)
    visualizer.poof(outpath=os.path.join(output_path, 'residual_plot.png'))

    return None
def residual_plot(lin_model, x_train, y_train, x_test, y_test):
    """Show a 16x12 residuals plot for ``lin_model``.

    Args:
        lin_model: Regressor wrapped by the yellowbrick visualizer.
        x_train, y_train: Data the visualizer is fitted on.
        x_test, y_test: Data the visualizer is scored on.
    """
    fig = plt.figure(figsize=(16, 12))
    ax = fig.add_subplot(111)
    # The visualizer draws on ``ax``. No second figure is created after this
    # point: an extra ``plt.figure`` would only add an empty window and
    # become the current figure instead of the one being drawn on.
    visualizer = ResidualsPlot(lin_model, ax=ax)

    visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(x_test, y_test)  # Evaluate the model on the test data
    visualizer.show()
def plot_residuals(X, y, model, outpath="images/residuals.png", **kwargs):
    """Write a residuals plot for ``model`` to ``outpath``.

    ``X`` and ``y`` are split 80/20; the visualizer is fitted on the larger
    part and scored on the smaller one. Extra keyword arguments are passed
    straight to ``ResidualsPlot``.
    """
    splits = train_test_split(X, y, test_size=0.2)
    train_X, test_X, train_y, test_y = splits

    axes = plt.subplots()[1]
    plot = ResidualsPlot(model, ax=axes, **kwargs)
    plot.fit(train_X, train_y)
    plot.score(test_X, test_y)
    plot.poof(outpath=outpath)
def plot_residuals(X, y, model, outpath="images/residuals.png", **kwargs):
    """Render the residuals of ``model`` on a random 80/20 split.

    Args:
        X: Feature matrix.
        y: Target values.
        model: Estimator handed to ``ResidualsPlot``.
        outpath: Destination of the rendered image.
        **kwargs: Forwarded unchanged to ``ResidualsPlot``.
    """
    fit_X, eval_X, fit_y, eval_y = train_test_split(X, y, test_size=0.2)

    # Only the axes of the new figure are needed.
    _figure, axis = plt.subplots()

    residuals_viz = ResidualsPlot(model, ax=axis, **kwargs)
    residuals_viz.fit(fit_X, fit_y)
    residuals_viz.score(eval_X, eval_y)
    residuals_viz.poof(outpath=outpath)
def linregress(*args):
    """Fit a logit model, print its summary, and show a Ridge residuals plot.

    Positional arguments:
        args[0]: DataFrame holding the data.
        args[1]: Name of the outcome column (surrounding double quotes are
            stripped before it is used in the patsy formula).
        args[2:]: Names of the predictor columns.

    Prints the statsmodels Logit summary and the exponentiated coefficients,
    then renders a yellowbrick residuals plot (with histogram) of a Ridge
    model fitted and scored on the same data.
    """
    # import dependencies
    import numpy as np
    import statsmodels.api as sm
    from patsy import dmatrices
    from sklearn.linear_model import LogisticRegression, Ridge
    from yellowbrick.regressor import ResidualsPlot

    # define arguments
    dataframe = args[0]
    outcome = args[1]
    xvars = list(args[2:])
    x = dataframe[xvars]
    y = dataframe[outcome]

    # fit the model
    # NOTE(review): the fitted sklearn model is never read afterwards; the
    # statistics below come from statsmodels. Kept so that fit-time errors
    # and warnings still surface as before.
    LogisticRegression().fit(x, y)

    # Generate Fit Statistics
    ## prep data for patsy: "<outcome> ~ <x1> + <x2> + ..."
    predictors = " + ".join(f"{item}" for item in xvars)
    ind = outcome.strip('"')

    ## Fit the Model
    Y, X = dmatrices(f"{ind} ~ {predictors}", data=dataframe, return_type="dataframe")
    logit_result = sm.Logit(Y, X).fit()

    # Print Log Odds
    print("LOG ODDS")
    print(logit_result.summary())
    print(np.exp(logit_result.params))

    # Plot the Residuals
    print("\n Residual Plot")
    visualizer = ResidualsPlot(Ridge(), hist=True)
    y2 = y.values.reshape(-1, 1)
    visualizer.fit(x, y2)    # Fit the training data to the visualizer
    visualizer.score(x, y2)  # Scored on the same data it was fitted on
    visualizer.show()        # Finalize and render the figure
def residuals_plot(model, X_test, y_test, road):
    """Score an already trained model and render its residuals plot.

    Args:
        model: The trained model.
        X_test: Test set features.
        y_test: Test set labels.
        road: Passed as the first positional argument to ``poof``
            (presumably the output path -- confirm against the caller).
    """
    plot = ResidualsPlot(model)
    # No fit step here: the model is expected to be trained already.
    plot.score(X_test, y_test)
    plot.poof(road)
def log_residuals_chart(regressor, X_train, X_test, y_train, y_test, experiment=None):
    """Log residuals chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            exp = neptune.create_experiment()

            log_residuals_chart(rfr, X_train, X_test, y_train, y_test, experiment=exp)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'
    exp = _validate_experiment(experiment)

    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Residuals Plot')
    except Exception as e:
        # Logging charts is best-effort: report the problem and carry on.
        print('Did not log residuals chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too, so a failing chart does
        # not leave an open matplotlib figure behind.
        if fig is not None:
            plt.close(fig)
def regression_sanity_check(model, X_train, X_test, y_train, y_test):
    """Draw residuals and prediction error side by side for ``model``.

    The left axes hold the ``ResidualsPlot``, the right axes the
    ``PredictionError`` plot; each visualizer is fitted on the training data
    and scored on the test data.
    """
    _fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
    left_ax, right_ax = axes

    plt.sca(left_ax)
    residuals_viz = ResidualsPlot(model, ax=left_ax)
    residuals_viz.fit(X_train, y_train)
    residuals_viz.score(X_test, y_test)

    plt.sca(right_ax)
    error_viz = PredictionError(model, ax=right_ax)
    error_viz.fit(X_train, y_train)
    error_viz.score(X_test, y_test)

    # Finalize the first plot explicitly; ``poof`` on the second one renders
    # the shared figure.
    residuals_viz.finalize()
    error_viz.poof()
def visualize_residuals_plot(self, model_info):
    """Show a residuals plot for the model described by ``model_info``.

    ``model_info`` must provide the keys ``'model'``, ``'X_train'``,
    ``'X_test'``, ``'Y_train'`` and ``'Y_test'``.
    """
    # Read every entry up front, in the same order as the keys listed above.
    estimator, train_X, test_X, train_Y, test_Y = (
        model_info[key]
        for key in ('model', 'X_train', 'X_test', 'Y_train', 'Y_test')
    )

    plot = ResidualsPlot(estimator)
    plot.fit(train_X, train_Y)   # train through the visualizer
    plot.score(test_X, test_Y)   # evaluate on the test data
    plot.poof()                  # render
def test_for_homoscedasticity(X_train, y_train, X_test, y_test):
    """
    Plot the residuals of a linear regression to check for homoscedasticity.

    Arguments:
        X_train (dataframe): examples in the training set
        y_train (dataframe): target in the training set
        X_test (dataframe): examples in the test set
        y_test (dataframe): target in the test set
    """
    plot = ResidualsPlot(LinearRegression())

    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)

    # When reading the figure: there should be no clear pattern.
    plot.poof()
def create_residuals_chart(regressor, X_train, X_test, y_train, y_test):
    """Create residuals chart.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        regressor (:obj:`regressor`):
            | Fitted sklearn regressor object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The regression target for training
        y_test (:obj:`ndarray`):
            | The regression target for testing

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``.
        ``None`` when the chart could not be created.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()
            rfr.fit(X_train, y_train)

            run = neptune.init(project='my_workspace/my_project')
            run['visuals/residuals'] = npt_utils.create_residuals_chart(rfr, X_train, X_test, y_train, y_test)
    """
    assert is_regressor(regressor), 'regressor should be sklearn regressor.'

    chart = None
    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = ResidualsPlot(regressor, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        chart = neptune.types.File.as_image(fig)
    except Exception as e:
        # Chart creation is best-effort: report the problem, return None.
        print('Did not log residuals chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too, so a failing chart does
        # not leave an open matplotlib figure behind.
        if fig is not None:
            plt.close(fig)

    return chart
def plotResidualsAgainstHoldout(df, holdOut_df, task, seed, schema):
    """Save a linear-regression residuals plot: train on ``df``, score on ``holdOut_df``.

    Feature and target columns are looked up per ``task`` in the module-level
    ``COLUMNS`` and ``TARGETS`` mappings. The image is written to
    ``figs/residuals_<task>_seed<seed>_<schema>.png`` and the current figure
    is closed afterwards.
    """
    feature_cols = COLUMNS.get(task)
    target_cols = TARGETS.get(task)

    train_X = df[feature_cols].values
    test_X = holdOut_df[feature_cols].values
    train_y = df[target_cols].values
    test_y = holdOut_df[target_cols].values

    # Linear model wrapped in the visualizer
    plot = ResidualsPlot(
        LinearRegression(),
        title="Residuals for schema {}".format(schema),
    )
    plot.fit(train_X, train_y)
    plot.score(test_X, test_y)  # evaluate on the hold-out data

    target_file = "figs/residuals_{}_seed{}_{}.png".format(task, seed, schema)
    plot.show(outpath=target_file)
    plt.close()
def residuals(ax):
    """Build a fitted and scored RidgeCV residuals visualizer on ``ax``.

    The concrete data is loaded through ``load_data`` with a train/test
    split. The visualizer is returned without being rendered, so the caller
    decides when to finalize or show it.
    """
    from sklearn.linear_model import RidgeCV
    from yellowbrick.regressor import ResidualsPlot

    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]

    train_X, test_X, train_y, test_y = load_data(
        'concrete', cols=feature_names, target='strength', tts=True
    )

    plot = ResidualsPlot(RidgeCV(), ax=ax)
    plot.fit(train_X, train_y)
    plot.score(test_X, test_y)
    return plot
def residuals_plot(self) -> None:
    """Plot the residuals of the trained model.

    A residual is the difference between the observed value of the target
    variable (y) and the predicted value (ŷ), i.e. the error of the
    prediction. The image is written below ``self.plots_dir``; unless
    ``LOCAL`` is truthy it is also uploaded to S3. The current matplotlib
    figure is cleared at the end.
    """
    plot = ResidualsPlot(self.trained_model)
    plot.fit(self.X_train, self.y_train)    # fit on the training data
    plot.score(self.X_test, self.y_test)    # evaluate on the test data

    image_file = f"{self.plots_dir}/residuals_plot_{self.model_id}.png"
    plot.show(outpath=image_file)

    if not LOCAL:
        s3_key = f'plots/residuals_plot_{self.model_id}.png'
        upload_to_s3(image_file, s3_key, bucket=S3_BUCKET_NAME)

    plt.clf()
def visualize_pred_residuals(X_train, X_test, y_train, y_test):
    """Fit Ridge(alpha=0.05), show its residuals plot and print diagnostics.

    Printed, in order: element 2 of ``stats.linregress(predictions, y_test)``
    (the correlation coefficient in scipy's result), then the train and test
    scores of a 10-fold cross-validation run on the *test* data.
    """
    ridge = linear_model.Ridge(alpha=0.05)
    trained = ridge.fit(X_train, y_train)
    plot = ResidualsPlot(trained, size=(1080, 720))

    # Linear relationship between predictions and the true test targets.
    predictions = trained.predict(X_test)
    regression = stats.linregress(predictions, y_test)
    print(regression[2])

    plot.fit(X_train, y_train)
    plot.score(X_test, y_test)
    plot.poof()

    cv_results = model_selection.cross_validate(
        ridge, X_test, y_test, cv=10, return_train_score=True
    )
    print('Training scores:', cv_results['train_score'], '\n')
    print('Testing scores:', cv_results['test_score'])
def my_residual_plot(X_train, y_train, X_test, y_test):
    """Show a customised (Polish-labelled) linear-regression residuals plot.

    The yellowbrick visualizer is only used to draw the points (no
    histogram); title, axis labels, the red zero line and the legend are
    added through pyplot directly before the figure is shown.
    """
    plt.figure(figsize=(20, 5))
    plt.grid(True)

    plot = ResidualsPlot(LinearRegression(), hist=False)
    plot.fit(X_train, y_train)   # fit on the training data
    plot.score(X_test, y_test)   # evaluate on the test data

    # x positions of the zero line: from 1000 up to the largest test target,
    # in steps of 500.
    line_x = np.arange(1000, max(y_test.values) + 1, 500)
    line_y = np.zeros(len(line_x))

    plt.title("Wykres rezyduów", fontsize=25)
    plt.xlabel("Ceny mieszkań", fontsize=15)
    plt.ylabel("Rezydua", fontsize=15)
    plt.plot(line_x, line_y, "r")
    plt.legend()
    plt.show()
def generate_ordinal_diagnostics(x, y, current_best_model, label_type, diagnostic_image_path):
    """Cross-validate the best model, write diagnostic plots, return metrics.

    Args:
        x: Feature matrix (converted with ``np.array``).
        y: Targets (converted with ``np.array``).
        current_best_model: Sequence whose first element is the estimator.
        label_type: Not used by this function.
        diagnostic_image_path: Directory receiving ``residuals_plot.png``,
            ``prediction_error.png``, ``pca_2.png`` and ``pca_3.png``.

    Returns:
        dict with the keys ``"mse"``, ``"r2"``, ``"mae"``, ``"evs"`` and
        ``"rmse"``, computed over the pooled out-of-fold predictions of a
        shuffled 10-fold split. Note that ``"mae"`` holds
        ``median_absolute_error``.
    """
    x = np.array(x)
    y = np.array(y)
    estimator = current_best_model[0]

    # Pool (true, predicted) pairs over all folds.
    guesses = []
    for train_index, test_index in KFold(n_splits=10, shuffle=True).split(x):
        model = estimator.fit(x[train_index], y[train_index])
        guesses.extend(
            zip(y[test_index].tolist(), model.predict(x[test_index]).tolist())
        )

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # NOTE(review): the source this was recovered from had lost its
    # indentation; both regressor visualizers are assumed to sit under the
    # VotingClassifier guard -- confirm against the original.
    if "VotingClassifier" not in str(estimator.__class__):
        visualizer = ResidualsPlot(estimator)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path + "/residuals_plot.png")
        plt.clf()

        visualizer = PredictionError(estimator)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.poof(outpath=diagnostic_image_path + "/prediction_error.png")
        plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=2)
    visualizer.fit_transform(x, y)
    print(diagnostic_image_path + "/pca_2.png")
    visualizer.poof(outpath=diagnostic_image_path + "/pca_2.png")
    plt.clf()

    visualizer = PCADecomposition(scale=True, center=False, col=y, proj_dim=3)
    visualizer.fit_transform(x, y)
    visualizer.poof(outpath=diagnostic_image_path + "/pca_3.png")
    plt.clf()

    # Build the (true, predicted) matrix once instead of once per metric,
    # and reuse the MSE for the RMSE.
    truth_and_pred = np.array(guesses).transpose()
    mse = mean_squared_error(*truth_and_pred)
    return {
        "mse": mse,
        "r2": r2_score(*truth_and_pred),
        "mae": median_absolute_error(*truth_and_pred),
        "evs": explained_variance_score(*truth_and_pred),
        "rmse": np.sqrt(mse),
    }
def showResiduals():
    """Show a Ridge residuals plot for the concrete data set.

    Loads the data through ``load_data('concrete')``, splits it 80/20, fits
    the visualizer on the training part and scores it on the test part.
    """
    # Load the data
    df = load_data('concrete')
    feature_names = [
        'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
    ]
    target_name = 'strength'

    # Get the X and y data from the DataFrame. ``.values`` is used because
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0.
    X = df[feature_names].values
    y = df[target_name].values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the linear model and visualizer
    visualizer = ResidualsPlot(Ridge())

    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.poof()                 # Draw/show/poof the data
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Perform a ridge regression with built-in CV and report on it.

    Prints the selected alpha, the training score and how many coefficients
    are non-zero, optionally plots the sorted coefficients as feature
    importance (``plot`` truthy), always shows a residuals plot, and returns
    a dict of metrics (name, test/training R squared, RMSE, MAE).
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))

    coef = pd.Series(reg.coef_, index=X_train.columns)
    kept_count = sum(coef != 0)
    dropped_count = sum(coef == 0)
    print(
        "Ridge picked "
        + str(kept_count)
        + " variables and eliminated the other "
        + str(dropped_count)
        + " variables"
    )

    # Feature importance = coefficients in ascending order
    imp_coef = coef.sort_values()
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()

    # Visualizing the regression
    residuals_viz = ResidualsPlot(reg, size=(1080, 720))
    residuals_viz.fit(X_train, y_train)   # fit on the training data
    residuals_viz.score(X_test, y_test)   # evaluate on the test data
    residuals_viz.show()                  # finalize and render the figure

    # Predictions on the test data feed the error metrics
    y_pred = reg.predict(X_test)

    metrics = {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    return metrics
class PrincipalComponentRegressor(Regressor):
    """Linear regression fitted on PCA-projected features."""

    def __init__(self, n_components):
        """Store the number of principal components; PCA is built in ``fit``."""
        super().__init__()
        self.n_components = n_components
        self.regressor = LinearRegression()
        self.pca = None

    def fit(self, x_train, y_train, standardize=False):
        """Project ``x_train`` with PCA and fit the linear regression on it.

        Returns:
            Tuple ``(intercept, coefficients, self.p, training score)``.
            ``self.p`` is presumably populated by ``self._inference()`` --
            confirm in the base class.
        """
        # NOTE(review): ``standardize`` is accepted but not used or stored
        # here; ``residual_plot`` reads ``self.standardize``, which must
        # therefore come from the base class -- confirm.
        self.pca = PCA(self.n_components)
        self.x_train = self.pca.fit_transform(x_train)
        self.y_train = y_train
        self.regressor.fit(self.x_train, self.y_train)
        self._inference()
        return (
            self.regressor.intercept_,
            self.regressor.coef_,
            self.p,
            self.regressor.score(self.x_train, y_train),
        )

    def predict(self, x_test):
        """Predict for ``x_test``, projecting it through the fitted PCA first."""
        try:
            x_test_transform = self.pca.transform(x_test)
        except ValueError:
            # The projection was rejected; use the input as given
            # (presumably it is already in component space).
            x_test_transform = x_test
        prediction = self.regressor.predict(x_test_transform)
        return prediction

    def residual_plot(self, x_test=None, y_test=None):
        """Render the residuals plot; test data is optional.

        Without ``x_test``/``y_test`` only the training residuals are drawn.
        """
        # Only scale when test data was supplied: the default ``x_test=None``
        # would otherwise be passed to the scaler and crash.
        if self.standardize and x_test is not None:
            x_test = self.standardizescaler.transform(x_test)
        try:
            self.residual_visualizer = ResidualsPlot(self.regressor)
        except yellowbrick.exceptions.YellowbrickTypeError:
            # Fall back to the wrapped estimator when the outer object is
            # not accepted as a regressor.
            self.residual_visualizer = ResidualsPlot(self.regressor.regressor)
        self.residual_visualizer.fit(self.x_train, self.y_train)
        if x_test is not None and y_test is not None:
            try:
                self.residual_visualizer.score(x_test, y_test)
            except ValueError:
                # Scoring failed on the raw features; retry after projecting
                # them through PCA.
                x_test = self.pca.transform(x_test)
                self.residual_visualizer.score(x_test, y_test)
        self.residual_visualizer.poof()
class RandForestRegressor(Regressor):
    """Random-forest regressor with optional feature standardisation."""

    def __init__(self):
        """Create the underlying ``RandomForestRegressor``."""
        super().__init__()
        self.regressor = RandomForestRegressor()

    def fit(self, x_train, y_train, standardize=False):
        """Fit the forest, optionally standardising ``x_train`` first.

        Returns:
            ``self.rsquared`` (presumably set by ``self._inference()`` --
            confirm in the base class).
        """
        self.standardize = standardize
        if self.standardize:
            self.standardizescaler.fit(x_train)
            x_train = self.standardizescaler.transform(x_train)
        self.x_train = x_train
        self.y_train = y_train
        # The target is flattened before it is handed to the forest.
        self.regressor.fit(self.x_train, self.y_train.ravel())
        self._inference()
        return self.rsquared

    def residual_plot(self, x_test=None, y_test=None):
        """Render the residuals plot; test data is optional.

        Without ``x_test``/``y_test`` only the training residuals are drawn.
        """
        # Only scale when test data was supplied: the default ``x_test=None``
        # would otherwise be passed to the scaler and crash.
        if self.standardize and x_test is not None:
            x_test = self.standardizescaler.transform(x_test)
        try:
            self.residual_visualizer = ResidualsPlot(self.regressor)
        except yellowbrick.exceptions.YellowbrickTypeError:
            # Fall back to the wrapped estimator when the outer object is
            # not accepted as a regressor.
            self.residual_visualizer = ResidualsPlot(self.regressor.regressor)
        y_train = self.y_train.ravel()
        self.residual_visualizer.fit(self.x_train, y_train)
        if x_test is not None and y_test is not None:
            y_test = y_test.ravel()
            self.residual_visualizer.score(x_test, y_test)
        self.residual_visualizer.poof()

    def predict(self, x_test):
        """Predict for ``x_test`` and return the result as a column vector."""
        if self.standardize:
            x_test = self.standardizescaler.transform(x_test)
        return self.regressor.predict(x_test).reshape(-1, 1)
from yellowbrick.regressor import ResidualsPlot
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Mean squared error of the predictions made earlier in the script.
mse = np.mean((pred - y_test) ** 2)
mse

# Score of the previously fitted ridge model on the test data.
ridgeReg.score(X_test, y_test)

# Residuals plot for a fresh Ridge model: the visualizer trains it on the
# training data and evaluates it on the test data.
ridge = Ridge()
visualizer = ResidualsPlot(ridge)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

# Apply different algorithms on the same X_train, X_test, y_train, y_test.
# K-NN: 5 neighbours, Minkowski metric with p=2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

# Predict the test set results; ``confusion_matrix`` is imported above for
# the code that follows.
pred_y = classifier.predict(X_test)
# Load the training data.
visualizations = load_dataset(file_name=config.TRAINING_DATA_FILE)

# Features and target; switch the feature list in config.py
# (TOP5_FEATURES or FEATURES) to change what X contains.
X = visualizations[config.TOP5_FEATURES]
y = visualizations[config.TARGET]

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Yellowbrick residuals plot: written as PDF and PNG, then shown.
visualizer = ResidualsPlot(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.pdf")
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.png")
visualizer.show()

# Yellowbrick prediction error plot: written as PDF and PNG, then shown.
visualizer = PredictionError(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/PredictionErrorVisualization.pdf")
visualizer.show(outpath="visualizations/PredictionErrorVisualization.png")
visualizer.show()
from yellowbrick.regressor import PredictionError, ResidualsPlot
from sklearn.pipeline import make_pipeline

# Baseline linear regression and its score on the test data.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

### Yellowbrick

## Residuals-vs-fitted plot
# Run the following together
lr_yb = ResidualsPlot(lr, hist=True)
lr_yb.fit(X_train, y_train)
lr_yb.score(X_test, y_test)
lr_yb.poof()

## Prediction Error plot
lr_yb = PredictionError(lr, hist=True)
lr_yb.fit(X_train, y_train)
lr_yb.score(X_test, y_test)
lr_yb.poof()

################ Polynomial/Interactions ################
# ``make_pipeline`` is imported above for the section that follows.
def show_residuals_plot(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training data and show its residuals.

    The visualizer is fitted on ``X_train``/``y_train`` and scored on the
    validation data before the figure is shown.
    """
    plot = ResidualsPlot(model)
    for step, features, target in (
        (plot.fit, X_train, y_train),
        (plot.score, X_valid, y_valid),
    ):
        step(features, target)
    plot.show()
def binary_class(self, type, target, duplicated, sep, exclude, max_runtime_secs):
    """Build the binary-classification report for ``self.dfo``.

    The method runs the whole pipeline in order: data-set summary, OLS
    regression, class balance, correlation, VIF, residual plots, H2O
    AutoML, individually trained H2O models plus a stacked ensemble,
    per-model performance tables, and finally ``self.write_report``.
    Progress is reported through ``self.gstep`` and every artefact is
    pushed into the report through ``self.insert_text`` /
    ``self.write_image``.

    Args:
        type: Not used by this method.
        target: Name of the target column in ``self.dfo``.
        duplicated: When truthy, duplicate rows are dropped first.
        exclude: ``'none'``, or the column label(s) to drop up front.
        sep: Not used by this method.
        max_runtime_secs: Time budget forwarded to AutoML and to each
            individually trained H2O estimator.

    Returns:
        None.
    """
    img = plt.figure()
    self.write_image(img, 'blank', width=600, height=500)

    self.gstep(0, "Reading Dataset")
    self.dfo.columns = [c.replace(' ', '_') for c in self.dfo.columns]

    self.gstep(1, "Verify if duplicated")
    self.insert_text(
        "shape",
        str(self.dfo.shape[0]) + ' / ' + str(self.dfo.shape[1]))
    self.get_classes(self.dfo, target)
    self.insert_text("nclasses", str(self.nclasses))
    self.insert_text("allclasses", str(self.allclasses))

    shape_before = self.dfo.shape[0]
    if duplicated:
        self.dfo = self.dfo.drop_duplicates(self.dfo.columns)
    shape_after = self.dfo.shape[0]
    if shape_before == shape_after:
        self.insert_text("duplicated", "none")
    else:
        # Number of rows removed (before - after), so it is positive.
        self.insert_text("duplicated", str(shape_before - shape_after))

    if exclude != 'none':
        self.dfo.drop(columns=exclude, inplace=True)

    self.gstep(1, "Detecting hi frequency features")
    exclude = self.hi_freq(self.dfo)
    self.dfo.drop(columns=exclude['Feature'], inplace=True)
    hi_freq = self.w_table(data=exclude,
                           border=0,
                           align='left',
                           collapse='collapse',
                           color='black',
                           foot=False)
    self.insert_text("excluded", hi_freq)

    self.gstep(1, "Encoding as sort_by_response")
    self.dfo_encode = self.encode(self.dfo.copy())

    self.gstep(1, "Basic Informations")
    # One row per column: name, non-null count and dtype. The rows are
    # collected in a list because DataFrame.append no longer exists in
    # pandas 2.x.
    total_rows = self.dfo.shape[0]
    info_rows = [
        {
            'column': column,
            'not_null': int(total_rows - int(self.dfo[column].isna().sum())),
            'dtype': self.dfo[column].dtypes,
        }
        for column in self.dfo.columns
    ]
    df_info = pd.DataFrame(info_rows, columns=['column', 'not_null', 'dtype'])
    # 'percent' is the fraction of missing values, rounded to 4 decimals.
    df_info['percent'] = df_info['not_null'].apply(
        lambda x: float("{:.4f}".format(1 - (x / total_rows))))
    info_dataset = self.w_table(data=df_info,
                                border=0,
                                align='left',
                                collapse='collapse',
                                color='black',
                                foot=False)
    self.insert_text("info_dataset", info_dataset)

    self.gstep(1, "Computing Regression")
    Y = self.dfo_encode[target]
    dfo_num = self.dfo_encode[self.dfo_encode._get_numeric_data().columns]
    X = dfo_num.drop(columns=[target])
    # Create the train and test splits (reused by the residual plots below).
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=42)
    cols = X.columns
    formule = " + ".join(map(str, cols))
    formule = target + " ~ " + formule
    reg = smf.ols(formule, data=dfo_num)
    res = reg.fit()
    self.insert_text('regression', str(res.summary()))

    self.gstep(1, "Unbalance Classes")
    temp = self.dfo[target].value_counts()
    df = pd.DataFrame({target: temp.index, 'values': temp.values})
    plt.figure(figsize=(6, 6))
    plt.title('Data Set - target value - data unbalance\n (' + target + ')')
    sns.set_color_codes("pastel")
    sns.barplot(x=target, y="values", data=df)
    locs, labels = plt.xticks()
    self.write_image(plt, "unbalance", width=500, height=350, crop=True)

    self.gstep(1, "Correlation")
    plt.clf()
    corr = self.dfo_encode.corr()
    # Mask the upper triangle (diagonal included) so each pair shows once.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    plt.figure(figsize=(8, 8))
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr,
                mask=mask,
                cmap=cmap,
                vmax=1,
                vmin=-1,
                center=0,
                annot=True,
                square=True,
                linewidths=1.5,
                cbar_kws={"shrink": .5})
    self.write_image(plt, "corr", width=0, height=0, crop=True)

    self.gstep(1, "Detecting Multicollinearity with VIF")
    X = self.dfo_encode.drop(target, axis=1)
    X = X[X._get_numeric_data().columns]
    X = X.fillna(0)
    X = X.dropna()
    vif = [
        variance_inflation_factor(X.values, i) for i in range(X.shape[1])
    ]
    cols = X.columns
    cols = cols[cols != target]
    df_m = pd.DataFrame({'cols': cols, 'vif': vif})
    df_m['significant'] = df_m['vif'].apply(self.parse_values)
    m_vif = self.w_table(data=df_m,
                         border=0,
                         align='left',
                         collapse='collapse',
                         color='black',
                         foot=False)
    self.insert_text("vif", str(m_vif))

    # Dropdown of columns plus the JavaScript branches that show each
    # column's describe() table. Option values start at 2.
    i = 2
    text = ''
    text2 = ''
    for column in self.dfo.columns:
        feature = self.dfo[column].describe()
        text = text + '<option value="' + str(
            i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue == '" + str(
            i
        ) + "') {\n\t\t\t\t\t\t\t\tdivElement.innerHTML = '" + pd.DataFrame(
            feature).to_html().replace('\n', '') + "';\n\t\t\t\t\t\t\t\t"
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('vif_desc_option', text)
    self.insert_text('vif_desc_table', text2)

    self.gstep(1, "Residual Analisys")
    plt.clf()
    model = Ridge()
    # First plot: residuals with a Q-Q plot.
    visualizer = ResidualsPlot(model, hist=False, qqplot=True)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    self.write_image(plt, "residual1", width=500, height=350, crop=True)
    plt.clf()
    # Second plot: residuals with a histogram.
    visualizer = ResidualsPlot(model, hist=True, qqplot=False)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    self.write_image(plt, "residual2", width=500, height=350, crop=True)

    self.gstep(1, "Initializing H2O")
    h2o.init()

    self.gstep(1, "Parsing Data Frame")
    df = h2o.H2OFrame(self.dfo_encode)

    self.gstep(1, "Trainning Auto Machine Learning")
    train, valid, test = df.split_frame(ratios=[0.7, 0.2], seed=1234)
    x = train.columns
    y = target
    x.remove(y)
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()
    aml = H2OAutoML(max_models=20,
                    max_runtime_secs=max_runtime_secs,
                    seed=1,
                    include_algos=[
                        "GLM", "DeepLearning", "DRF", "xGBoost",
                        "StackedEnsemble"
                    ],
                    balance_classes=True)
    aml.train(x=x, y=y, training_frame=train)
    lb = h2o.automl.get_leaderboard(aml, extra_columns='ALL')
    lb = lb.as_data_frame()
    lb = lb.drop(columns=['rmse', 'mse', 'predict_time_per_row_ms'])
    text = self.w_table(lb)
    self.insert_text('auto_ml_results', text)
    self.write_image(aml.varimp_heatmap(),
                     'var_imp_model',
                     width=450,
                     height=400,
                     crop=True)

    self.gstep(1, "AML - Partial Dependence")
    # One partial-dependence image per column; option values start at 101.
    i = 101
    text = ''
    text2 = ''
    for column in tqdm(self.dfo.columns):
        text = text + '<option value="' + str(
            i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue2 == '" + str(
            i
        ) + "'){\n\t\t\t\t\t\t\t\tdivElement2.innerHTML = '<img src=\"images/img_aml_pd_" + str(
            i) + ".png\">';\n\t\t\t\t\t\t\t\t"
        self.write_image(aml.pd_multi_plot(valid, column),
                         'aml_pd_' + str(i),
                         width=600,
                         height=500)
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('aml_pd_option', text)
    self.insert_text('aml_pd_image', text2)

    self.gstep(1, "Trainning (GLM) Gradient Linear Model to Ensemble")
    nfolds = 5
    family = "binomial"
    amlr_glm = H2OGeneralizedLinearEstimator(
        family=family,
        nfolds=nfolds,
        lambda_=0,
        max_runtime_secs=max_runtime_secs,
        balance_classes=True,
        fold_assignment="Modulo",
        compute_p_values=True,
        keep_cross_validation_predictions=True,
        remove_collinear_columns=True)
    amlr_glm.train(x, y, training_frame=train)

    self.gstep(1, "Trainning (DRF) Dynamic Random Forest to Ensemble")
    amlr_rf = H2ORandomForestEstimator(
        ntrees=50,
        nfolds=nfolds,
        fold_assignment="Modulo",
        max_runtime_secs=max_runtime_secs,
        balance_classes=True,
        keep_cross_validation_predictions=True,
        seed=1)
    amlr_rf.train(x=x, y=y, training_frame=train)

    self.gstep(
        1, "Trainning (GBM) Gradient Boost Estimator Model to Ensemble")
    amlr_gbm = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        seed=1111,
        balance_classes=True,
        fold_assignment="Modulo",
        max_runtime_secs=max_runtime_secs,
        keep_cross_validation_predictions=True)
    amlr_gbm.train(x=x, y=y, training_frame=train)

    self.gstep(1, "Trainning xGBoost Model to Ensemble")
    amlr_xgb = H2OXGBoostEstimator(booster='dart',
                                   nfolds=nfolds,
                                   normalize_type="tree",
                                   fold_assignment="Modulo",
                                   max_runtime_secs=max_runtime_secs,
                                   keep_cross_validation_predictions=True,
                                   seed=1234)
    amlr_xgb.train(x=x, y=y, training_frame=train, validation_frame=valid)

    self.gstep(1, "Trainning Deep Learning Model to Ensemble")
    family = "bernoulli"
    dl_model = H2ODeepLearningEstimator(distribution=family,
                                        hidden=[1],
                                        epochs=1000,
                                        train_samples_per_iteration=-1,
                                        reproducible=True,
                                        activation="Tanh",
                                        single_node_mode=False,
                                        balance_classes=True,
                                        force_load_balance=False,
                                        seed=23123,
                                        tweedie_power=1.5,
                                        max_runtime_secs=max_runtime_secs,
                                        score_training_samples=0,
                                        score_validation_samples=0,
                                        stopping_rounds=0)
    dl_model.train(x=x, y=y, training_frame=train)

    self.gstep(1, "Trainning Ensemble")
    # The deep learning model is not one of the ensemble's base models.
    ensemble = H2OStackedEnsembleEstimator(
        model_id="amlr_ensemble",
        base_models=[amlr_gbm, amlr_rf, amlr_xgb, amlr_glm])
    ensemble.train(x=x, y=y, training_frame=train)

    # One ICE image per column; option values start at 201.
    i = 201
    text = ''
    text2 = ''
    self.gstep(1, "Ensamble - (ICE) Individual Condition Expectation")
    for column in tqdm(self.dfo.columns):
        text = text + '<option value="' + str(
            i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue3 == '" + str(
            i
        ) + "'){\n\t\t\t\t\t\t\t\tdivElement3.innerHTML = '<img src=\"images/img_ice_pd_" + str(
            i) + ".png\">';\n\t\t\t\t\t\t\t\t"
        self.write_image(ensemble.ice_plot(valid, column),
                         'ice_pd_' + str(i),
                         width=600,
                         height=500)
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('ice_pd_option', text)
    self.insert_text('ice_pd_image', text2)

    self.gstep(1, "AMLR - Correlation by Model")
    self.write_image(aml.model_correlation_heatmap(test),
                     'aml_correlation_models')

    self.gstep(1, "Processing Models Performance")
    dfp = pd.DataFrame({'Algo': []})
    outcome = list(valid[target].as_data_frame()[target])
    # (display name, trained model, suffix used for the 'cf_<suffix>'
    # placeholder and the 'fi_<suffix>' image). Every model goes through
    # the same evaluation steps.
    model_specs = [
        ('GLM', amlr_glm, 'glm'),
        ('Random Forest', amlr_rf, 'rf'),
        ('GBM', amlr_gbm, 'gbm'),
        ('xGBoost', amlr_xgb, 'xgb'),
        ('Deep Learning', dl_model, 'dl'),
    ]
    matrices = {}
    for i, (algo, estimator, tag) in enumerate(model_specs):
        plt.clf()
        predict = list(estimator.predict(valid).as_data_frame()['predict'])
        cm = ConfusionMatrix(outcome, predict)
        matrices[tag] = cm

        # Feature-importance bar chart.
        var_imp = estimator._model_json['output'][
            'variable_importances'].as_data_frame()
        importance = var_imp['percentage']
        importance.index = var_imp['variable']
        importance.sort_values().plot(kind='barh')
        plt.xlabel('Percentage')
        fig = plt.gcf()
        self.write_image(fig, 'fi_' + tag, width=450, height=450)

        # Classification report table.
        cr = classification_report(outcome,
                                   predict,
                                   target_names=self.allclasses,
                                   output_dict=True)
        table_cr = pd.DataFrame(cr).transpose().round(4)
        table_cr.reset_index(level=0, inplace=True)
        table_cr = table_cr.rename(columns={'index': 'Description'})
        table_model = self.w_table(data=table_cr,
                                   border=0,
                                   align='left',
                                   collapse='collapse',
                                   color='black',
                                   foot=False)
        self.insert_text('cf_' + tag, str(table_model))

        # Overall statistics, tagged with the algorithm name.
        dfp = pd.concat([dfp, pd.DataFrame(cm.overall_stat)[1:]],
                        ignore_index=True)
        dfp.loc[i:, ['Algo']] = algo
    dfp = dfp.round(4)

    cp = Compare({
        'RF': matrices['rf'],
        'GLM': matrices['glm'],
        'GBM': matrices['gbm'],
        'XGB': matrices['xgb'],
        'DL': matrices['dl']
    })
    cp_best_name = cp.best_name
    cp = pd.DataFrame(cp.scores)
    cp.reset_index(level=0, inplace=True)
    cp = cp.rename(columns={'index': 'Description'})
    table_cp = self.w_table(data=cp,
                            border=0,
                            align='left',
                            collapse='collapse',
                            color='black',
                            foot=False)
    if str(cp_best_name) == 'None':
        cp_best_name = 'Confusion matrices are too close and the best one can not be recognized.'
        # No single winner: list every model whose first-row score
        # reaches the maximum (the first column is the description).
        max_v = cp.loc[0][1:].max()
        list_max = [
            column for column in cp.columns[1:] if cp[column][0] >= max_v
        ]
        self.insert_text(
            "the_best_name",
            "Winners: " + ' - '.join(list_max) + '<br>' + cp_best_name)
    else:
        self.insert_text("the_best_name", str(cp_best_name))
    self.insert_text("best_algorithms", str(table_cp))
    # NOTE(review): "the_best_name" is inserted a second time here; whether
    # this is a no-op or overwrites the "Winners" text depends on how
    # insert_text handles an already-filled placeholder. Confirm.
    self.insert_text("the_best_name", str(cp_best_name))

    table_model = self.w_table(data=dfp,
                               border=0,
                               align='left',
                               collapse='collapse',
                               color='black',
                               foot=False)
    self.insert_text("table_performance", str(table_model))

    self.gstep(1, "Closing!! All works are done!!")
    # write report
    self.write_report(self.index_html)
print(f"Train R2 is {lr_log.score(X=X_train_log, y=y_train_log)}")
print(f"Test R2 is {lr_log.score(X=X_test_log, y=y_test_log)}")
# There is a slight improvement (~2%) in the train R2 and test R2 utilizing log transform

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Model Evaluation - Linear Regression
# ### The following section evaluates the linear-model assumptions (random error, constant variance, and normally distributed residuals with mean 0) for the four initial models, using residual plots from Yellowbrick.
#

# -

# Residual Plot for Huber LR with no log-transform
from yellowbrick.regressor import ResidualsPlot
rpv_hr = ResidualsPlot(hr)
rpv_hr.fit(X=X_train, y=y_train)
rpv_hr.score(X=X_test, y=y_test)
rpv_hr.show()  # show() replaces the deprecated poof()

# Residual Plot for LR with no log-transform
rpv_lr = ResidualsPlot(lr)
rpv_lr.fit(X=X_train, y=y_train)
rpv_lr.score(X=X_test, y=y_test)
rpv_lr.show()

# Residual Plot for LR with log transform
rpv_lr_log = ResidualsPlot(lr_log)
rpv_lr_log.fit(X=X_train_log, y=y_train_log)
rpv_lr_log.score(X=X_test_log, y=y_test_log)
rpv_lr_log.show()

# + [markdown] pycharm={"name": "#%% md\n"}
# ## Model Evaluation of Ordinary Least Squares - Log Transform
# Table of the linear model's features and their weights.
st.write(
    eli5.formatters.as_dataframe.explain_weights_df(
        estimator=model_lr,
        feature_names=feature_names)[['feature', 'weight']])

# The bare string literals below are part of the page content (presumably
# rendered by Streamlit "magic" - confirm), so their text is left untouched.
''' Koefisien yang paling besar dari model adalah GrLivArea sebesar 0.3154, artinya harga rumah sensitif dengan kolom ini. Apabila terjadi peningkatan terhadap nilai GrLivArea, harga rumah akan meningkat lebih tinggi dibandingkan apabila terjadi kenaikan pada feature yang lain dengan kenaikan yang sama. Perhatikan juga terdapat feature dengan nilai koefisien yang negatif (ExterQual_TA dan ExterQual_Fa), artinya apabila feature ini meningkat maka harga rumah akan menjadi lebih turun. '''

''' #### 2. Residual Plot '''
st.write('')
visualizer_residual = ResidualsPlot(model_lr)
visualizer_residual.fit(X_train, y_train)
visualizer_residual.score(X_test, y_test)
visualizer_residual.finalize()
# Pass the visualizer's own figure rather than relying on the global pyplot
# state, which Streamlit deprecates. clear_figure=True keeps the previous
# behaviour of clearing the figure after rendering, so the next visualizer
# starts from an empty figure.
st.pyplot(visualizer_residual.ax.figure, clear_figure=True)

''' Residual berdistribusi paling banyak pada nilai 0. Akan tetapi, masih terdapat nilai residual yang cukup tinggi. Hal ini menyebabkan distribusi dari residual tidak sepenuhnya normal, tetapi menjadi skew. '''

''' #### 3. Prediction Error '''
st.write('')
visualizer_prediction_error = PredictionError(model_lr)
visualizer_prediction_error.fit(X_train, y_train)
visualizer_prediction_error.score(X_test, y_test)
visualizer_prediction_error.finalize()