# diagnostic plots y_training_predictions = model.predict(X_train) y_test_predictions = model.predict(X_test) # Plot residuals plot_residuals(y_training_predictions, y_test_predictions, y_train, y_test) residuals_path = "./reports/figures/all_features_no_null/residuals.png" plt.savefig(residuals_path) # Plot predictions vs actual predicted_plot = PredictionPlot(title="Linear Regression (all_features)") predicted_plot.plot(y_training_predictions, y_test_predictions, y_train, y_test) predictions_path = "./reports/figures/all_features_no_null/predictions_vs_actuals.png" predicted_plot.save(predictions_path) model_path = "models/all_features_no_null/model.pickle" with open(model_path, "wb") as model_file_pointer: pickle.dump(model, model_file_pointer) if __name__ == "__main__": configure_logging() dataset = load_training_dataset() logging.info("Linear Regression all features model") train_basic_model(dataset, preprocessing_pipeline_handle_nulls)
def main_method():
    """Run the exploratory modelling workflow on the training dataset.

    Steps, in order: load the data, check for duplicates, pre-process and
    engineer features, split numerical/categorical columns, impute and
    log-transform skewed numerical columns, one-hot encode categoricals,
    split into train/test partitions, standardise, then fit and plot four
    models (plain linear regression, Ridge, LASSO and ElasticNet).

    NOTE(review): the ``exit()`` call directly after the duplicates check
    terminates the interpreter, so every step after it is currently
    unreachable. It is kept to preserve existing behaviour; remove it to run
    the full workflow.
    """
    configure_logging()
    train = load_training_dataset()
    logging.info(train.shape)

    # Duplicates check
    check_for_duplicates(train)
    # NOTE(review): early exit, presumably a debugging leftover -- everything
    # below this line is dead code until it is removed.
    exit()

    # ## Pre-processing
    plot_col_vs_sale_price(train, "GrLivArea", title="Looking for outliers")

    # Drop the houses with more than 4000 sq feet following dataset author recommendations:
    # https://ww2.amstat.org/publications/jse/v19n3/decock.pdf

    # ### Pre-processing steps added
    #
    # * Filter large house outliers
    # * Log transform the target (Sale Price)
    #   * errors have same effect whether the house is cheap or not
    #
    train = preprocessing_pipeline(train)
    y = train.SalePrice
    plot_col_vs_sale_price(train, "GrLivArea", title="Area vs Sale Price AFTER Log transform")
    train = fill_null_values(train)

    # Numerical features that are really categories
    # TODO: pull into pipeline method
    train = create_sub_class_categories(train)
    train = create_month_sold_category(train)

    # Encode categoricals as ordered number features
    # when there is information in the order
    train = create_ordinal_categories(train)

    # Simplifications of existing features
    create_simple_overall_quality(train)
    create_simple_overall_condition(train)
    create_simple_pool_quality(train)
    create_simple_garage_condition(train)
    create_simple_garage_quality(train)
    create_simple_fireplace_quality(train)
    create_simple_functional_feature(train)
    create_simple_kitchen_quality(train)
    create_simple_heating_quality(train)
    create_simple_basement_finish(train, "BsmtFinType1")
    create_simple_basement_finish(train, "BsmtFinType2")
    create_simple_basement_condition(train)
    create_simple_basement_quality(train)
    create_simple_exterior_condition(train)
    create_simple_exterior_quality(train)

    # Combinations of existing features
    create_interaction_features(train)

    # Has masonry veneer or not
    create_simple_has_masonry_veneer(train)
    create_house_bought_pre_build(train)
    create_polynomial_features(train)

    # #### Split numerical and categorical features
    categorical_features = train.select_dtypes(include=["object"]).columns
    numerical_features = train.select_dtypes(exclude=["object"]).columns.drop("SalePrice")
    logging.info(f"Numerical features: {len(numerical_features)}")
    logging.info(f"Categorical features: {len(categorical_features)}")
    train_num = train[numerical_features]
    train_cat = train[categorical_features]

    # Handle missing values in numerical features by using the median
    logging.info(f"Missing numerical values: {train_num.isnull().values.sum()}")
    train_num = train_num.fillna(train_num.median())
    logging.info(f"Remaining missing numerical values: {train_num.isnull().values.sum()}")

    ## Log transform skewed numerical features to lessen impact of outliers
    # Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
    # As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
    skewness = train_num.apply(skew)
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    train_num[skewed_features] = np.log1p(train_num[skewed_features])

    ## One hot encode categorical variables
    train_cat = pd.get_dummies(train_cat)
    train = pd.concat([train_num, train_cat], axis=1)
    logging.info(f"Number of features: {train.shape[1]}")

    # ## Modelling
    #
    # * Split dataset
    # * Standardisation (don't want to fit on observations that will be in the test set)
    #
    # ### Modelling techniques tried
    #
    # * Linear regression
    # * Ridge Regression (L2)
    # * LASSO (L1)
    # * ElasticNET (L1 AND L2)
    #
    # Split training set
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.3, random_state=0
    )
    # The shape is interpolated into the message itself: passing it as an
    # extra positional argument without a placeholder makes logging fail to
    # format the record.
    logging.info(f"X_train {X_train.shape}")
    logging.info(f"X_test {X_test.shape}")
    logging.info(f"y_train {y_train.shape}")
    logging.info(f"y_test {y_test.shape}")

    # Standard scale the features
    # Done after partitioning to avoid fitting scaler to observations in the test set
    # Should the scaler be pickled for deployment use cases then?
    scaler = StandardScaler()
    X_train.loc[:, numerical_features] = scaler.fit_transform(X_train.loc[:, numerical_features])
    X_test.loc[:, numerical_features] = scaler.transform(X_test.loc[:, numerical_features])

    # Official error measure for scoring: RMSE
    # (reported through the rmse_cv_train / rmse_cv_test helpers below)

    # ## Linear regression without regularisation
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    logging.info(f"RMSE on Training set: {rmse_cv_train(linear_regression).mean()}")
    logging.info(f"RMSE on Test set: {rmse_cv_test(linear_regression).mean()}")
    _show_fit_diagnostics(
        "Linear Regression",
        linear_regression.predict(X_train),
        y_train,
        linear_regression.predict(X_test),
        y_test,
    )

    # ## Linear Regression with Ridge Regression (L2 Penalty)
    #
    # * Regularisation is a good way to handle collinearity, filter out noise and prevent overfitting.
    # * L2 penalty add the squared sum of weights to cost function
    ridge_regression = RidgeCV(
        alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    ridge_regression = RidgeCV(alphas=_alphas_around(best_alpha), cv=10)
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"Ridge RMSE on Training set: {rmse_cv_train(ridge_regression).mean()}")
    logging.info(f"Ridge RMSE on Test set: {rmse_cv_test(ridge_regression).mean()}")
    _show_fit_diagnostics(
        "Linear Regression with Ridge regularisation",
        ridge_regression.predict(X_train),
        y_train,
        ridge_regression.predict(X_test),
        y_test,
    )

    ## Plot important coefficients
    _show_top_coefficients("Ridge", pd.Series(ridge_regression.coef_, index=X_train.columns))

    # Results:
    # * Better RMSE for Ridge
    # * Small difference between training and test suggests that overfitting has been eliminated
    # * Ridge used almost all of the features (only 3 dropped)
    #
    # TODO: Understand the coefficient plot (if possible?)

    # ## LASSO Regression
    #
    # Least Absolute Shrinkage and Selection Operator.
    #
    # Alternative regularisation method, L1 Regularisation, use the absolute value of weights rather than squares.
    #
    # Most weights will be 0. Useful in high dimensional dataset where most features are irrelevant.
    #
    # Hypothesis: more efficient than Ridge Regression.
    lasso_regression = LassoCV(
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10,
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    lasso_regression = LassoCV(alphas=_alphas_around(best_alpha), max_iter=50000, cv=10)
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"LASSO RMSE on Training set: {rmse_cv_train(lasso_regression).mean()}")
    logging.info(f"LASSO RMSE on Test set: {rmse_cv_test(lasso_regression).mean()}")
    # Both LASSO plots now carry the LASSO title (the predictions plot used to
    # be mislabelled as Ridge).
    _show_fit_diagnostics(
        "Linear Regression with LASSO regularisation",
        lasso_regression.predict(X_train),
        y_train,
        lasso_regression.predict(X_test),
        y_test,
    )

    ## Plot important coefficients
    _show_top_coefficients("LASSO", pd.Series(lasso_regression.coef_, index=X_train.columns))

    # ## ElasticNET
    #
    # * Compromise between L1 and L2 penalties
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10,
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    elastic_net_regression = ElasticNetCV(
        l1_ratio=_l1_ratios_around(best_l1_ratio),
        alphas=_alphas_around(best_alpha),
        max_iter=50000,
        cv=10,
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"ElasticNet RMSE on Training set: {rmse_cv_train(elastic_net_regression).mean()}")
    logging.info(f"ElasticNet RMSE on Test set: {rmse_cv_test(elastic_net_regression).mean()}")
    _show_fit_diagnostics(
        "Linear Regression with ElasticNET regularisation",
        elastic_net_regression.predict(X_train),
        y_train,
        elastic_net_regression.predict(X_test),
        y_test,
    )

    ## Plot important coefficients
    _show_top_coefficients(
        "ElasticNET", pd.Series(elastic_net_regression.coef_, index=X_train.columns)
    )


def _alphas_around(best_alpha):
    """Return candidate alphas from 60% to 140% of ``best_alpha`` in 5% steps."""
    factors = (
        .6, .65, .7, .75, .8, .85, .9, .95, 1,
        1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4,
    )
    return [best_alpha * factor for factor in factors]


def _l1_ratios_around(best_l1_ratio):
    """Return candidate L1 ratios from 85% to 115% of ``best_l1_ratio``.

    Candidates above 1 are dropped because the ElasticNet mixing parameter
    is only meaningful between 0 and 1. The unscaled best ratio is always
    part of the result, so the list is never empty for a valid input.
    """
    factors = (.85, .9, .95, 1, 1.05, 1.1, 1.15)
    candidates = (best_l1_ratio * factor for factor in factors)
    return [ratio for ratio in candidates if ratio <= 1]


def _show_fit_diagnostics(title, y_train_pred, y_train, y_test_pred, y_test):
    """Show a residuals plot followed by a predicted-vs-real plot.

    Both figures use ``title`` and are displayed with ``plt.show()``.
    """
    # The red reference lines span 10.5 to 13.5 on the prediction axis --
    # presumably the range of the log-transformed sale price; TODO confirm.
    low, high = 10.5, 13.5

    # Plot residuals
    plt.scatter(y_train_pred, y_train_pred - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(
        y_test_pred, y_test_pred - y_test, c="lightgreen", marker="s", label="Validation data"
    )
    plt.title(title)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=low, xmax=high, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test, c="lightgreen", marker="s", label="Validation")
    plt.title(title)
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([low, high], [low, high], c="red")
    plt.show()


def _show_top_coefficients(model_name, coefs):
    """Log how many coefficients are non-zero and plot the 10 lowest and 10 highest.

    ``coefs`` is a pandas Series of model coefficients indexed by feature
    name; ``model_name`` is used in the log message and the plot title.
    """
    kept = (coefs != 0).sum()
    dropped = (coefs == 0).sum()
    logging.info(
        f"{model_name} picked {kept} features and eliminated the other {dropped} features"
    )
    ordered = coefs.sort_values()
    important_coefficients = pd.concat([ordered.head(10), ordered.tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title(f"Coefficients in the {model_name} Model")
    plt.show()