# diagnostic plots
    y_training_predictions = model.predict(X_train)
    y_test_predictions = model.predict(X_test)

    # Plot residuals
    plot_residuals(y_training_predictions, y_test_predictions, y_train, y_test)
    residuals_path = "./reports/figures/all_features_no_null/residuals.png"
    plt.savefig(residuals_path)

    # Plot predictions vs actual
    predicted_plot = PredictionPlot(title="Linear Regression (all_features)")
    predicted_plot.plot(y_training_predictions, y_test_predictions, y_train,
                        y_test)

    predictions_path = "./reports/figures/all_features_no_null/predictions_vs_actuals.png"
    predicted_plot.save(predictions_path)

    model_path = "models/all_features_no_null/model.pickle"
    with open(model_path, "wb") as model_file_pointer:
        pickle.dump(model, model_file_pointer)


if __name__ == "__main__":
    # Script entry point: this branch only runs when the file is executed
    # directly, not when it is imported as a module.
    configure_logging()

    # Load the dataset once up front; it is handed to the trainer below.
    dataset = load_training_dataset()

    logging.info("Linear Regression all features model")
    # The preprocessing pipeline is passed as an object (it is not called
    # here); train_basic_model decides when and how to apply it.
    train_basic_model(dataset, preprocessing_pipeline_handle_nulls)
# Example #2
# 0
# Multipliers applied to the best alpha of a coarse search to build the finer
# second-pass grid (60% .. 140% of the best value, in 5% steps).
_ALPHA_REFINEMENT_FACTORS = (
    .6, .65, .7, .75, .8, .85, .9, .95, 1,
    1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4,
)

# Multipliers applied to the best ElasticNet L1 ratio for the second pass.
_L1_RATIO_REFINEMENT_FACTORS = (.85, .9, .95, 1, 1.05, 1.1, 1.15)

# Coarse alpha grid shared by the first LASSO and ElasticNET searches.
_COARSE_L1_ALPHAS = (
    0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
    0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1,
)

# Extent of the red reference lines drawn on the diagnostic plots.
# NOTE(review): presumably the range of the log-transformed SalePrice --
# confirm against the data.
_REFERENCE_MIN = 10.5
_REFERENCE_MAX = 13.5


def _refine_around(best_value, factors, upper_bound=None):
    """Return candidate values clustered around *best_value*.

    Each factor is multiplied with ``best_value``; results are optionally
    capped at ``upper_bound`` and de-duplicated while keeping their order.
    """
    candidates = [best_value * factor for factor in factors]
    if upper_bound is not None:
        candidates = [min(candidate, upper_bound) for candidate in candidates]
    # dict.fromkeys drops repeats (e.g. several values capped to the bound)
    # without disturbing the ordering of the grid.
    return list(dict.fromkeys(candidates))


def _show_fit_diagnostics(title, y_train_pred, y_train, y_test_pred, y_test):
    """Show the residual plot, then the predicted-vs-real plot, for one model."""
    # Residuals (prediction minus truth) against the predicted value.
    plt.scatter(y_train_pred, y_train_pred - y_train,
                c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test,
                c="lightgreen", marker="s", label="Validation data")
    plt.title(title)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=_REFERENCE_MIN, xmax=_REFERENCE_MAX, color="red")
    plt.show()

    # Real values against predictions; the red diagonal marks a perfect fit.
    plt.scatter(y_train_pred, y_train,
                c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test,
                c="lightgreen", marker="s", label="Validation")
    plt.title(title)
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([_REFERENCE_MIN, _REFERENCE_MAX],
             [_REFERENCE_MIN, _REFERENCE_MAX], c="red")
    plt.show()


def _evaluate_model(model, rmse_prefix, plot_title,
                    X_train, X_test, y_train, y_test):
    """Log the cross-validated RMSE of a fitted model and show its diagnostics.

    ``rmse_prefix`` is prepended verbatim to the two RMSE log messages
    (pass ``""`` for no prefix).
    """
    logging.info(f"{rmse_prefix}RMSE on Training set: {rmse_cv_train(model).mean()}")
    logging.info(f"{rmse_prefix}RMSE on Test set: {rmse_cv_test(model).mean()}")
    _show_fit_diagnostics(
        plot_title,
        model.predict(X_train), y_train,
        model.predict(X_test), y_test,
    )


def _show_top_coefficients(model_name, model, feature_names):
    """Log how many coefficients are (non-)zero and plot the 10 lowest/highest."""
    coefs = pd.Series(model.coef_, index=feature_names)
    logging.info(f"{model_name} picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    ordered = coefs.sort_values()
    important_coefficients = pd.concat([ordered.head(10), ordered.tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title(f"Coefficients in the {model_name} Model")
    plt.show()


def main_method():
    """Run the exploratory house-price regression workflow end to end.

    Steps, in order: configure logging, load the training set, check for
    duplicates, pre-process and engineer features, split numerical and
    categorical columns, impute/log-transform/one-hot encode, split into
    train and validation partitions, standard-scale the numerical columns,
    then fit and plot four models (plain linear regression, Ridge, LASSO,
    ElasticNET), each regularised model being tuned with a coarse and then
    a refined hyper-parameter grid.

    Takes no arguments and returns nothing; results are reported through
    ``logging`` and interactive matplotlib windows.

    NOTE(review): the ``exit()`` call right after the duplicates check
    currently terminates the interpreter, so nothing below it runs.
    """
    configure_logging()
    train = load_training_dataset()
    logging.info(train.shape)

    # Duplicates check
    check_for_duplicates(train)
    # NOTE(review): this exit() makes the whole remainder of the function
    # unreachable. It looks like a debugging leftover, but it is kept so this
    # change does not silently switch on the full pipeline -- remove it once
    # the code below (including rmse_cv_train / rmse_cv_test) is confirmed to
    # work from inside this function.
    exit()

    # ## Pre-processing
    plot_col_vs_sale_price(train, "GrLivArea", title="Looking for outliers")

    # Drop the houses with more than 4000 sq feet following dataset author recommendations:
    # https://ww2.amstat.org/publications/jse/v19n3/decock.pdf
    # ### Pre-processing steps added
    #
    # * Filter large house outliers
    # * Log transform the target (Sale Price)
    # * errors have same effect whether the house is cheap or not
    #
    train = preprocessing_pipeline(train)
    # The target is captured here, before SalePrice is separated from the
    # feature columns further down.
    y = train.SalePrice

    plot_col_vs_sale_price(train, "GrLivArea", title="Area vs Sale Price AFTER Log transform")

    train = fill_null_values(train)

    # Numerical features that are really categories
    # TODO: pull into pipeline method
    train = create_sub_class_categories(train)
    train = create_month_sold_category(train)

    # Encode categoricals as ordered number features
    # when there is information in the order
    train = create_ordinal_categories(train)

    # Simplifications of existing features.
    # These helpers are called for their side effects only (return values
    # are discarded) -- presumably they add columns to `train` in place.
    create_simple_overall_quality(train)
    create_simple_overall_condition(train)
    create_simple_pool_quality(train)
    create_simple_garage_condition(train)
    create_simple_garage_quality(train)
    create_simple_fireplace_quality(train)
    create_simple_functional_feature(train)
    create_simple_kitchen_quality(train)
    create_simple_heating_quality(train)
    create_simple_basement_finish(train, "BsmtFinType1")
    create_simple_basement_finish(train, "BsmtFinType2")

    create_simple_basement_condition(train)
    create_simple_basement_quality(train)
    create_simple_exterior_condition(train)
    create_simple_exterior_quality(train)

    # Combinations of existing features
    create_interaction_features(train)

    # Has masonry veneer or not
    create_simple_has_masonry_veneer(train)

    create_house_bought_pre_build(train)

    create_polynomial_features(train)

    # #### Split numerical and categorical features
    categorical_features = train.select_dtypes(include=["object"]).columns
    numerical_features = train.select_dtypes(exclude=["object"]).columns.drop("SalePrice")
    logging.info(f"Numerical features: {len(numerical_features)}")
    logging.info(f"Categorical features: {len(categorical_features)}")
    train_num = train[numerical_features]
    train_cat = train[categorical_features]

    # Handle missing values in numerical features by using the median
    logging.info(f"Missing numerical values: {train_num.isnull().values.sum()}")
    train_num = train_num.fillna(train_num.median())
    logging.info(f"Remaining missing numerical values: {train_num.isnull().values.sum()}")

    ## Log transform skewed numerical features to lessen impact of outliers
    # Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
    # As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
    skewness = train_num.apply(skew)
    skewed_features = skewness[abs(skewness) > 0.5].index
    train_num[skewed_features] = np.log1p(train_num[skewed_features])

    ## One hot encode categorical variables
    train_cat = pd.get_dummies(train_cat)

    train = pd.concat([train_num, train_cat], axis=1)
    logging.info(f"Number of features: {train.shape[1]}")

    # ## Modelling
    #
    # * Split dataset
    # * Standardisation (don't want to fit on observations that will be in the test set)
    #
    # ### Modelling techniques tried
    #
    # * Linear regression
    # * Ridge Regression (L2)
    # * LASSO (L1)
    # * ElasticNET (L1 AND L2)
    #
    # Split training set
    X_train, X_test, y_train, y_test = train_test_split(
        train,
        y,
        test_size=0.3,
        random_state=0
    )
    # logging treats extra positional arguments as %-format arguments, so the
    # message needs a placeholder for the shape to be rendered.
    logging.info("X_train %s", X_train.shape)
    logging.info("X_test %s", X_test.shape)
    logging.info("y_train %s", y_train.shape)
    logging.info("y_test %s", y_test.shape)

    # Standard scale the features
    # Done after partitioning to avoid fitting scaler to observations in the test set
    # Should the scaler be pickled for deployment use cases then?
    scaler = StandardScaler()
    X_train.loc[:, numerical_features] = scaler.fit_transform(X_train.loc[:, numerical_features])
    X_test.loc[:, numerical_features] = scaler.transform(X_test.loc[:, numerical_features])

    # Official error measure for scoring: RMSE
    # NOTE(review): `scorer` is never used again in this function and it wraps
    # (negated) MSE rather than RMSE -- confirm whether rmse_cv_train /
    # rmse_cv_test were meant to receive it.
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # ## Linear regression without regularisation
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    _evaluate_model(linear_regression, "", "Linear Regression",
                    X_train, X_test, y_train, y_test)

    # ## Linear Regression with Ridge Regression (L2 Penalty)
    #
    # * Regularisation is a good way to handle collinearity, filter out noise and prevent overfitting.
    # * L2 penalty add the squared sum of weights to cost function
    ridge_regression = RidgeCV(
        alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    ridge_regression = RidgeCV(
        alphas=_refine_around(best_alpha, _ALPHA_REFINEMENT_FACTORS),
        cv=10
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    _evaluate_model(ridge_regression, "Ridge ",
                    "Linear Regression with Ridge regularisation",
                    X_train, X_test, y_train, y_test)
    ## Plot important coefficients
    _show_top_coefficients("Ridge", ridge_regression, X_train.columns)
    # Results:
    # * Better RMSE for Ridge
    # * Small difference between training and test suggests that overfitting has been eliminated
    # * Ridge used almost all of the features (only 3 dropped)
    #
    # TODO: Understand the coefficient plot (if possible?)

    # ## LASSO Regression
    #
    # Least Absolute Shrinkage and Selection Operator.
    #
    # Alternative regularisation method, L1 Regularisation, use the absolute value of weights rather than squares.
    #
    # Most weights will be 0. Useful in high dimensional dataset where most features are irrelevant.
    #
    # Hypothesis: more efficient than Ridge Regression.
    lasso_regression = LassoCV(
        alphas=list(_COARSE_L1_ALPHAS),
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    lasso_regression = LassoCV(
        alphas=_refine_around(best_alpha, _ALPHA_REFINEMENT_FACTORS),
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    _evaluate_model(lasso_regression, "LASSO ",
                    "Linear Regression with LASSO regularisation",
                    X_train, X_test, y_train, y_test)
    ## Plot important coefficients
    _show_top_coefficients("LASSO", lasso_regression, X_train.columns)

    # ## ElasticNET
    #
    # * Compromise between L1 and L2 penalties
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
        alphas=list(_COARSE_L1_ALPHAS),
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    elastic_net_regression = ElasticNetCV(
        # The L1 ratio must stay within [0, 1]; scaling a best ratio close
        # to 1 by up to 1.15 would otherwise leave that range.
        l1_ratio=_refine_around(
            best_l1_ratio, _L1_RATIO_REFINEMENT_FACTORS, upper_bound=1.0
        ),
        alphas=_refine_around(best_alpha, _ALPHA_REFINEMENT_FACTORS),
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")
    _evaluate_model(elastic_net_regression, "ElasticNet ",
                    "Linear Regression with ElasticNET regularisation",
                    X_train, X_test, y_train, y_test)
    ## Plot important coefficients
    _show_top_coefficients("ElasticNET", elastic_net_regression, X_train.columns)