Example #1
def test_fit_ml_model_with_evaluation_with_user_provided_bounds():
    """Tests fit_ml_model_with_evaluation
        with min_admissible_value and max_admissible_value"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        min_admissible_value=-7,
        max_admissible_value=20.00)

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values on a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        8.36, 11.19, 1.85, 15.57, 16.84, 14.44, 20.00, 9.02, 1.81, -7.00
    ]
    assert list(y_test_pred.round(2)) == expected_values
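
The snippets on this page call several Greykite helpers without showing their imports. Below is a minimal sketch of the imports they presumably rely on; the module paths are assumptions based on the Greykite package layout and should be checked against the installed version.

import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

# Assumed module paths; verify against your Greykite version.
from greykite.algo.common.ml_models import fit_ml_model
from greykite.algo.common.ml_models import fit_ml_model_with_evaluation
from greykite.algo.common.ml_models import predict_ml
from greykite.common.constants import ERR_STD_COL
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.common.evaluation import calc_pred_err
from greykite.common.testing_utils import generate_test_data_for_fitting
from greykite.common.testing_utils import gen_sliced_df
# predict_ci (used in Example #9) comes from Greykite's conditional
# uncertainty module; its exact import path is not shown here.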
Example #2
def test_fit_ml_model_with_evaluation_skip_test():
    """Tests fit_ml_model_with_evaluation, on linear model,
        skipping test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df, model_formula_str=model_formula_str, training_fraction=1.0)

    assert len(trained_model["y_test"]) == 0
    assert trained_model["y_test_pred"] is None
    assert trained_model["test_evaluation"] is None
    assert trained_model["plt_compare_test"] is None

    arr1 = predict_ml(fut_df=df, trained_model=trained_model)[y_col].tolist()
    arr2 = trained_model["y_train_pred"]

    assert np.array_equal(arr1, arr2)

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
Example #3
def test_fit_ml_model_with_evaluation_with_test_set():
    """Tests fit_ml_model_with_evaluation, with test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={"alpha": 0.1})

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    assert trained_model["ml_model"].alpha == 0.1
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 6.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert round(err[enum.get_metric_name()]) == 7.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [9.0, 9.0, 7.0, 10.0, 10.0, 10.0, 11.0, 9.0, 8.0, 6.0]
    assert list(y_test_pred.round()) == expected_values
Example #4
def test_fit_ml_model_with_evaluation_constant_column_sgd():
    """Tests fit_ml_model_with_evaluation using sgd with
    no penalty when some regressors are constant
    With limited data, the models converge to slightly different predictions
    than the linear model"""
    res = generate_test_data_for_fitting(n=80)
    df = res["df"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    # add constant columns
    new_cols = []
    for i in range(300):
        col = f"cst{i}"
        df[col] = 0
        df_test[col] = 2
        new_cols.append(col)
    df["cst_event"] = "string"
    df_test["cst_event"] = "string"
    new_cols.append("cst_event")

    model_formula_str = "+".join([res["model_formula_str"]] + new_cols)

    fit_algorithm = "sgd"
    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm=fit_algorithm,
        fit_algorithm_params={
            "tol": 1e-5,
            "penalty": "none"
        })

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]
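    # intercept, x1, x2, x3, x4, [constant columns] (same layout as in the linear example below)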
    expected_values = [-8.0, 2.0, 3.0, -2.0, 38.0, 0.0, 0.0]
    assert list(pd.Series(
        trained_model["ml_model"].coef_)[:7].round()) == expected_values

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        17.0, 18.0, 16.0, 13.0, 9.0, 2.0, 20.0, 12.0, 12.0, 13.0
    ]
    assert list(y_test_pred.round()) == expected_values
Example #5
def test_fit_ml_model_with_evaluation_constant_column():
    """Tests ``fit_ml_model_with_evaluation``
    when some regressors are constant"""
    res = generate_test_data_for_fitting(n=80)
    df = res["df"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    # add constant columns
    new_cols = []
    for i in range(300):
        col = f"cst{i}"
        df[col] = 0
        df_test[col] = 2
        new_cols.append(col)
    df["cst_event"] = "string"
    df_test["cst_event"] = "string"
    new_cols.append("cst_event")

    model_formula_str = "+".join([res["model_formula_str"]] + new_cols)

    fit_algorithm = "linear"
    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm=fit_algorithm,
        normalize_method="min_max")

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    # intercept, x1, x2, x3, x4, [constant columns]
    expected_values = [-23.0, 1.0, 4.0, 0.0, 44.0, 0.0, 0.0]
    assert list(pd.Series(
        trained_model["ml_model"].coef_)[:7].round()) == expected_values

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        18.0, 19.0, 16.0, 13.0, 9.0, 1.0, 23.0, 14.0, 12.0, 14.0
    ]
    assert list(y_test_pred.round()) == expected_values
Example #6
def test_fit_ml_model_with_evaluation_with_weights():
    """Tests fit_ml_model_with_evaluation, with test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"
    df["weights"] = range(1, len(df) + 1)

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="ridge",
        regression_weight_col="weights")

    assert trained_model["regression_weight_col"] == "weights"

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 2.0

    # Checks that an exception is raised if the weights contain negative values
    df["weights"] = -df["weights"]
    with pytest.raises(ValueError, match="Weights can not be negative."):
        fit_ml_model_with_evaluation(df=df,
                                     model_formula_str=model_formula_str,
                                     fit_algorithm="ridge",
                                     regression_weight_col="weights")
Example #7
def test_fit_ml_model_with_evaluation_sgd():
    """Tests fit_ml_model_with_evaluation, on sgd model"""
    res = generate_test_data_for_fitting()
    df = res["df"]
    model_formula_str = res["model_formula_str"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={"penalty": "none"})

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={
            "penalty": "elasticnet",
            "alpha": 0.01,
            "l1_ratio": 0.2
        })

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert round(err[enum.get_metric_name()]) == 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
Example #8
def test_fit_ml_model_with_evaluation_random_forest():
    """Tests fit_ml_model_with_evaluation, on random forest model"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df, model_formula_str=model_formula_str, fit_algorithm="rf")

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 4.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 4.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
Example #9
def test_fit_ml_model_with_evaluation_with_uncertainty():
    """Tests fit_ml_model_with_evaluation with uncertainty intervals"""
    df = gen_sliced_df(sample_size_dict={
        "a": 200,
        "b": 340,
        "c": 300,
        "d": 8,
        "e": 800
    },
                       seed_dict={
                           "a": 301,
                           "b": 167,
                           "c": 593,
                           "d": 893,
                           "e": 191,
                           "z": 397
                       },
                       err_magnitude_coef=8.0)

    df = df[["x", "z_categ", "y_hat"]]
    df.rename(columns={"y_hat": "y"}, inplace=True)
    model_formula_str = "y~x+z_categ"
    y_col = "y"
    # test_df
    fut_df = df.copy()
    # rename the column of true values in fut_df
    # so we can keep track of them later
    fut_df.rename(columns={"y": "y_true"}, inplace=True)
    y_test = fut_df["y_true"]
    # indices of a small sample used to spot-check predicted values
    small_sample_index = [1, 500, 750, 1000]

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        uncertainty_dict={
            "uncertainty_method": "simple_conditional_residuals",
            "params": {
                "quantiles": [0.025, 0.975],
                "quantile_estimation_method": "normal_fit",
                "sample_size_thresh": 10,
                "small_sample_size_method": "std_quantiles",
                "small_sample_size_quantile": 0.8
            }
        })

    y_test_pred = predict_ml(fut_df=fut_df, trained_model=trained_model)[y_col]
    y_test_pred_small = y_test_pred[small_sample_index]

    # testing predictions
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    assert list(y_test_pred_small.round(1)) == [99.7, 201.5, 303.5, 7.3], (
        "predictions are not correct")

    # testing uncertainty
    # assign the predicted y to the response in fut_df
    fut_df["y"] = y_test_pred
    new_df_with_uncertainty = predict_ci(fut_df,
                                         trained_model["uncertainty_model"])
    assert list(new_df_with_uncertainty.columns) == [
        "y_quantile_summary", ERR_STD_COL
    ], ("column names are not as expected")
    fut_df["y_quantile_summary"] = new_df_with_uncertainty[
        "y_quantile_summary"]

    # calculate coverage of the CI
    fut_df["inside_95_ci"] = fut_df.apply(lambda row: (
        (row["y_true"] <= row["y_quantile_summary"][1]) and
        (row["y_true"] >= row["y_quantile_summary"][0])),
                                          axis=1)

    ci_coverage = 100.0 * fut_df["inside_95_ci"].mean()
    assert ci_coverage > 94.0 and ci_coverage < 96.0, (
        "95 percent CI coverage is not between 94 and 96")

    # tests that passing an unimplemented uncertainty_method raises an exception
    with pytest.raises(
            Exception,
            match="uncertainty method: non_existing_method is not implemented"
    ):
        fit_ml_model_with_evaluation(df=df,
                                     model_formula_str=model_formula_str,
                                     uncertainty_dict={
                                         "uncertainty_method":
                                         "non_existing_method",
                                         "params": {
                                             "quantiles": [0.025, 0.975],
                                             "quantile_estimation_method":
                                             "normal_fit",
                                             "sample_size_thresh": 10,
                                             "small_sample_size_method":
                                             "std_quantiles",
                                             "small_sample_size_quantile": 0.8
                                         }
                                     })
Example #10
def test_fit_ml_model():
    """Tests fit_ml_model"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model(df=df,
                                 model_formula_str=model_formula_str,
                                 fit_algorithm="sgd",
                                 fit_algorithm_params={"alpha": 0.1})

    pred_df = predict_ml(fut_df=df_test, trained_model=trained_model)

    input_cols = ["x1", "x2", "x3", "x4", "x1_categ"]
    assert_frame_equal(pred_df[input_cols].reset_index(drop=True),
                       df_test[input_cols].reset_index(drop=True))

    y_test_pred = pred_df[y_col]

    assert trained_model["ml_model"].alpha == 0.1

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 6.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert round(err[enum.get_metric_name()]) == 7.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # Tests that the returned `fitted_df` is correct
    fitted_df_via_predict = predict_ml(fut_df=df, trained_model=trained_model)
    assert trained_model["fitted_df"].equals(fitted_df_via_predict)

    # Tests actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [9.0, 9.0, 7.0, 10.0, 10.0, 10.0, 11.0, 9.0, 8.0, 6.0]
    assert list(y_test_pred.round()) == expected_values

    ml_model_summary = trained_model["ml_model_summary"].round(2)
    assert list(ml_model_summary["variable"].values) == [
        "Intercept", "x1", "x2", "x3", "x4"
    ]
    assert list(ml_model_summary["coef"].round().values) == [
        -0.0, -0.0, 2.0, 1.0, 10.0
    ]

    # Testing the summary returned from statsmodels.
    # The summary in this case is very informative, with several tables.
    # `tables[1]` includes the coefficients and p-values.
    # Parameters without p-values are available directly
    # through `trained_model["ml_model"].params` (names and coefficients).
    # However, `summary` includes more information (e.g. p-values), which is desirable.
    # Here we test those values even though they are harder to get through `summary`.
    trained_model = fit_ml_model(df=df,
                                 model_formula_str=model_formula_str,
                                 fit_algorithm="statsmodels_ols",
                                 fit_algorithm_params={"alpha": 0.1})

    ml_model_summary = trained_model["ml_model_summary"]
    ml_model_summary_table = ml_model_summary.tables[1]
    assert ml_model_summary_table[0].data == ([
        "", "coef", "std err", "t", "P>|t|", "[0.025", "0.975]"
    ])
    assert ml_model_summary_table[1].data == ([
        "Intercept", "  -26.5445", "    0.456", "  -58.197", " 0.000",
        "  -27.440", "  -25.649"
    ])
    assert ml_model_summary_table[2].data == ([
        "x1", "    0.5335", "    0.409", "    1.304", " 0.192", "   -0.269",
        "    1.336"
    ])