Example #1
def test_fit_ml_model_with_evaluation_with_weights():
    """Tests fit_ml_model_with_evaluation, with test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"
    df["weights"] = range(1, len(df) + 1)

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="ridge",
        regression_weight_col="weights")

    assert trained_model["regression_weight_col"] == "weights"

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 2.0

    # Checks for raising exception if weights have negative values
    df["weights"] = -df["weights"]
    with pytest.raises(ValueError, match="Weights can not be negative."):
        fit_ml_model_with_evaluation(df=df,
                                     model_formula_str=model_formula_str,
                                     fit_algorithm="ridge",
                                     regression_weight_col="weights")
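
The examples in this listing omit the import block of the original test module. A minimal sketch of the imports they rely on is shown below; the greykite module paths are an assumption based on the project's layout and may differ across versions.

import numpy as np
import pandas as pd
import pytest

# Module paths below are assumed from the Greykite project layout; adjust to your version.
from greykite.algo.common.ml_models import fit_ml_model_with_evaluation
from greykite.algo.common.ml_models import predict_ml
from greykite.algo.uncertainty.conditional.conf_interval import predict_ci
from greykite.common.constants import ERR_STD_COL
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.common.evaluation import calc_pred_err
from greykite.common.python_utils import assert_equal
from greykite.common.testing_utils import gen_sliced_df
from greykite.common.testing_utils import generate_test_data_for_fitting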
Example #2
def test_fit_ml_model_with_evaluation_skip_test():
    """Tests fit_ml_model_with_evaluation, on linear model,
        skipping test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df, model_formula_str=model_formula_str, training_fraction=1.0)

    assert len(trained_model["y_test"]) == 0
    assert trained_model["y_test_pred"] is None
    assert trained_model["test_evaluation"] is None
    assert trained_model["plt_compare_test"] is None

    arr1 = predict_ml(fut_df=df, trained_model=trained_model)[y_col].tolist()
    arr2 = trained_model["y_train_pred"]

    assert np.array_equal(arr1, arr2)

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
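
Every example in this listing unpacks the same dictionary returned by generate_test_data_for_fitting. The sketch below covers only the fields the tests actually use; the real generator in the Greykite test utilities may return more.

data = generate_test_data_for_fitting()       # some examples pass n=80 for a smaller sample
df = data["df"]                                # training frame containing the response column "y"
df_test = data["df_test"]                      # held-out frame passed to predict_ml as fut_df
y_test = data["y_test"]                        # true response values aligned with df_test
model_formula_str = data["model_formula_str"]  # R-style formula string ("y~..."); exact terms not shown in this listing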
Example #3
def test_fit_ml_model_with_evaluation_with_user_provided_bounds():
    """Tests fit_ml_model_with_evaluation
        with min_admissible_value and max_admissible_value"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        min_admissible_value=-7,
        max_admissible_value=20.00)

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values on a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        8.36, 11.19, 1.85, 15.57, 16.84, 14.44, 20.00, 9.02, 1.81, -7.00
    ]
    assert list(y_test_pred.round(2)) == expected_values
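
The 20.00 and -7.00 entries in expected_values show predictions being capped at max_admissible_value and floored at min_admissible_value. Conceptually this amounts to clipping the raw predictions; a minimal sketch, not the library's actual implementation:

import numpy as np

raw_pred = np.array([25.3, -9.1, 8.36])            # hypothetical unclipped predictions
clipped = np.clip(raw_pred, a_min=-7, a_max=20.0)  # -> array([20.  , -7.  ,  8.36])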
Example #4
def test_fit_ml_model_with_evaluation_with_test_set():
    """Tests fit_ml_model_with_evaluation, with test set"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={"alpha": 0.1})

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    assert trained_model["ml_model"].alpha == 0.1
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 6.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert round(err[enum.get_metric_name()]) == 7.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [9.0, 9.0, 7.0, 10.0, 10.0, 10.0, 11.0, 9.0, 8.0, 6.0]
    assert list(y_test_pred.round()) == expected_values
Example #5
def test_fit_ml_model_with_evaluation_sgd():
    """Tests fit_ml_model_with_evaluation, on sgd model"""
    res = generate_test_data_for_fitting()
    df = res["df"]
    model_formula_str = res["model_formula_str"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={"penalty": "none"})

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm="sgd",
        fit_algorithm_params={
            "penalty": "elasticnet",
            "alpha": 0.01,
            "l1_ratio": 0.2
        })

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert round(err[enum.get_metric_name()]) == 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert round(err[enum.get_metric_name()]) == 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
Example #6
def test_dummy():
    """Tests fit_ml_model_with_evaluation on a small dummy-encoded
    data frame and checks the fitted coefficients."""
    df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 3, 1], "c": ["a", "b", "a"]})
    df = pd.get_dummies(df)
    df["y"] = [1, 5, 4]
    model_formula_str = "y~a+b+c_a+c_b"
    trained_model = fit_ml_model_with_evaluation(
        df=df, model_formula_str=model_formula_str, training_fraction=1.0)
    expected_coefs = np.array([0., 1., 1., -1., 1.])
    obtained_coefs = np.array(trained_model["ml_model"].coef_).round()
    assert np.array_equal(expected_coefs, obtained_coefs)
Example #7
def test_fit_ml_model_with_evaluation_constant_column_sgd():
    """Tests fit_ml_model_with_evaluation using sgd with
    no penalty when some regressors are constant.
    With limited data, the models converge to slightly different predictions
    than the linear model."""
    res = generate_test_data_for_fitting(n=80)
    df = res["df"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    # add constant columns
    new_cols = []
    for i in range(300):
        col = f"cst{i}"
        df[col] = 0
        df_test[col] = 2
        new_cols.append(col)
    df["cst_event"] = "string"
    df_test["cst_event"] = "string"
    new_cols.append("cst_event")

    model_formula_str = "+".join([res["model_formula_str"]] + new_cols)

    fit_algorithm = "sgd"
    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm=fit_algorithm,
        fit_algorithm_params={
            "tol": 1e-5,
            "penalty": "none"
        })

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]
    expected_values = [-8.0, 2.0, 3.0, -2.0, 38.0, 0.0, 0.0]
    assert list(pd.Series(
        trained_model["ml_model"].coef_)[:7].round()) == expected_values

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        17.0, 18.0, 16.0, 13.0, 9.0, 2.0, 20.0, 12.0, 12.0, 13.0
    ]
    assert list(y_test_pred.round()) == expected_values
Example #8
def test_fit_ml_model_with_evaluation_constant_column():
    """Tests ``fit_ml_model_with_evaluation``
    when some regressors are constant"""
    res = generate_test_data_for_fitting(n=80)
    df = res["df"]
    y_test = res["y_test"]
    df_test = res["df_test"]
    y_col = "y"

    # add constant columns
    new_cols = []
    for i in range(300):
        col = f"cst{i}"
        df[col] = 0
        df_test[col] = 2
        new_cols.append(col)
    df["cst_event"] = "string"
    df_test["cst_event"] = "string"
    new_cols.append("cst_event")

    model_formula_str = "+".join([res["model_formula_str"]] + new_cols)

    fit_algorithm = "linear"
    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        fit_algorithm=fit_algorithm,
        normalize_method="min_max")

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    # intercept, x1, x2, x3, x4, [constant columns]
    expected_values = [-23.0, 1.0, 4.0, 0.0, 44.0, 0.0, 0.0]
    assert list(pd.Series(
        trained_model["ml_model"].coef_)[:7].round()) == expected_values

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 3.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    y_test_pred = predict_ml(fut_df=df_test[:10],
                             trained_model=trained_model)[y_col]
    expected_values = [
        18.0, 19.0, 16.0, 13.0, 9.0, 1.0, 23.0, 14.0, 12.0, 14.0
    ]
    assert list(y_test_pred.round()) == expected_values
Example #9
def test_fit_ml_model_with_evaluation_nan():
    """Tests if NaNs are dropped before fitting."""
    df = pd.DataFrame({
        "a": [1, 2, 3, 2],
        "b": [1, 3, 1, 2],
        "c": ["a", "b", "a", "b"]
    })
    df = pd.get_dummies(df)
    df["y"] = [1, 5, np.nan, 3]
    model_formula_str = "y~a+b+c_a+c_b"
    with pytest.raises(
            ValueError,
            match="Model training requires at least 3 observations"):
        fit_ml_model_with_evaluation(df=df.head(3),
                                     model_formula_str=model_formula_str,
                                     training_fraction=1.0)

    with pytest.warns(UserWarning) as record:
        trained_model = fit_ml_model_with_evaluation(
            df=df, model_formula_str=model_formula_str, training_fraction=1.0)
        assert "The data frame included 1 row(s) with NAs which were removed for model fitting."\
               in record[0].message.args[0]
        assert_equal(trained_model["y"], df["y"].loc[(0, 1, 3), ])
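
The warning text and the expected index (0, 1, 3) of trained_model["y"] indicate that rows containing NAs are dropped before fitting. The retained rows match a plain dropna on the frame built above:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 2], "b": [1, 3, 1, 2], "y": [1, 5, np.nan, 3]})
print(df.dropna().index.tolist())  # [0, 1, 3] -- the rows kept for model fitting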
Example #10
def test_fit_ml_model_with_evaluation_random_forest():
    """Tests fit_ml_model_with_evaluation, on random forest model"""
    data = generate_test_data_for_fitting()
    df = data["df"]
    model_formula_str = data["model_formula_str"]
    y_test = data["y_test"]
    df_test = data["df_test"]
    y_col = "y"

    trained_model = fit_ml_model_with_evaluation(
        df=df, model_formula_str=model_formula_str, fit_algorithm="rf")

    y_test_pred = predict_ml(fut_df=df_test,
                             trained_model=trained_model)[y_col]

    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 4.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 4.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5
Example #11
def test_fit_ml_model_with_evaluation_with_uncertainty():
    """Tests fit_ml_model_with_evaluation with uncertainty intervals"""
    df = gen_sliced_df(
        sample_size_dict={"a": 200, "b": 340, "c": 300, "d": 8, "e": 800},
        seed_dict={"a": 301, "b": 167, "c": 593, "d": 893, "e": 191, "z": 397},
        err_magnitude_coef=8.0)

    df = df[["x", "z_categ", "y_hat"]]
    df.rename(columns={"y_hat": "y"}, inplace=True)
    model_formula_str = "y~x+z_categ"
    y_col = "y"
    # test_df
    fut_df = df.copy()
    # we change the name of the column of true values in fut_df
    # to be able to keep track of true values later
    fut_df.rename(columns={"y": "y_true"}, inplace=True)
    y_test = fut_df["y_true"]
    # create a small dataframe for testing values only
    small_sample_index = [1, 500, 750, 1000]

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        uncertainty_dict={
            "uncertainty_method": "simple_conditional_residuals",
            "params": {
                "quantiles": [0.025, 0.975],
                "quantile_estimation_method": "normal_fit",
                "sample_size_thresh": 10,
                "small_sample_size_method": "std_quantiles",
                "small_sample_size_quantile": 0.8
            }
        })

    y_test_pred = predict_ml(fut_df=fut_df, trained_model=trained_model)[y_col]
    y_test_pred_small = y_test_pred[small_sample_index]

    # testing predictions
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    assert list(y_test_pred_small.round(1)) == [99.7, 201.5, 303.5, 7.3], (
        "predictions are not correct")

    # testing uncertainty
    # assign the predicted y to the response in fut_df
    fut_df["y"] = y_test_pred
    new_df_with_uncertainty = predict_ci(fut_df,
                                         trained_model["uncertainty_model"])
    assert list(new_df_with_uncertainty.columns) == [
        "y_quantile_summary", ERR_STD_COL
    ], ("column names are not as expected")
    fut_df["y_quantile_summary"] = new_df_with_uncertainty[
        "y_quantile_summary"]

    # calculate coverage of the CI
    fut_df["inside_95_ci"] = fut_df.apply(lambda row: (
        (row["y_true"] <= row["y_quantile_summary"][1]) and
        (row["y_true"] >= row["y_quantile_summary"][0])),
                                          axis=1)

    ci_coverage = 100.0 * fut_df["inside_95_ci"].mean()
    assert ci_coverage > 94.0 and ci_coverage < 96.0, (
        "95 percent CI coverage is not between 94 and 96")

    # testing uncertainty_method not being implemented but passed
    with pytest.raises(
            Exception,
            match="uncertainty method: non_existing_method is not implemented"):
        fit_ml_model_with_evaluation(
            df=df,
            model_formula_str=model_formula_str,
            uncertainty_dict={
                "uncertainty_method": "non_existing_method",
                "params": {
                    "quantiles": [0.025, 0.975],
                    "quantile_estimation_method": "normal_fit",
                    "sample_size_thresh": 10,
                    "small_sample_size_method": "std_quantiles",
                    "small_sample_size_quantile": 0.8
                }
            })
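
For reference, the coverage check above reads each y_quantile_summary entry as a (lower, upper) pair. The same computation as a small standalone helper; the column names follow the test above, and this sketch is not part of the library API:

import pandas as pd

def ci_coverage_pct(df: pd.DataFrame) -> float:
    """Percent of rows whose true value lies inside the predicted interval.

    Expects a column "y_true" with observed values and a column
    "y_quantile_summary" holding (lower, upper) pairs per row.
    """
    inside = df.apply(
        lambda row: (row["y_quantile_summary"][0] <= row["y_true"]
                     <= row["y_quantile_summary"][1]),
        axis=1)
    return 100.0 * inside.mean()

# e.g. ci_coverage_pct(fut_df) reproduces the ci_coverage value computed above.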