Exemple #1
0
 def __init__(self,
              problem_type: str,
              self_pipelines=None,
              objective=None,
              **kwds):
     '''
     Build an evalml ``AutoMLSearch`` and keep it on ``self.auto_ml``.

     Parameters
     --------
     problem_type: problem type string forwarded to ``AutoMLSearch``
         (e.g. binary, multiclass, regression)
     self_pipelines: custom pipelines, forwarded as ``allowed_pipelines``;
         ``None`` leaves the choice to ``AutoMLSearch``
     objective: one of
         * ``None`` (default) -> the string ``'auto'`` is used;
         * a ``dict`` -> a ``FraudCost`` objective is built from it. Keys read:
           ``retry_percentage`` (default 0), ``interchange_fee`` (default 0.04),
           ``loss_percentage`` (default 0.9, passed on as
           ``fraud_payout_percentage``) and ``amount_col`` (required);
         * anything else -> passed to ``AutoMLSearch`` unchanged. See
           https://evalml.alteryx.com/en/stable/user_guide/objectives.html
     **kwds: extra keyword arguments forwarded to ``AutoMLSearch``. When
         ``additional_objectives`` is not given, it defaults to
         ``['auc', 'f1', 'precision']``.

     Raises
     --------
     KeyError: if ``objective`` is a dict without an ``amount_col`` entry.
     '''
     self.problem_type = problem_type
     if isinstance(objective, dict):
         objective = FraudCost(
             retry_percentage=objective.get('retry_percentage', 0),
             interchange_fee=objective.get('interchange_fee', 0.04),
             fraud_payout_percentage=objective.get('loss_percentage', 0.9),
             amount_col=objective['amount_col'])
     elif objective is None:
         objective = 'auto'

     # Supply the default extra metrics only when the caller did not pick
     # their own: passing ``additional_objectives`` both explicitly and via
     # **kwds would raise "got multiple values for keyword argument".
     kwds.setdefault('additional_objectives', ['auc', 'f1', 'precision'])

     self.auto_ml = AutoMLSearch(
         problem_type=problem_type,
         allowed_pipelines=self_pipelines,
         objective=objective,
         **kwds)
def test_different_input_lengths():
    """``FraudCost.score`` raises ValueError when the inputs differ in length."""
    scorer = FraudCost(amount_col="value")
    mismatch_msg = "Inputs have mismatched dimensions"

    # More predictions than true labels.
    preds = np.array([0, 0])
    truth = np.array([1])
    with pytest.raises(ValueError, match=mismatch_msg):
        scorer.score(truth, preds)

    # Fewer true labels than predictions, with different values.
    truth = np.array([0, 0])
    preds = np.array([1, 2, 0])
    with pytest.raises(ValueError, match=mismatch_msg):
        scorer.score(truth, preds)
def test_input_contains_inf(capsys):
    """``FraudCost.score`` raises ValueError when either input holds infinity.

    NOTE(review): the ``capsys`` fixture is requested but not used in the body.
    """
    scorer = FraudCost(amount_col="value")

    # Infinity among the predictions.
    preds = np.array([np.inf, 0, 0])
    truth = np.array([1, 0, 0])
    predicted_msg = "y_predicted contains NaN or infinity"
    with pytest.raises(ValueError, match=predicted_msg):
        scorer.score(truth, preds)

    # Infinity among the true labels.
    truth = np.array([np.inf, 0, 0])
    preds = np.array([1, 0, 0])
    true_msg = "y_true contains NaN or infinity"
    with pytest.raises(ValueError, match=true_msg):
        scorer.score(truth, preds)
def test_input_contains_nan(X_y_binary):
    """``FraudCost.score`` raises ValueError when either input holds NaN.

    NOTE(review): the ``X_y_binary`` fixture is requested but not used.
    """
    scorer = FraudCost(amount_col="value")

    # NaN among the predictions.
    preds = np.array([np.nan, 0, 0])
    truth = np.array([1, 2, 1])
    predicted_msg = "y_predicted contains NaN or infinity"
    with pytest.raises(ValueError, match=predicted_msg):
        scorer.score(truth, preds)

    # NaN among the true labels.
    truth = np.array([np.nan, 0, 0])
    preds = np.array([1, 2, 0])
    true_msg = "y_true contains NaN or infinity"
    with pytest.raises(ValueError, match=true_msg):
        scorer.score(truth, preds)
def test_additional_objectives(X_y_binary):
    """A ``FraudCost`` passed as an additional objective shows up in CV scores."""
    features, target = X_y_binary

    fraud_objective = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10,
    )
    search = AutoMLSearch(
        X_train=features,
        y_train=target,
        problem_type='binary',
        objective='F1',
        max_iterations=2,
        additional_objectives=[fraud_objective],
        n_jobs=1,
    )
    search.search()

    # Inspect the objective scores recorded for the first CV fold of pipeline 0.
    description = search.describe_pipeline(0, return_dict=True)
    first_fold_scores = description["cv_data"][0]["all_objective_scores"]
    assert 'Fraud Cost' in first_fold_scores.keys()
def test_fraud_objective_score_list(X_y_binary):
    """``decision_function`` and ``score`` accept plain Python lists."""
    features, target = X_y_binary  # unpacked as in the original; not used below
    scorer = FraudCost(amount_col="value")

    probabilities = [.1, .5, .5]
    labels = [True, False, True]
    amounts = pd.DataFrame({"value": [100, 5, 250]})

    # The result must be a Series equal to the expected labels.
    decisions = scorer.decision_function(probabilities, 5, amounts)
    assert isinstance(decisions, pd.Series)
    expected = pd.Series(labels)
    pd.testing.assert_series_equal(decisions, expected, check_names=False)

    # Decisions that match the labels exactly score 0.0.
    assert scorer.score(labels, decisions, amounts) == 0.0
def test_binary_more_than_two_unique_values():
    """``FraudCost.score`` raises ValueError when an input has three classes."""
    scorer = FraudCost(amount_col="value")
    three_classes = [0, 1, 2]

    # Three distinct values among the predictions.
    preds = np.array(three_classes)
    truth = np.array([1, 0, 1])
    predicted_msg = "y_predicted contains more than two unique values"
    with pytest.raises(ValueError, match=predicted_msg):
        scorer.score(truth, preds)

    # Three distinct values among the true labels.
    truth = np.array(three_classes)
    preds = np.array([1, 0, 1])
    true_msg = "y_true contains more than two unique values"
    with pytest.raises(ValueError, match=true_msg):
        scorer.score(truth, preds)
def test_binary_predict_pipeline_use_objective(
        mock_decision_function, X_y_binary,
        logistic_regression_binary_pipeline_class):
    """Scoring a binary pipeline with a ``FraudCost`` objective calls the mock.

    ``mock_decision_function`` is presumably injected by a ``patch`` decorator
    that is not visible here -- TODO confirm.

    NOTE(review): a second function with this exact name is defined later in
    this file; if both live in one module the later definition shadows this
    one and this test is never collected.
    """
    features, target = X_y_binary
    pipeline_params = {"Logistic Regression Classifier": {"n_jobs": 1}}
    pipeline = logistic_regression_binary_pipeline_class(
        parameters=pipeline_params)
    mock_decision_function.return_value = pd.Series([0] * 100)

    pipeline.threshold = 0.7
    pipeline.fit(features, target)

    # Mix built-in objective names with a FraudCost instance.
    objectives = ['precision', 'auc', FraudCost(amount_col=0)]
    pipeline.score(features, target, objectives)
    mock_decision_function.assert_called()
def test_fraud_objective_score(X_y_binary):
    """Check ``FraudCost.decision_function`` and ``score`` across input types.

    Covers pandas Series, numpy arrays and a woodwork ``DataColumn`` as inputs.
    In each of the first four cases the decisions match ``y_true``, so the
    score is 0.0. The last case has mismatched decisions and a non-zero score.

    The ``X_y_binary`` fixture is requested by the signature, but its data is
    not needed: every input is built inline.
    """
    fraud_cost = FraudCost(amount_col="value")

    y_predicted = pd.Series([.1, .5, .5])
    y_true = pd.Series([True, False, True])
    extra_columns = pd.DataFrame({"value": [100, 5, 250]})

    # pandas Series of predicted values.
    out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
    assert isinstance(out, pd.Series)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert (score == 0.0)

    # numpy array obtained from the same Series.
    out = fraud_cost.decision_function(y_predicted.to_numpy(), 5,
                                       extra_columns)
    assert isinstance(out, pd.Series)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert (score == 0.0)

    # Decisions wrapped in a woodwork DataColumn are accepted by ``score``.
    out = ww.DataColumn(
        fraud_cost.decision_function(y_predicted, 5, extra_columns))
    pd.testing.assert_series_equal(out.to_series(),
                                   y_true,
                                   check_dtype=False,
                                   check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert (score == 0.0)

    # testing with other types of inputs
    y_predicted = np.array([.1, .5, .5])
    extra_columns = pd.DataFrame({"value": [100, 5, 250]})
    out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert (score == 0.0)

    # Decisions that disagree with the labels give a non-zero cost.
    y_predicted = pd.Series([.2, .01, .01])
    extra_columns = pd.DataFrame({"value": [100, 50, 50]})
    y_true = pd.Series([False, False, True])
    expected_y_pred = pd.Series([True, False, False])
    out = fraud_cost.decision_function(y_predicted, 10, extra_columns)
    pd.testing.assert_series_equal(out, expected_y_pred, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    # The score comes out of floating-point arithmetic, so compare with a
    # tolerance instead of exact equality to avoid rounding-order flakiness.
    assert score == pytest.approx(0.255)
def test_fraud_objective_function_amount_col(X_y_binary):
    """``objective_function`` raises ValueError for an unknown ``amount_col``.

    Checked twice: with ``X`` as provided by the fixture and with
    ``X.tolist()``.
    """
    features, target = X_y_binary  # target is not used below

    fraud_objective = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col="this column does not exist",
    )
    probabilities = pd.Series([.1, .5, .5])
    labels = [True, False, True]
    missing_col_msg = "`this column does not exist` is not a valid column in X."

    with pytest.raises(ValueError, match=missing_col_msg):
        fraud_objective.objective_function(labels, probabilities, features)

    # Same check with the features converted to a nested list.
    with pytest.raises(ValueError, match=missing_col_msg):
        fraud_objective.objective_function(labels, probabilities,
                                           features.tolist())
def test_fraud_objective(X_y_binary):
    """Smoke test: ``FraudCost`` works as the primary ``AutoMLSearch`` objective.

    Runs a one-iteration search, then fits, predicts and scores with the best
    pipeline. It makes no assertions and only checks that nothing raises.
    """
    features, target = X_y_binary

    fraud_objective = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10,
    )

    search = AutoMLSearch(
        X_train=features,
        y_train=target,
        problem_type='binary',
        objective=fraud_objective,
        max_iterations=1,
    )
    search.search()

    best = search.best_pipeline
    best.fit(features, target)
    best.predict(features, fraud_objective)
    best.predict_proba(features)
    best.score(features, target, [fraud_objective])
def test_binary_predict_pipeline_use_objective(
        mock_decision_function, X_y_binary,
        time_series_binary_classification_pipeline_class):
    """Scoring a time-series binary pipeline with ``FraudCost`` calls the mock.

    ``mock_decision_function`` is presumably injected by a ``patch`` decorator
    that is not visible here -- TODO confirm.

    NOTE(review): an earlier function in this file has the same name; if both
    live in one module this definition replaces the earlier one.
    """
    features, target = X_y_binary
    pipeline_params = {
        "Logistic Regression Classifier": {"n_jobs": 1},
        "pipeline": {"gap": 0, "max_delay": 0, "date_index": None},
    }
    pipeline = time_series_binary_classification_pipeline_class(
        parameters=pipeline_params)
    mock_decision_function.return_value = pd.Series([0] * 98)

    pipeline.threshold = 0.7
    pipeline.fit(features, target)

    # Mix built-in objective names with a FraudCost instance.
    objectives = ['precision', 'auc', FraudCost(amount_col=0)]
    pipeline.score(features, target, objectives)
    mock_decision_function.assert_called()
def test_zero_input_lengths():
    """``FraudCost.score`` raises ValueError when both inputs are empty."""
    scorer = FraudCost(amount_col="value")
    empty_preds = np.array([])
    empty_truth = np.array([])
    zero_length_msg = "Length of inputs is 0"
    with pytest.raises(ValueError, match=zero_length_msg):
        scorer.score(empty_truth, empty_preds)