import numpy as np
import pandas as pd
import pytest
import woodwork as ww
from unittest.mock import patch

from evalml.automl import AutoMLSearch
from evalml.objectives import FraudCost


def __init__(self, problem_type: str, self_pipelines=None, objective=None, **kwds):
    '''
    Parameters
    ----------
    problem_type : str
        One of 'binary', 'multiclass', or 'regression'.
    self_pipelines : list, optional
        User-defined pipelines; use define_pipline to generate them.
    objective : dict, str, or None, optional
        A dict of parameters used to build an evalml.objectives.FraudCost,
        an evalml objective name, or None (the default), which falls back
        to 'auto'. To override it, see
        https://evalml.alteryx.com/en/stable/user_guide/objectives.html
    '''
    self.problem_type = problem_type
    if isinstance(objective, dict):
        objective = FraudCost(
            retry_percentage=objective.get('retry_percentage', 0),
            interchange_fee=objective.get('interchange_fee', 0.04),
            fraud_payout_percentage=objective.get('loss_percentage', 0.9),
            amount_col=objective['amount_col'])
    elif objective is None:
        objective = 'auto'
    self.auto_ml = AutoMLSearch(
        problem_type=problem_type,
        allowed_pipelines=self_pipelines,
        objective=objective,
        additional_objectives=['auc', 'f1', 'precision'],
        **kwds)
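# A minimal usage sketch of the wrapper above, assuming the enclosing class is
# named FraudAutoML (the class name is not shown in this excerpt) and that
# training data is forwarded to AutoMLSearch through **kwds:
#
#     searcher = FraudAutoML('binary',
#                            objective={'amount_col': 'amount',
#                                       'retry_percentage': 0.1},
#                            X_train=X, y_train=y)
#     searcher.auto_ml.search()
#     best = searcher.auto_ml.best_pipeline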
def test_different_input_lengths():
    fraud_cost = FraudCost(amount_col="value")
    y_predicted = np.array([0, 0])
    y_true = np.array([1])
    with pytest.raises(ValueError, match="Inputs have mismatched dimensions"):
        fraud_cost.score(y_true, y_predicted)

    y_true = np.array([0, 0])
    y_predicted = np.array([1, 2, 0])
    with pytest.raises(ValueError, match="Inputs have mismatched dimensions"):
        fraud_cost.score(y_true, y_predicted)
def test_input_contains_inf(capsys):
    fraud_cost = FraudCost(amount_col="value")
    y_predicted = np.array([np.inf, 0, 0])
    y_true = np.array([1, 0, 0])
    with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"):
        fraud_cost.score(y_true, y_predicted)

    y_true = np.array([np.inf, 0, 0])
    y_predicted = np.array([1, 0, 0])
    with pytest.raises(ValueError, match="y_true contains NaN or infinity"):
        fraud_cost.score(y_true, y_predicted)
def test_input_contains_nan(X_y_binary):
    fraud_cost = FraudCost(amount_col="value")
    y_predicted = np.array([np.nan, 0, 0])
    y_true = np.array([1, 2, 1])
    with pytest.raises(ValueError, match="y_predicted contains NaN or infinity"):
        fraud_cost.score(y_true, y_predicted)

    y_true = np.array([np.nan, 0, 0])
    y_predicted = np.array([1, 2, 0])
    with pytest.raises(ValueError, match="y_true contains NaN or infinity"):
        fraud_cost.score(y_true, y_predicted)
def test_additional_objectives(X_y_binary):
    X, y = X_y_binary
    objective = FraudCost(retry_percentage=.5,
                          interchange_fee=.02,
                          fraud_payout_percentage=.75,
                          amount_col=10)
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='F1', max_iterations=2,
                          additional_objectives=[objective], n_jobs=1)
    automl.search()

    results = automl.describe_pipeline(0, return_dict=True)
    assert 'Fraud Cost' in list(results["cv_data"][0]["all_objective_scores"].keys())
def test_fraud_objective_score_list(X_y_binary):
    X, y = X_y_binary
    fraud_cost = FraudCost(amount_col="value")

    y_predicted = [.1, .5, .5]
    y_true = [True, False, True]
    extra_columns = pd.DataFrame({"value": [100, 5, 250]})

    out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
    assert isinstance(out, pd.Series)
    pd.testing.assert_series_equal(out, pd.Series(y_true), check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.0
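# The expected labels above follow FraudCost's decision rule: a transaction is
# flagged as fraud when predicted probability * transaction amount exceeds the
# threshold. With threshold 5 here:
#   .1 * 100 = 10.0  -> True
#   .5 *   5 =  2.5  -> False
#   .5 * 250 = 125.0 -> True
# which is exactly the [True, False, True] the test asserts.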
def test_binary_more_than_two_unique_values():
    fraud_cost = FraudCost(amount_col="value")
    y_predicted = np.array([0, 1, 2])
    y_true = np.array([1, 0, 1])
    with pytest.raises(ValueError,
                       match="y_predicted contains more than two unique values"):
        fraud_cost.score(y_true, y_predicted)

    y_true = np.array([0, 1, 2])
    y_predicted = np.array([1, 0, 1])
    with pytest.raises(ValueError,
                       match="y_true contains more than two unique values"):
        fraud_cost.score(y_true, y_predicted)
# NOTE: the patch target below is an assumption (the decorator itself was not
# shown); something must inject the `mock_decision_function` argument.
@patch('evalml.objectives.BinaryClassificationObjective.decision_function')
def test_binary_predict_pipeline_use_objective(
        mock_decision_function, X_y_binary,
        logistic_regression_binary_pipeline_class):
    X, y = X_y_binary
    binary_pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
    mock_decision_function.return_value = pd.Series([0] * 100)
    binary_pipeline.threshold = 0.7
    binary_pipeline.fit(X, y)
    fraud_cost = FraudCost(amount_col=0)
    binary_pipeline.score(X, y, ['precision', 'auc', fraud_cost])
    mock_decision_function.assert_called()
def test_fraud_objective_score(X_y_binary):
    X, y = X_y_binary
    fraud_cost = FraudCost(amount_col="value")

    y_predicted = pd.Series([.1, .5, .5])
    y_true = pd.Series([True, False, True])
    extra_columns = pd.DataFrame({"value": [100, 5, 250]})

    out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
    assert isinstance(out, pd.Series)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.0

    out = fraud_cost.decision_function(y_predicted.to_numpy(), 5, extra_columns)
    assert isinstance(out, pd.Series)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.0

    out = ww.DataColumn(fraud_cost.decision_function(y_predicted, 5, extra_columns))
    pd.testing.assert_series_equal(out.to_series(), y_true,
                                   check_dtype=False, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.0

    # testing with other types of inputs
    y_predicted = np.array([.1, .5, .5])
    extra_columns = pd.DataFrame({"value": [100, 5, 250]})
    out = fraud_cost.decision_function(y_predicted, 5, extra_columns)
    pd.testing.assert_series_equal(out, y_true, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.0

    y_predicted = pd.Series([.2, .01, .01])
    extra_columns = pd.DataFrame({"value": [100, 50, 50]})
    y_true = pd.Series([False, False, True])
    expected_y_pred = pd.Series([True, False, False])
    out = fraud_cost.decision_function(y_predicted, 10, extra_columns)
    pd.testing.assert_series_equal(out, expected_y_pred, check_names=False)
    score = fraud_cost.score(y_true, out, extra_columns)
    assert score == 0.255
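# A sketch of where the final 0.255 comes from, assuming FraudCost's documented
# defaults (retry_percentage=0.5, interchange_fee=0.02, fraud_payout_percentage=1.0):
#   false negative (row 2: a $50 fraud missed)     -> 50 * 1.0              = 50.0
#   false positive (row 0: $100 wrongly declined)  -> 100 * (1 - 0.5) * 0.02 =  1.0
#   loss per dollar processed = (50.0 + 1.0) / (100 + 50 + 50) = 51 / 200   = 0.255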
def test_fraud_objective_function_amount_col(X_y_binary):
    X, y = X_y_binary
    objective = FraudCost(retry_percentage=.5,
                          interchange_fee=.02,
                          fraud_payout_percentage=.75,
                          amount_col="this column does not exist")
    y_predicted = pd.Series([.1, .5, .5])
    y_true = [True, False, True]
    with pytest.raises(ValueError,
                       match="`this column does not exist` is not a valid column in X."):
        objective.objective_function(y_true, y_predicted, X)
    with pytest.raises(ValueError,
                       match="`this column does not exist` is not a valid column in X."):
        objective.objective_function(y_true, y_predicted, X.tolist())
def test_fraud_objective(X_y_binary):
    X, y = X_y_binary
    objective = FraudCost(retry_percentage=.5,
                          interchange_fee=.02,
                          fraud_payout_percentage=.75,
                          amount_col=10)
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=objective, max_iterations=1)
    automl.search()

    pipeline = automl.best_pipeline
    pipeline.fit(X, y)
    pipeline.predict(X, objective)
    pipeline.predict_proba(X)
    pipeline.score(X, y, [objective])
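# Passing the objective to predict() lets a binary pipeline derive labels via
# the objective's decision_function rather than a plain 0.5 probability cutoff,
# so the fraud/amount trade-off shapes the final predictions. (Exact behavior
# depends on the evalml version and whether the pipeline's threshold is set.)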
# NOTE: the patch target below is an assumption (the decorator itself was not
# shown); the function is also renamed so it no longer shadows the earlier
# test_binary_predict_pipeline_use_objective.
@patch('evalml.objectives.BinaryClassificationObjective.decision_function')
def test_time_series_binary_predict_pipeline_use_objective(
        mock_decision_function, X_y_binary,
        time_series_binary_classification_pipeline_class):
    X, y = X_y_binary
    binary_pipeline = time_series_binary_classification_pipeline_class(
        parameters={
            "Logistic Regression Classifier": {"n_jobs": 1},
            "pipeline": {"gap": 0, "max_delay": 0, "date_index": None},
        })
    mock_decision_function.return_value = pd.Series([0] * 98)
    binary_pipeline.threshold = 0.7
    binary_pipeline.fit(X, y)
    fraud_cost = FraudCost(amount_col=0)
    binary_pipeline.score(X, y, ['precision', 'auc', fraud_cost])
    mock_decision_function.assert_called()
def test_zero_input_lengths():
    fraud_cost = FraudCost(amount_col="value")
    y_predicted = np.array([])
    y_true = np.array([])
    with pytest.raises(ValueError, match="Length of inputs is 0"):
        fraud_cost.score(y_true, y_predicted)