def param_train_fn(space, train_set):
    return xgb_classification_learner(features=["x"],
                                      target="target",
                                      learning_rate=space["learning_rate"],
                                      num_estimators=space["num_estimators"])(train_set)
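# Illustrative sketch (an assumption, not part of the original test suite): shows how
# param_train_fn could be invoked with a toy hyperparameter space and a minimal
# DataFrame carrying the "x" and "target" columns it hard-codes. The helper name and
# toy data below are hypothetical.
def _example_param_train_fn_usage():
    space = {"learning_rate": 0.1, "num_estimators": 10}
    toy_train = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "target": [0, 1, 0, 1]})
    # param_train_fn returns the usual fklearn learner 3-tuple.
    predict_fn, scored_train, train_log = param_train_fn(space, toy_train)
    return predict_fn(toy_train)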
def test_xgb_classification_learner():
    df_train_binary = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0],
        "w": [2, 1, 2, 0.5],
        'y': [0, 1, 0, 1]
    })

    df_test_binary = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1],
        "w": [1, 2, 0, 0.5],
        'y': [1, 0, 0, 1]
    })

    df_train_multinomial = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, 10.0, 13.0],
        "x2": [0, 1, 1, 0, 1, 0],
        "w": [2, 1, 2, 0.5, 2, 0.5],
        'y': [0, 1, 2, 1, 2, 0]
    })

    df_test_multinomial = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
        "x2": [1, 1, 0, 1, 0, 1],
        "w": [1, 2, 0, 0.5, 0, 0.5],
        'y': [1, 2, 0, 1, 2, 0]
    })

    features = ["x1", "x2"]

    learner_binary = xgb_classification_learner(features=features,
                                                target="y",
                                                learning_rate=0.1,
                                                num_estimators=20,
                                                extra_params={"max_depth": 4, "seed": 42},
                                                prediction_column="prediction",
                                                weight_column="w")

    predict_fn_binary, pred_train_binary, log = learner_binary(df_train_binary)

    pred_test_binary = predict_fn_binary(df_test_binary)

    expected_col_train = df_train_binary.columns.tolist() + ["prediction"]
    expected_col_test = df_test_binary.columns.tolist() + ["prediction"]

    assert Counter(expected_col_train) == Counter(pred_train_binary.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test_binary.columns.tolist())
    assert pred_test_binary.prediction.max() < 1
    assert pred_test_binary.prediction.min() > 0
    assert (pred_test_binary.columns == pred_train_binary.columns).all()

    # SHAP test (binary only)
    pred_shap = predict_fn_binary(df_test_binary, apply_shap=True)
    assert "shap_values" in pred_shap.columns
    assert "shap_expected_value" in pred_shap.columns
    assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)

    # test multinomial case
    learner_multinomial = xgb_classification_learner(features=features,
                                                     target="y",
                                                     learning_rate=0.1,
                                                     num_estimators=20,
                                                     extra_params={"max_depth": 2,
                                                                   "seed": 42,
                                                                   "objective": 'multi:softprob',
                                                                   "num_class": 3},
                                                     prediction_column="prediction")

    predict_fn_multinomial, pred_train_multinomial, log = learner_multinomial(df_train_multinomial)

    pred_test_multinomial = predict_fn_multinomial(df_test_multinomial)

    expected_col_train = df_train_binary.columns.tolist() + ["prediction_0", "prediction_1",
                                                             "prediction_2", "prediction"]
    expected_col_test = df_test_binary.columns.tolist() + ["prediction_0", "prediction_1",
                                                           "prediction_2", "prediction"]

    assert Counter(expected_col_train) == Counter(pred_train_multinomial.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test_multinomial.columns.tolist())
    assert (pred_test_multinomial.columns == pred_train_multinomial.columns).all()
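# Hedged sketch (an assumption, not part of the original tests): with the
# 'multi:softprob' objective the multinomial learner emits one "prediction_<class>"
# column per class, as asserted above. If those columns hold class probabilities, the
# most likely class can be recovered with a plain pandas argmax as below. The helper
# name is hypothetical and not part of fklearn.
def _most_likely_class(scored_df, n_classes=3, prefix="prediction_"):
    class_cols = [prefix + str(i) for i in range(n_classes)]
    # idxmax(axis=1) returns, per row, the name of the column with the largest value;
    # stripping the prefix recovers the integer class label.
    return scored_df[class_cols].idxmax(axis=1).str[len(prefix):].astype(int)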
def xgb_octopus_classification_learner(train_set: pd.DataFrame,
                                       learning_rate_by_bin: Dict[T, float],
                                       num_estimators_by_bin: Dict[T, int],
                                       extra_params_by_bin: Dict[T, Dict[str, Any]],
                                       features_by_bin: Dict[T, List[str]],
                                       train_split_col: str,
                                       train_split_bins: List,
                                       nthread: int,
                                       target_column: str,
                                       prediction_column: str = "prediction") -> LearnerReturnType:
    """
    The Octopus ensemble allows you to inject domain-specific knowledge to force a split
    on an initial feature, instead of assuming the tree model will make that intelligent
    split on its own. It works by first defining a split on your dataset and then training
    one individual model on each resulting subset.

    Parameters
    ----------

    train_set: pd.DataFrame
        A Pandas' DataFrame with features, target columns and a splitting column that
        must be categorical.

    learning_rate_by_bin: dict
        A dictionary with the learning rate of the XGBoost model to use in each model
        split.
        Ex: if you want to split your training by tenure and you have a tenure column
        with integer values [1,2,3,...,12], you have to specify a learning rate for each
        split::

            {
                1: 0.08,
                2: 0.08,
                ...
                12: 0.1
            }

    num_estimators_by_bin: dict
        A dictionary with the number of tree estimators of the XGBoost model to use in
        each model split.
        Ex: if you want to split your training by tenure and you have a tenure column
        with integer values [1,2,3,...,12], you have to specify the number of estimators
        for each split::

            {
                1: 300,
                2: 250,
                ...
                12: 300
            }

    extra_params_by_bin: dict
        A dictionary of extra-parameter dictionaries of the XGBoost model to use in each
        model split.
        Ex: if you want to split your training by tenure and you have a tenure column
        with integer values [1,2,3,...,12], you have to specify the extra parameters for
        each split::

            {
                1: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.4,
                    ...
                    'colsample_bylevel': 0.8
                    },
                2: {
                    'reg_alpha': 0.1,
                    'colsample_bytree': 0.6,
                    ...
                    'colsample_bylevel': 0.4
                    },
                ...
                12: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.7,
                    ...
                    'colsample_bylevel': 1.0
                    }
            }

    features_by_bin: dict
        A dictionary with the features to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column
        with integer values [1,2,3,...,12], you have to specify the list of features for
        each split::

            {
                1: [feature-1, feature-2, feature-3, ...],
                2: [feature-1, feature-3, feature-5, ...],
                ...
                12: [feature-2, feature-4, feature-8, ...]
            }

    train_split_col: str
        The name of the categorical column on which the model will make the splits.
        Ex: if you want to split your training by tenure, you can have a categorical
        column called "tenure".

    train_split_bins: list
        A list with the actual values of the categories from the `train_split_col`.
        Ex: if you want to split your training by tenure and you have a tenure column
        with integer values [1,2,3,...,12], you can pass this list and your training will
        be split into 12 different models.

    nthread: int
        Number of threads for the XGBoost learners.

    target_column: str
        The name of the target column.

    prediction_column: str
        The name of the column with the predictions from the model.
""" train_fns = { b: xgb_classification_learner( features=features_by_bin[b], learning_rate=learning_rate_by_bin[b], num_estimators=num_estimators_by_bin[b], target=target_column, extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread), prediction_column=prediction_column + "_bin_" + str(b)) for b in train_split_bins } train_sets = { b: train_set[train_set[train_split_col] == b] for b in train_split_bins } train_results = {b: train_fns[b](train_sets[b]) for b in train_split_bins} # train_results is a 3-tuple (prediction functions, predicted train dataset, train logs) pred_fns = {b: train_results[b][0] for b in train_split_bins} train_logs = {b: train_results[b][2] for b in train_split_bins} def p(df: pd.DataFrame) -> pd.DataFrame: pred_fn = compose(*pred_fns.values()) return (pred_fn(df).assign( pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str)).assign( prediction=lambda d: d.lookup( d.index.values, d.pred_bin.values.squeeze())).rename( index=str, columns={ "prediction": prediction_column }).drop("pred_bin", axis=1)) p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner") log = { 'xgb_octopus_classification_learner': { 'features': features_by_bin, 'target': target_column, 'prediction_column': prediction_column, 'package': "xgboost", 'train_logs': train_logs, 'parameters': extra_params_by_bin, 'training_samples': len(train_set) } } return p, p(train_set), log