Example #1
    model_final_fitted = final_model.fit(X=df[final_model_x],
                                         y=final_target,
                                         sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{prediction_column: model_final_fitted.predict(new_df[final_model_x].values)})

    p.__doc__ = learner_pred_fn_docstring("non_parametric_double_ml_learner")

    log = {'non_parametric_double_ml_learner': {
        'features': feature_columns,
        'debias_feature_columns': debias_feature_columns,
        'denoise_feature_columns': denoise_feature_columns,
        'final_model_feature_columns': final_model_feature_columns,
        'outcome_column': outcome_column,
        'treatment_column': treatment_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        'feature_importance': dict(zip(features, model_final_fitted.feature_importances_)),
        'training_samples': len(df)},
        'debias_models': mts,
        'denoise_models': mys,
        'cv_splits': cv_splits,
        'object': model_final_fitted}

    return p, p(df), log


non_parametric_double_ml_learner.__doc__ += learner_return_docstring("Non Parametric Double/ML")
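
Every snippet on this page ends with the same `return p, p(df), log` convention: a reusable prediction function `p`, the training frame already scored by it, and a nested log dictionary. A minimal, self-contained sketch of that convention (the learner below is a toy stand-in, not part of the library):

import pandas as pd

# Toy stand-in learner illustrating the (p, p(df), log) return convention used above.
def mean_learner(df: pd.DataFrame, target: str, prediction_column: str = "prediction"):
    fitted_mean = df[target].mean()  # "training" here is just remembering the mean

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{prediction_column: fitted_mean})

    log = {"mean_learner": {"target": target, "training_samples": len(df)}}
    return p, p(df), log

train = pd.DataFrame({"y": [1.0, 2.0, 3.0]})
predict_fn, scored_train, log = mean_learner(train, target="y")
scored_new = predict_fn(pd.DataFrame({"y": [10.0]}))  # reuse the fitted function on new data
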
Example #2
    log = {
        'selector': {
            'training_columns': training_columns,
            'predict_columns': predict_columns,
            'transformed_column': list(set(training_columns).union(predict_columns))
        }
    }

    return p, df[training_columns], log


selector.__doc__ += learner_return_docstring("Selector")


@curry
@log_learner_time(learner_name='capper')
def capper(df: pd.DataFrame,
           columns_to_cap: List[str],
           precomputed_caps: Dict[str, float] = None) -> LearnerReturnType:
    """
    Learns the maximum value for each of the `columns_to_cap`
    and uses that as the cap for those columns. If precomputed caps
    are passed, the function uses those as the cap values instead of
    computing the maximum.

    Parameters
    ----------
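
The excerpt stops at the parameter list, but the behavior described above is simple. A hedged usage sketch, assuming `capper` follows the same curried `(p, p(df), log)` return convention as the other learners on this page (data and column names are illustrative):

import pandas as pd

train = pd.DataFrame({"income": [1000, 2500, 9999]})
holdout = pd.DataFrame({"income": [50000]})

# Partially apply the curried learner to the columns, then fit it on the training frame.
cap_fn, capped_train, cap_log = capper(columns_to_cap=["income"])(train)

capped_holdout = cap_fn(holdout)  # values above the learned maximum (9999) get capped
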
Example #3
            'features': features,
            'target': target,
            'parameters': merged_params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, clf.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': clf
    }

    return p, p(df), log


logistic_classification_learner.__doc__ += learner_return_docstring(
    "Logistic Regression")


@curry
@log_learner_time(learner_name='xgb_classification_learner')
def xgb_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
Example #4
            'features': features,
            'target': target,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, regr.coef_.flatten())),
            'training_samples': len(df)
        },
        'object': regr
    }

    return p, p(df), log


linear_regression_learner.__doc__ += learner_return_docstring(
    "Linear Regression")


@curry
@log_learner_time(learner_name='xgb_regression_learner')
def xgb_regression_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: Dict[str, Any] = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
Example #5
    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {
            prediction_column: model.decision_function(new_df[features])
        }

        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {
        'isolation_forest_learner': {
            'features': features,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sklearn.__version__,
            'training_samples': len(df)
        }
    }

    return p, p(df), log


isolation_forest_learner.__doc__ += learner_return_docstring(
    "Isolation Forest")
Example #6
        # Chain the per-bin prediction functions so each one adds its own column.
        pred_fn = compose(*pred_fns.values())

        # Score with every per-bin model, then, for each row, pick the prediction
        # coming from the model that matches that row's split bin.
        return (pred_fn(df)
                .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))
                .assign(prediction=lambda d: d.lookup(d.index.values, d.pred_bin.values.squeeze()))
                .rename(index=str, columns={"prediction": prediction_column})
                .drop("pred_bin", axis=1))

    p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner")

    log = {
        'xgb_octopus_classification_learner': {
            'features': features_by_bin,
            'target': target_column,
            'prediction_column': prediction_column,
            'package': "xgboost",
            'train_logs': train_logs,
            'parameters': extra_params_by_bin,
            'training_samples': len(train_set)
        }
    }

    return p, p(train_set), log


xgb_octopus_classification_learner.__doc__ += learner_return_docstring(
    "Octopus XGB Classifier")
Example #7
    output_column : str
        The name of the column with the calibrated predictions from the model.

    """

    clf = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
    clf.fit(df[prediction_column], df[target_column])

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{output_column: clf.predict(new_df[prediction_column])})

    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    log = {
        'isotonic_calibration_learner': {
            'output_column': output_column,
            'target_column': target_column,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sklearn.__version__,
            'training_samples': len(df)
        }
    }

    return p, p(df), log


isotonic_calibration_learner.__doc__ += learner_return_docstring(
    "Isotonic Calibration")
Example #8
    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    log = {'isotonic_calibration_learner': {
        'output_column': output_column,
        'target_column': target_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)},
        'object': clf}

    return p, p(df), log


isotonic_calibration_learner.__doc__ += learner_return_docstring("Isotonic Calibration")


@curry
@log_learner_time(learner_name='find_thresholds_with_same_risk')
def find_thresholds_with_same_risk(df: pd.DataFrame,
                                   sensitive_factor: str,
                                   unfair_band_column: str,
                                   model_prediction_output: str,
                                   target_column: str = "target",
                                   output_column_name: str = "fair_band") -> LearnerReturnType:
    """
    Calculates a fair calibration in which, within each band, every sensitive-factor group has the same target mean.

    Parameters
    ----------
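
The docstring is cut off above, but the criterion it states can be illustrated directly: within each band, every sensitive-factor group should end up with the same target mean. A purely illustrative check of that criterion (not the function's actual algorithm, which is not shown here):

import pandas as pd

df = pd.DataFrame({
    "unfair_band": [1, 1, 1, 1, 2, 2, 2, 2],
    "sensitive_factor": ["a", "a", "b", "b", "a", "a", "b", "b"],
    "target": [0, 1, 1, 1, 0, 0, 0, 1],
})

# Target mean per (band, sensitive group); a fair banding would equalize these within each band.
per_group_risk = df.groupby(["unfair_band", "sensitive_factor"])["target"].mean()
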
Example #9
        'linear_regression_learner': {
            'features': features,
            'target': target,
            'parameters': params,
            'prediction_column': prediction_column,
            'package': "sklearn",
            'package_version': sk_version,
            'feature_importance': dict(zip(features, regr.coef_.flatten())),
            'training_samples': len(df)
        }
    }

    return p, p(df), log


linear_regression_learner.__doc__ += learner_return_docstring(
    "Linear Regression")


@curry
@log_learner_time(learner_name='xgb_regression_learner')
def xgb_regression_learner(df: pd.DataFrame,
                           features: List[str],
                           target: str,
                           learning_rate: float = 0.1,
                           num_estimators: int = 100,
                           extra_params: Dict[str, Any] = None,
                           prediction_column: str = "prediction",
                           weight_column: str = None) -> LearnerReturnType:
    """
    Fits an XGBoost regressor to the dataset. It first generates a DMatrix
    with the specified features and labels from `df`. Then it fits an XGBoost
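
The docstring above is cut off by the excerpt; the DMatrix-then-train workflow it begins to describe looks roughly like the following hedged sketch (column names and the objective are illustrative; `eta=0.1` and 100 rounds mirror the defaults in the signature above):

import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0], "x2": [0.5, 0.1, 0.9, 0.3], "y": [1.2, 2.1, 2.9, 4.2]})
features, target = ["x1", "x2"], "y"

# Build the DMatrix from the chosen features and labels, then train a booster.
dtrain = xgb.DMatrix(df[features].values, label=df[target].values)
booster = xgb.train({"eta": 0.1, "objective": "reg:squarederror"}, dtrain, num_boost_round=100)

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    return new_df.assign(prediction=booster.predict(xgb.DMatrix(new_df[features].values)))
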
Example #10
            columns_imputable,
            'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
            'statistics': imp.statistics_,
            'placeholder_imputer_fn': fill_fn,
            'placeholder_imputer_logs': fill_logs,
        }
    }

    return p, p(df), log


imputer.__doc__ += learner_return_docstring("SimpleImputer")


@curry
@log_learner_time(learner_name='placeholder_imputer')
def placeholder_imputer(df: pd.DataFrame,
                        columns_to_impute: List[str],
                        placeholder_value: Any = -999) -> LearnerReturnType:
    """
    Fills missing values with a fixed value.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with columns whose missing values will be filled.
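
The excerpt ends here, but the behavior the docstring describes is simple enough to sketch with plain pandas (the -999 default comes from the signature above; column names are illustrative, and the real learner also returns a prediction function and a log):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})
columns_to_impute = ["a", "b"]

# Same effect as the placeholder imputation described above: fill NaNs with a fixed value.
filled = df.assign(**{col: df[col].fillna(-999) for col in columns_to_impute})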