def placeholder_imputer(df: pd.DataFrame,
                        columns_to_impute: List[str],
                        placeholder_value: Any = -999) -> LearnerReturnType:
    """
    Fills missing values with a fixed value.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to fill missing values.
        It must contain all columns listed in `columns_to_impute`.

    columns_to_impute : list of str
        A list of names of the columns for filling missing values.

    placeholder_value : Any, (default=-999)
        The value used to fill in missing values.
    """

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_cols = new_data_set[columns_to_impute].fillna(placeholder_value).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("placeholder_imputer")

    log = {'placeholder_imputer': {
        'columns_to_impute': columns_to_impute,
        'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
        'placeholder_value': placeholder_value}}

    return p, p(df), log
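
# A minimal usage sketch (the toy DataFrame below is hypothetical, not part of
# the library). Like every learner in this module, `placeholder_imputer`
# returns a prediction function, the transformed training set and a log:
def _example_placeholder_imputer() -> None:
    df = pd.DataFrame({"age": [20.0, None, 40.0]})
    predict_fn, imputed_df, log = placeholder_imputer(df, columns_to_impute=["age"], placeholder_value=-999)
    assert imputed_df["age"].tolist() == [20.0, -999.0, 40.0]
    # The same imputation can now be applied to new data with `predict_fn`.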
def selector(df: pd.DataFrame,
             training_columns: List[str],
             predict_columns: List[str] = None) -> LearnerReturnType:
    """
    Filters a DataFrame by selecting only the desired columns.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `training_columns`.

    training_columns : list of str
        A list of column names that will remain in the dataframe during training time (fit).

    predict_columns : list of str
        A list of column names that will remain in the dataframe during prediction time (transform).
        If None, it defaults to `training_columns`.
    """
    if predict_columns is None:
        predict_columns = training_columns

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        return new_data_set[predict_columns]

    p.__doc__ = learner_pred_fn_docstring("selector")

    log = {'selector': {
        'training_columns': training_columns,
        'predict_columns': predict_columns,
        'transformed_column': list(set(training_columns).union(predict_columns))}}

    return p, df[training_columns], log
def prediction_ranger(df: pd.DataFrame,
                      prediction_min: float,
                      prediction_max: float,
                      prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Caps and floors the specified prediction column to a set range.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain a `prediction_column` column.

    prediction_min : float
        The floor for the prediction.

    prediction_max : float
        The cap for the prediction.

    prediction_column : str
        The name of the column in `df` to cap and floor.
    """

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{prediction_column: new_df[prediction_column].clip(lower=prediction_min, upper=prediction_max)}
        )

    p.__doc__ = learner_pred_fn_docstring("prediction_ranger")

    log = {'prediction_ranger': {
        'prediction_min': prediction_min,
        'prediction_max': prediction_max,
        'transformed_column': [prediction_column]}}

    return p, p(df), log
def truncate_categorical(df: pd.DataFrame,
                         columns_to_truncate: List[str],
                         percentile: float,
                         replacement: Union[str, float] = -9999,
                         replace_unseen: Union[str, float] = -9999,
                         store_mapping: bool = False) -> LearnerReturnType:
    """
    Truncates infrequent categories and replaces them all with a single one.
    You can think of it as an "others" category.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain the `columns_to_truncate` columns.

    columns_to_truncate : list of str
        The df columns names to perform the truncation.

    percentile : float
        Categories less frequent than the percentile will be replaced by the same one.

    replacement : int, str, float or nan
        The value to use when a category is less frequent than the percentile variable.

    replace_unseen : int, str, float, or nan
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log.
    """
    get_categs = lambda col: (df[col].value_counts() / len(df)).to_dict()
    update = lambda d: map(lambda kv: (kv[0], replacement) if kv[1] <= percentile else (kv[0], kv[0]), d.items())
    categs_to_dict = lambda categ_dict: dict(categ_dict)

    vec = {column: compose(categs_to_dict, update, get_categs)(column) for column in columns_to_truncate}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_truncate, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("truncate_categorical")

    log: LearnerLogType = {'truncate_categorical': {
        'transformed_column': columns_to_truncate,
        'replace_unseen': replace_unseen}}

    if store_mapping:
        log["truncate_categorical"]["mapping"] = vec

    return p, p(df), log
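
# A minimal usage sketch (the toy DataFrame is hypothetical). Categories whose
# training-set frequency is at or below `percentile` collapse into `replacement`:
def _example_truncate_categorical() -> None:
    df = pd.DataFrame({"city": ["sp"] * 8 + ["ny", "rj"]})
    predict_fn, truncated_df, log = truncate_categorical(df, columns_to_truncate=["city"], percentile=0.1)
    # "ny" and "rj" each appear in 10% of rows, so both become -9999; "sp" is kept.
    assert set(truncated_df["city"]) == {"sp", -9999}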
def ecdfer(df: pd.DataFrame,
           ascending: bool = True,
           prediction_column: str = "prediction",
           ecdf_column: str = "prediction_ecdf",
           max_range: int = 1000) -> LearnerReturnType:
    """
    Learns an Empirical Cumulative Distribution Function from the specified column
    in the input DataFrame. It is usually used in the prediction column to convert
    a predicted probability into a score from 0 to 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain a `prediction_column` column.

    ascending : bool
        Whether to compute an ascending ECDF or a descending one.

    prediction_column : str
        The name of the column in `df` to learn the ECDF from.

    ecdf_column : str
        The name of the new ECDF column added by this function.

    max_range : int
        The maximum value for the ECDF. It will go from 0 to max_range.
    """
    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]

    ecdf = ed.ECDF(values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{ecdf_column: (base + sign * max_range * ecdf(new_df[prediction_column]))})

    p.__doc__ = learner_pred_fn_docstring("ecdfer")

    log = {'ecdfer': {
        'nobs': len(values),
        'prediction_column': prediction_column,
        'ascending': ascending,
        'transformed_column': [ecdf_column]}}

    return p, p(df), log
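
# A minimal usage sketch (toy predictions are hypothetical). With the default
# ascending ECDF, a prediction at the p-th quantile of the training predictions
# maps to roughly p * max_range:
def _example_ecdfer() -> None:
    df = pd.DataFrame({"prediction": [0.1, 0.2, 0.5, 0.8, 0.9]})
    predict_fn, scored_df, log = ecdfer(df, max_range=1000)
    # ecdf(0.5) = 3/5 here, so the middle prediction scores 600.
    assert scored_df["prediction_ecdf"].iloc[2] == 600.0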
def onehot_categorizer(df: pd.DataFrame,
                       columns_to_categorize: List[str],
                       hardcode_nans: bool = False,
                       drop_first_column: bool = False,
                       store_mapping: bool = False) -> LearnerReturnType:
    """
    Onehot encoding on categorical columns.
    Encoded columns are removed and substituted by columns named
    `fklearn_feat__col==val`, where `col` is the name of the column
    and `val` is one of the values the feature can assume.

    Parameters
    ----------
    df : pd.DataFrame
        A Pandas' DataFrame that must contain `columns_to_categorize` columns.

    columns_to_categorize : list of str
        A list of categorical column names. Must be non-empty.

    hardcode_nans : bool
        Hardcodes an extra column with: 1 if nan or unseen else 0.

    drop_first_column : bool
        Drops the first column to create (k-1)-sized one-hot arrays for k features per
        categorical column. Can be used to avoid collinearity.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log.
    """
    categ_getter = lambda col: list(np.sort(df[col].dropna(axis=0, how='any').unique()))
    vec = {column: categ_getter(column) for column in sorted(columns_to_categorize)}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        # Skip the first learned category when drop_first_column is set,
        # yielding (k-1) dummy columns per categorical column.
        make_dummies = lambda col: dict(map(lambda categ: ("fklearn_feat__" + col + "==" + str(categ),
                                                           (new_df[col] == categ).astype(int)),
                                            vec[col][int(drop_first_column):]))

        oh_cols = dict(mapcat(lambda col: merge(make_dummies(col),
                                                {"fklearn_feat__" + col + "==" + "nan":
                                                 (~new_df[col].isin(vec[col])).astype(int)} if hardcode_nans
                                                else {}).items(),
                              columns_to_categorize))

        return new_df.assign(**oh_cols).drop(columns_to_categorize, axis=1)

    p.__doc__ = learner_pred_fn_docstring("onehot_categorizer")

    log = {'onehot_categorizer': {
        'transformed_column': columns_to_categorize,
        'hardcode_nans': hardcode_nans,
        'drop_first_column': drop_first_column}}

    if store_mapping:
        log['onehot_categorizer']['mapping'] = vec

    return p, p(df), log
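
# A minimal usage sketch (toy data is hypothetical). Each categorical value
# becomes an indicator column and the original column is dropped:
def _example_onehot_categorizer() -> None:
    df = pd.DataFrame({"color": ["blue", "red", "blue"]})
    predict_fn, encoded_df, log = onehot_categorizer(df, columns_to_categorize=["color"])
    assert "fklearn_feat__color==blue" in encoded_df.columns
    assert encoded_df["fklearn_feat__color==red"].tolist() == [0, 1, 0]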
def isotonic_calibration_learner(df: pd.DataFrame,
                                 target_column: str = "target",
                                 prediction_column: str = "prediction",
                                 output_column: str = "calibrated_prediction",
                                 y_min: float = 0.0,
                                 y_max: float = 1.0) -> LearnerReturnType:
    """
    Fits a single feature isotonic regression to the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    target_column : str
        The name of the column in `df` that should be used as target for the model.
        This column should be binary, since this is a classification model.

    prediction_column : str
        The name of the column with the uncalibrated predictions from the model.

    output_column : str
        The name of the column with the calibrated predictions from the model.

    y_min : float
        Lower bound of Isotonic Regression.

    y_max : float
        Upper bound of Isotonic Regression.
    """
    clf = IsotonicRegression(y_min=y_min, y_max=y_max, out_of_bounds='clip')
    clf.fit(df[prediction_column], df[target_column])

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{output_column: clf.predict(new_df[prediction_column])})

    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    log = {'isotonic_calibration_learner': {
        'output_column': output_column,
        'target_column': target_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)},
        'object': clf}

    return p, p(df), log
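
# A minimal usage sketch (toy scores and labels are hypothetical). The learner
# maps raw model scores to calibrated probabilities via isotonic regression:
def _example_isotonic_calibration_learner() -> None:
    df = pd.DataFrame({"prediction": [0.1, 0.4, 0.6, 0.9],
                       "target": [0, 0, 1, 1]})
    predict_fn, calibrated_df, log = isotonic_calibration_learner(df)
    # Calibrated outputs are non-decreasing in the raw score by construction.
    assert calibrated_df["calibrated_prediction"].is_monotonic_increasing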
def isolation_forest_learner(df: pd.DataFrame,
                             features: List[str],
                             params: Dict[str, Any] = None,
                             prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Fits an anomaly detection algorithm (Isolation Forest) to the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with the feature columns the anomaly detection
        model will be fitted on.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    params : dict
        The IsolationForest parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

    prediction_column : str
        The name of the column with the predictions from the model.
    """
    default_params = {"n_jobs": -1, "random_state": 1729}
    params = default_params if not params else merge(default_params, params)

    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {prediction_column: model.decision_function(new_df[features])}
        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {'isolation_forest_learner': {
        'features': features,
        'parameters': params,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)}}

    return p, p(df), log
def null_injector(df: pd.DataFrame,
                  proportion: float,
                  columns_to_inject: Optional[List[str]] = None,
                  groups: Optional[List[List[str]]] = None,
                  seed: int = 1) -> LearnerReturnType:
    """
    Injects nulls into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_inject` as columns.

    columns_to_inject : list of str
        A list of features to inject nulls. If groups is not None it will be ignored.

    proportion : float
        Proportion of nulls to inject in the columns.

    groups : list of list of str (default = None)
        A list of groups of features. If not None, features in the same group
        will be set to NaN together.

    seed : int
        Random seed for consistency.
    """
    if proportion < 0 or proportion > 1:
        raise ValueError('proportion must be between 0 and 1.')

    if not ((columns_to_inject is None) ^ (groups is None)):
        raise ValueError('Exactly one of columns_to_inject and groups must be provided (the other must be None).')

    n_rows = df.shape[0]

    groups = [[f] for f in columns_to_inject] if columns_to_inject is not None else groups

    null_cols = {}  # type: ignore
    for seed_i, group in enumerate(groups):  # type: ignore
        np.random.seed(seed + seed_i)
        replace_mask = np.random.binomial(1, 1 - proportion, n_rows).astype(bool)
        null_cols = merge(null_cols, {feature: df[feature].where(replace_mask) for feature in group})

    null_data = df.assign(**null_cols)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        return new_data_set

    p.__doc__ = learner_pred_fn_docstring("null_injector")

    log = {'null_injector': {
        "columns_to_inject": columns_to_inject,
        "proportion": proportion,
        "groups": groups}}

    return p, null_data, log
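
# A minimal usage sketch (toy data is hypothetical). Note that the injection
# only affects the returned training set; the prediction function is identity:
def _example_null_injector() -> None:
    df = pd.DataFrame({"x": np.arange(100.0)})
    predict_fn, noisy_df, log = null_injector(df, proportion=0.3, columns_to_inject=["x"])
    # Roughly 30% of the rows in `x` are now NaN (exact count varies with the seed).
    assert noisy_df["x"].isna().mean() > 0
    assert predict_fn(df).equals(df)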
def missing_warner(df: pd.DataFrame,
                   cols_list: List[str],
                   new_column_name: str = "has_unexpected_missing",
                   detailed_warning: bool = False,
                   detailed_column_name: Optional[str] = None) -> LearnerReturnType:
    """
    Creates a new column to flag rows that have missing values in columns
    which had no missing values in the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame.

    cols_list : list of str
        List of columns to consider when evaluating missingness.

    new_column_name : str
        Name of the column created to alert the existence of missing values.

    detailed_warning : bool
        If True, also adds a column listing, per row, which columns have
        unexpected missing values. Requires `detailed_column_name`.

    detailed_column_name : str, optional
        Name of the detailed column. Must be set if and only if
        `detailed_warning` is True.
    """
    if (detailed_warning is False and detailed_column_name is not None) or \
            (detailed_warning is True and detailed_column_name is None):
        raise ValueError('detailed_warning and detailed_column_name must be set together.')

    df_selected = df[cols_list]
    cols_without_missing = df_selected.loc[:, df_selected.isna().sum(axis=0) == 0].columns.tolist()

    def p(dataset: pd.DataFrame) -> pd.DataFrame:
        def detailed_assignment(df: pd.DataFrame, cols_to_check: List[str]) -> np.array:
            cols_with_missing = np.array([np.where(df[col].isna(), col, "") for col in cols_to_check]).T
            missing_by_row_list = np.array([list(filter(None, x)) for x in cols_with_missing]).reshape(-1, 1)
            if missing_by_row_list.size == 0:
                return np.empty((df.shape[0], 0)).tolist()
            else:
                return missing_by_row_list

        new_dataset = dataset.assign(**{new_column_name: lambda df: df[cols_without_missing].isna().sum(axis=1) > 0})
        if detailed_warning and detailed_column_name:
            missing_by_row_list = detailed_assignment(new_dataset, cols_without_missing)
            return new_dataset.assign(**{detailed_column_name: missing_by_row_list})
        else:
            return new_dataset

    p.__doc__ = learner_pred_fn_docstring("missing_warner")

    log = {"missing_warner": {
        "cols_list": cols_list,
        "cols_without_missing": cols_without_missing}}

    return p, df, log
def custom_transformer(df: pd.DataFrame,
                       columns_to_transform: List[str],
                       transformation_function: Callable[[pd.DataFrame], pd.DataFrame],
                       is_vectorized: bool = False) -> LearnerReturnType:
    """
    Applies a custom function to the desired columns.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_transform` columns.

    columns_to_transform : list of str
        A list of column names that will be transformed.

    transformation_function : function(pandas.DataFrame) -> pandas.DataFrame
        A function that receives a DataFrame as input, performs a transformation
        on its columns and returns another DataFrame.

    is_vectorized : bool
        If True, `transformation_function` is applied to each whole column at once;
        otherwise it is applied element-wise via swifter.
    """
    import swifter  # NOQA

    def p(df: pd.DataFrame) -> pd.DataFrame:
        if is_vectorized:
            return df.assign(**{col: transformation_function(df[col]) for col in columns_to_transform})

        return df.assign(**{col: df[col].swifter.apply(transformation_function) for col in columns_to_transform})

    p.__doc__ = learner_pred_fn_docstring("custom_transformer")

    log = {'custom_transformer': {
        'transformed_column': columns_to_transform,
        'transformation_function': transformation_function.__name__}}

    return p, p(df), log
def label_categorizer(df: pd.DataFrame,
                      columns_to_categorize: List[str],
                      replace_unseen: Union[str, float] = nan,
                      store_mapping: bool = False) -> LearnerReturnType:
    """
    Replaces categorical variables with a numeric identifier.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_categorize` columns.

    columns_to_categorize : list of str
        A list of categorical column names.

    replace_unseen : int, str, float, or nan
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log.
    """

    def categ_dict(series: pd.Series) -> Dict:
        categs = series.dropna().unique()
        return dict(map(reversed, enumerate(categs)))  # type: ignore

    vec = {column: categ_dict(df[column]) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("label_categorizer")

    log: LearnerLogType = {'label_categorizer': {
        'transformed_column': columns_to_categorize,
        'replace_unseen': replace_unseen}}

    if store_mapping:
        log['label_categorizer']['mapping'] = vec

    return p, p(df), log
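
# A minimal usage sketch (toy data is hypothetical). Each distinct value gets
# an integer id in order of first appearance in the training set:
def _example_label_categorizer() -> None:
    df = pd.DataFrame({"fruit": ["pear", "apple", "pear"]})
    predict_fn, labeled_df, log = label_categorizer(df, columns_to_categorize=["fruit"], store_mapping=True)
    assert log['label_categorizer']['mapping'] == {"fruit": {"pear": 0, "apple": 1}}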
def rank_categorical(df: pd.DataFrame,
                     columns_to_rank: List[str],
                     replace_unseen: Union[str, float] = nan,
                     store_mapping: bool = False) -> LearnerReturnType:
    """
    Rank categorical features by their frequency in the train set.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain the `columns_to_rank` columns.

    columns_to_rank : list of str
        The df columns names to perform the ranking.

    replace_unseen : int, str, float, or nan
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log.
    """
    col_categ_getter = lambda col: (df[col]
                                    .value_counts()
                                    .reset_index()
                                    .sort_values([col, "index"], ascending=[False, True])
                                    .set_index("index")[col]
                                    .rank(method="first", ascending=False).to_dict())

    vec = {column: col_categ_getter(column) for column in columns_to_rank}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_rank, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("rank_categorical")

    log: LearnerLogType = {'rank_categorical': {
        'transformed_column': columns_to_rank,
        'replace_unseen': replace_unseen}}

    if store_mapping:
        log['rank_categorical']['mapping'] = vec

    return p, p(df), log
def quantile_biner(df: pd.DataFrame,
                   columns_to_bin: List[str],
                   q: int = 4,
                   right: bool = False) -> LearnerReturnType:
    """
    Discretize continuous numerical columns into their quantiles.
    Uses pandas.qcut to find the bins and then numpy.digitize to fit
    the columns into bins.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_bin` columns.

    columns_to_bin : list of str
        A list of numerical column names.

    q : int
        Number of quantiles. 10 for deciles, 4 for quartiles, etc.
        Alternatively, an array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
        See https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html

    right : bool
        Indicates whether the intervals include the right or the left bin edge.
        Default behavior is (right==False), indicating that the interval does not
        include the right edge. The left bin end is open in this case, i.e.,
        bins[i-1] <= x < bins[i] is the default behavior for monotonically
        increasing bins.
        See https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.digitize.html
    """
    bin_getter = lambda col: pd.qcut(df[col], q, retbins=True)[1]
    bins = {column: bin_getter(column) for column in columns_to_bin}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        col_biner = lambda col: np.where(new_df[col].isnull(), nan,
                                         np.digitize(new_df[col], bins[col], right=right))
        bined_columns = {col: col_biner(col) for col in columns_to_bin}
        return new_df.assign(**bined_columns)

    p.__doc__ = learner_pred_fn_docstring("quantile_biner")

    log = {'quantile_biner': {
        'transformed_column': columns_to_bin,
        'q': q}}

    return p, p(df), log
def imputer(df: pd.DataFrame,
            columns_to_impute: List[str],
            impute_strategy: str = 'median') -> LearnerReturnType:
    """
    Fits a missing value imputer to the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to impute missing values.
        It must contain all columns listed in `columns_to_impute`.

    columns_to_impute : list of str
        A list of names of the columns for missing value imputation.

    impute_strategy : str, (default="median")
        The imputation strategy.

        - If "mean", then replace missing values using the mean along the axis.
        - If "median", then replace missing values using the median along the axis.
        - If "most_frequent", then replace missing values using the most frequent
          value along the axis.
    """
    imp = Imputer(strategy=impute_strategy)

    imp.fit(df[columns_to_impute].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = imp.transform(new_data_set[columns_to_impute])
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_impute).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("imputer")

    log = {'imputer': {
        'impute_strategy': impute_strategy,
        'columns_to_impute': columns_to_impute,
        'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(),
        'statistics': imp.statistics_}}

    return p, p(df), log
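
# A minimal usage sketch (toy data is hypothetical). With the default "median"
# strategy, the training-set median fills missing values in train and new data:
def _example_imputer() -> None:
    df = pd.DataFrame({"x": [1.0, 2.0, None, 10.0]})
    predict_fn, imputed_df, log = imputer(df, columns_to_impute=["x"])
    assert imputed_df["x"].tolist() == [1.0, 2.0, 2.0, 10.0]  # median of [1, 2, 10] is 2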
def floorer(df: pd.DataFrame,
            columns_to_floor: List[str],
            precomputed_floors: Dict[str, float] = None) -> LearnerReturnType:
    """
    Learns the minimum value for each of the `columns_to_floor`
    and uses that as the floor for those columns. If precomputed floors
    are passed, the function uses them as the floor values instead of
    computing the minimum.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_floor` columns.

    columns_to_floor : list of str
        A list of column names that should be floored.

    precomputed_floors : dict
        A dictionary in the format {"column_name": floor_value} that maps column
        names to pre-computed floor values.
    """
    if not precomputed_floors:
        precomputed_floors = {}

    floors = {col: precomputed_floors.get(col, df[col].min()) for col in columns_to_floor}

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        floored_cols = {col: new_data_set[col].clip(lower=floors[col]) for col in floors.keys()}
        return new_data_set.assign(**floored_cols)

    p.__doc__ = learner_pred_fn_docstring("floorer")

    log = {'floorer': {
        'floors': floors,
        'transformed_column': columns_to_floor,
        'precomputed_floors': precomputed_floors}}

    return p, p(df), log
def count_categorizer(df: pd.DataFrame,
                      columns_to_categorize: List[str],
                      replace_unseen: int = -1,
                      store_mapping: bool = False) -> LearnerReturnType:
    """
    Replaces each categorical value with its count in the training set.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_categorize` columns.

    columns_to_categorize : list of str
        A list of categorical column names.

    replace_unseen : int
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log.
    """
    categ_getter = lambda col: df[col].value_counts().to_dict()
    vec = {column: categ_getter(column) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("count_categorizer")

    log: LearnerLogType = {'count_categorizer': {
        'transformed_column': columns_to_categorize,
        'replace_unseen': replace_unseen}}

    if store_mapping:
        log['count_categorizer']['mapping'] = vec

    return p, p(df), log
def capper(df: pd.DataFrame,
           columns_to_cap: List[str],
           precomputed_caps: Dict[str, float] = None) -> LearnerReturnType:
    """
    Learns the maximum value for each of the `columns_to_cap`
    and uses that as the cap for those columns. If precomputed caps
    are passed, the function uses them as the cap values instead of
    computing the maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_cap` columns.

    columns_to_cap : list of str
        A list of column names that should be capped.

    precomputed_caps : dict
        A dictionary in the format {"column_name": cap_value} that maps column
        names to pre-computed cap values.
    """
    if not precomputed_caps:
        precomputed_caps = {}

    caps = {col: precomputed_caps.get(col, df[col].max()) for col in columns_to_cap}

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        capped_cols = {col: new_data_set[col].clip(upper=caps[col]) for col in caps.keys()}
        return new_data_set.assign(**capped_cols)

    p.__doc__ = learner_pred_fn_docstring("capper")

    log = {'capper': {
        'caps': caps,
        'transformed_column': columns_to_cap,
        'precomputed_caps': precomputed_caps}}

    return p, p(df), log
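
# A minimal usage sketch (toy data is hypothetical). The cap learned on the
# training set also applies to new data passed to the prediction function:
def _example_capper() -> None:
    train = pd.DataFrame({"x": [1, 5, 10]})
    predict_fn, capped_train, log = capper(train, columns_to_cap=["x"])
    new = pd.DataFrame({"x": [3, 50]})
    assert predict_fn(new)["x"].tolist() == [3, 10]  # 50 is capped at the training max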
def standard_scaler(df: pd.DataFrame,
                    columns_to_scale: List[str]) -> LearnerReturnType:
    """
    Fits a standard scaler to the dataset.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to scale.
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """
    scaler = StandardScaler()

    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {'standard_scaler': {
        'standard_scaler': scaler.get_params(),
        'transformed_column': columns_to_scale}}

    return p, p(df), log
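
# A minimal usage sketch (toy data is hypothetical). After scaling, each
# transformed column has zero mean and unit variance on the training set:
def _example_standard_scaler() -> None:
    df = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
    predict_fn, scaled_df, log = standard_scaler(df, columns_to_scale=["x"])
    assert abs(scaled_df["x"].mean()) < 1e-9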
def non_parametric_double_ml_learner(df: pd.DataFrame,
                                     feature_columns: List[str],
                                     treatment_column: str,
                                     outcome_column: str,
                                     debias_model: Union[RegressorMixin, None] = None,
                                     debias_feature_columns: List[str] = None,
                                     denoise_model: Union[RegressorMixin, None] = None,
                                     denoise_feature_columns: List[str] = None,
                                     final_model: Union[RegressorMixin, None] = None,
                                     final_model_feature_columns: List[str] = None,
                                     prediction_column: str = "prediction",
                                     cv_splits: int = 2,
                                     encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a Non-Parametric Double/ML Meta Learner for Conditional Average Treatment Effect Estimation.

    It implements the following steps:
    1) fits k instances of the debias model to predict the treatment from the features and get
       out-of-fold residuals t_res = t - t_hat;
    2) fits k instances of the denoise model to predict the outcome from the features and get
       out-of-fold residuals y_res = y - y_hat;
    3) fits a final ML model to predict y_res / t_res from the features using weighted regression
       with weights set to t_res^2. Trained like this, the final model will output treatment
       effect predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features, treatment and target columns.
        The model will be trained to predict the target column
        from the features.

    feature_columns : list of str
        A list of column names that are used as features for the denoise, debias and final models
        in double-ml. All these names should be in `df`.

    treatment_column : str
        The name of the column in `df` that should be used as treatment for the double-ml model.
        It will learn the impact of this column with respect to the outcome column.

    outcome_column : str
        The name of the column in `df` that should be used as outcome for the double-ml model.
        It will learn the impact of the treatment column on this outcome column.

    debias_model : RegressorMixin (default None)
        The estimator for fitting the treatment from the features. Must implement fit and predict
        methods. It can be a scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    debias_feature_columns : list of str (default None)
        A list of column names to be used only for the debias model. If not None, it will replace
        feature_columns when fitting the debias model.

    denoise_model : RegressorMixin (default None)
        The estimator for fitting the outcome from the features. Must implement fit and predict
        methods. It can be a scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    denoise_feature_columns : list of str (default None)
        A list of column names to be used only for the denoise model. If not None, it will replace
        feature_columns when fitting the denoise model.

    final_model : RegressorMixin (default None)
        The estimator for fitting the outcome residuals from the treatment residuals. Must implement
        fit and predict methods. It can be an arbitrary scikit-learn regressor. The fit method must
        accept sample_weight as a keyword argument. When None, defaults to GradientBoostingRegressor.

    final_model_feature_columns : list of str (default None)
        A list of column names to be used only for the final model. If not None, it will replace
        feature_columns when fitting the final model.

    prediction_column : str (default "prediction")
        The name of the column with the treatment effect predictions from the final model.
    cv_splits : int (default 2)
        Number of folds to split the training data when fitting the debias and denoise models.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """
    features = feature_columns if not encode_extra_cols else expand_features_encoded(df, feature_columns)

    debias_model = GradientBoostingRegressor() if not debias_model else debias_model
    denoise_model = GradientBoostingRegressor() if not denoise_model else denoise_model
    final_model = GradientBoostingRegressor() if not final_model else final_model

    t_hat, mts = _cv_estimate(debias_model, df,
                              features if debias_feature_columns is None else debias_feature_columns,
                              treatment_column, cv_splits)
    y_hat, mys = _cv_estimate(denoise_model, df,
                              features if denoise_feature_columns is None else denoise_feature_columns,
                              outcome_column, cv_splits)

    y_res = df[outcome_column] - y_hat
    t_res = df[treatment_column] - t_hat

    final_target = y_res / t_res
    weights = t_res ** 2

    final_model_x = features if final_model_feature_columns is None else final_model_feature_columns
    model_final_fitted = final_model.fit(X=df[final_model_x], y=final_target, sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{prediction_column: model_final_fitted.predict(new_df[final_model_x].values)})

    p.__doc__ = learner_pred_fn_docstring("non_parametric_double_ml_learner")

    log = {'non_parametric_double_ml_learner': {
        'features': feature_columns,
        'debias_feature_columns': debias_feature_columns,
        'denoise_feature_columns': denoise_feature_columns,
        'final_model_feature_columns': final_model_feature_columns,
        'outcome_column': outcome_column,
        'treatment_column': treatment_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        'feature_importance': dict(zip(features, model_final_fitted.feature_importances_)),
        'training_samples': len(df)},
        'debias_models': mts,
        'denoise_models': mys,
        'cv_splits': cv_splits,
        'object': model_final_fitted}

    return p, p(df), log
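
# A minimal usage sketch (the synthetic data below is hypothetical). Here the
# true treatment effect grows with the feature x, and the learner's prediction
# column estimates that conditional effect:
def _example_non_parametric_double_ml_learner() -> None:
    rng = np.random.RandomState(42)
    n = 500
    x = rng.uniform(size=n)
    t = rng.normal(size=n)
    y = x * t + rng.normal(scale=0.1, size=n)  # true CATE(x) = x
    df = pd.DataFrame({"x": x, "t": t, "y": y})
    predict_fn, pred_df, log = non_parametric_double_ml_learner(
        df, feature_columns=["x"], treatment_column="t", outcome_column="y")
    # pred_df["prediction"] should roughly track the true effect x.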
def lgbm_classification_learner(df: pd.DataFrame,
                                features: List[str],
                                target: str,
                                learning_rate: float = 0.1,
                                num_estimators: int = 100,
                                extra_params: LogType = None,
                                prediction_column: str = "prediction",
                                weight_column: str = None,
                                encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an LGBM classifier to the dataset.

    It first generates a Dataset with the specified features and labels from `df`.
    Then, it fits a LGBM model to this Dataset.
    Return the predict function for the model and the predictions for the input dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A pandas DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in update to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta shrinks the feature weights
        to make the boosting process more conservative.
        See the learning_rate hyper-parameter in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst

    num_estimators : int
        Int in the range (0, inf).
        Number of boosted trees to fit.
        See the num_iterations hyper-parameter in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the LGBM model. See the list in:
        https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
""" import lightgbm as lgbm params = extra_params if extra_params else {} params = assoc(params, "eta", learning_rate) params = params if "objective" in params else assoc( params, "objective", 'binary') weights = df[weight_column].values if weight_column else None features = features if not encode_extra_cols else expand_features_encoded( df, features) dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights, silent=True) bst = lgbm.train(params, dtrain, num_estimators) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: if params["objective"] == "multiclass": col_dict = { prediction_column + "_" + str(key): value for (key, value ) in enumerate(bst.predict(new_df[features].values).T) } else: col_dict = { prediction_column: bst.predict(new_df[features].values) } if apply_shap: import shap explainer = shap.TreeExplainer(bst) shap_values = explainer.shap_values(new_df[features]) shap_expected_value = explainer.expected_value if params["objective"] == "multiclass": shap_values_multiclass = { f"shap_values_{class_index}": list(value) for (class_index, value) in enumerate(shap_values) } shap_expected_value_multiclass = { f"shap_expected_value_{class_index}": np.repeat(expected_value, len(class_shap_values)) for (class_index, (expected_value, class_shap_values) ) in enumerate(zip(shap_expected_value, shap_values)) } shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass) else: shap_values = list(shap_values[1]) shap_output = { "shap_values": shap_values, "shap_expected_value": np.repeat(shap_expected_value[1], len(shap_values)) } col_dict = merge(col_dict, shap_output) return new_df.assign(**col_dict) p.__doc__ = learner_pred_fn_docstring("lgbm_classification_learner", shap=True) log = { 'lgbm_classification_learner': { 'features': features, 'target': target, 'prediction_column': prediction_column, 'package': "lightgbm", 'package_version': lgbm.__version__, 'parameters': assoc(params, "num_estimators", num_estimators), 'feature_importance': dict(zip(features, bst.feature_importance().tolist())), 'training_samples': len(df) }, 'object': bst } return p, p(df), log
def nlp_logistic_classification_learner(df: pd.DataFrame,
                                        text_feature_cols: List[str],
                                        target: str,
                                        vectorizer_params: LogType = None,
                                        logistic_params: LogType = None,
                                        prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Fits a text vectorizer (TfidfVectorizer) followed by a logistic regression (LogisticRegression).

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    text_feature_cols : list of str
        A list of column names of the text features used for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    vectorizer_params : dict
        The TfidfVectorizer parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    logistic_params : dict
        The LogisticRegression parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    prediction_column : str
        The name of the column with the predictions from the model.
    """
    # set default params
    default_vect_params = {"strip_accents": "unicode", "min_df": 20}
    merged_vect_params = default_vect_params if not vectorizer_params else merge(default_vect_params,
                                                                                 vectorizer_params)

    default_clf_params = {"C": 0.1, "multi_class": "ovr", "solver": "liblinear"}
    merged_logistic_params = default_clf_params if not logistic_params else merge(default_clf_params,
                                                                                  logistic_params)

    vect = TfidfVectorizer(**merged_vect_params)
    clf = LogisticRegression(**merged_logistic_params)

    # Concatenate the text columns into a single text field before vectorizing.
    text_df = df[text_feature_cols].apply(lambda x: x + " ", axis=1).sum(axis=1)

    vect.fit(text_df.values)
    sparse_vect = vect.transform(text_df.values)
    clf.fit(sparse_vect, df[target].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        predict_text_df = new_df[text_feature_cols].apply(lambda x: x + " ", axis=1).sum(axis=1)
        predict_sparse_vect = vect.transform(predict_text_df)

        if merged_logistic_params["multi_class"] == "multinomial":
            col_dict = {prediction_column + "_" + str(key): value
                        for (key, value) in enumerate(clf.predict_proba(predict_sparse_vect).T)}
        else:
            col_dict = {prediction_column: clf.predict_proba(predict_sparse_vect)[:, 1]}

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("nlp_logistic_classification_learner")

    params = {"vectorizer_params": merged_vect_params,
              "logistic_params": merged_logistic_params}

    log = {'nlp_logistic_classification_learner': {
        'features': text_feature_cols,
        'target': target,
        'prediction_column': prediction_column,
        'parameters': assoc(params, "vocab_size", sparse_vect.shape[1]),
        'package': "sklearn",
        'package_version': sk_version,
        'training_samples': len(df)},
        'object': clf}

    return p, p(df), log
def catboost_classification_learner(df: pd.DataFrame,
                                    features: List[str],
                                    target: str,
                                    learning_rate: float = 0.1,
                                    num_estimators: int = 100,
                                    extra_params: LogType = None,
                                    prediction_column: str = "prediction",
                                    weight_column: str = None,
                                    encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a CatBoost classifier to the dataset.

    It first generates a Pool with the specified features and labels from `df`.
    Then, it fits a CatBoost model to this Pool.
    Return the predict function for the model and the predictions for the input dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in update to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta shrinks the feature weights
        to make the boosting process more conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in the range (0, inf).
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for
        i in range(0, n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
""" from catboost import Pool, CatBoostClassifier import catboost weights = df[weight_column].values if weight_column else None params = extra_params if extra_params else {} params = assoc(params, "eta", learning_rate) params = params if "objective" in params else assoc( params, "objective", 'Logloss') features = features if not encode_extra_cols else expand_features_encoded( df, features) cat_features = params["cat_features"] if "cat_features" in params else None dtrain = Pool(df[features].values, df[target].values, weight=weights, feature_names=list(map(str, features)), cat_features=cat_features) cat_boost_classifier = CatBoostClassifier(iterations=num_estimators, **params) cbr = cat_boost_classifier.fit(dtrain, verbose=0) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: pred = cbr.predict_proba(new_df[features]) if params["objective"] == "MultiClass": col_dict = { prediction_column + "_" + str(key): value for (key, value) in enumerate(pred.T) } col_dict.update({prediction_column: pred.argmax(axis=1)}) else: col_dict = {prediction_column: pred[:, 1]} if apply_shap: import shap if params["objective"] == "MultiClass": shap_values = _get_catboost_shap_values( df, cbr, features, target, weights, cat_features) # catboost shap returns a list for each row, we reformat it to return # a list for each class shap_values = shap_values.transpose(1, 0, 2) shap_values_multiclass = { f"shap_values_{class_index}": list(value[:, :-1]) for (class_index, value) in enumerate(shap_values) } shap_expected_value_multiclass = { f"shap_expected_value_{class_index}": value[:, -1] for (class_index, value) in enumerate(shap_values) } shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass) else: explainer = shap.TreeExplainer(cbr) shap_values = explainer.shap_values(new_df[features]) shap_expected_value = explainer.expected_value shap_values = list(shap_values) shap_output = { "shap_values": shap_values, "shap_expected_value": np.repeat(shap_expected_value, len(shap_values)) } col_dict = merge(col_dict, shap_output) return new_df.assign(**col_dict) p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner", shap=True) log = { 'catboost_classification_learner': { 'features': features, 'target': target, 'prediction_column': prediction_column, 'package': "catboost", 'package_version': catboost.__version__, 'parameters': assoc(params, "num_estimators", num_estimators), 'feature_importance': cbr.feature_importances_, 'training_samples': len(df) }, 'object': cbr } return p, p(df), log
def logistic_classification_learner(df: pd.DataFrame,
                                    features: List[str],
                                    target: str,
                                    params: LogType = None,
                                    prediction_column: str = "prediction",
                                    weight_column: str = None,
                                    encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a logistic regression classifier to the dataset.
    Return the predict function for the model and the predictions for the input dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    params : dict
        The LogisticRegression parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for
        i in range(0, n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """
    def_params = {"C": 0.1, "multi_class": "ovr", "solver": "liblinear"}
    merged_params = def_params if not params else merge(def_params, params)

    weights = df[weight_column].values if weight_column else None

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    clf = LogisticRegression(**merged_params)
    clf.fit(df[features].values, df[target].values, sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        pred = clf.predict_proba(new_df[features].values)
        if merged_params["multi_class"] == "multinomial":
            col_dict = {prediction_column + "_" + str(key): value
                        for (key, value) in enumerate(pred.T)}
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            col_dict = {prediction_column: pred[:, 1]}

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("logistic_classification_learner")

    log = {'logistic_classification_learner': {
        'features': features,
        'target': target,
        'parameters': merged_params,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        'feature_importance': dict(zip(features, clf.coef_.flatten())),
        'training_samples': len(df)},
        'object': clf}

    return p, p(df), log
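
# A minimal usage sketch (toy data is hypothetical). For a binary target the
# prediction column holds P(target == 1):
def _example_logistic_classification_learner() -> None:
    df = pd.DataFrame({"x": [0.0, 0.1, 0.9, 1.0] * 10,
                       "y": [0, 0, 1, 1] * 10})
    predict_fn, scored_df, log = logistic_classification_learner(df, features=["x"], target="y")
    assert scored_df["prediction"].between(0, 1).all()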
def xgb_classification_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: LogType = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None,
                               encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an XGBoost classifier to the dataset.

    It first generates a DMatrix with the specified features and labels from `df`.
    Then, it fits a XGBoost model to this DMatrix.
    Return the predict function for the model and the predictions for the input dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1].
        Step size shrinkage used in update to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta shrinks the feature weights
        to make the boosting process more conservative.
        See the eta hyper-parameter in:
        http://xgboost.readthedocs.io/en/latest/parameter.html

    num_estimators : int
        Int in the range (0, inf).
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        http://xgboost.readthedocs.io/en/latest/python/python_api.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the XGBoost model. See the list in:
        http://xgboost.readthedocs.io/en/latest/parameter.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for
        i in range(0, n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
""" import xgboost as xgb params = extra_params if extra_params else {} params = assoc(params, "eta", learning_rate) params = params if "objective" in params else assoc( params, "objective", 'binary:logistic') weights = df[weight_column].values if weight_column else None features = features if not encode_extra_cols else expand_features_encoded( df, features) dtrain = xgb.DMatrix(df[features].values, label=df[target].values, feature_names=map(str, features), weight=weights) bst = xgb.train(params, dtrain, num_estimators) def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame: dtest = xgb.DMatrix(new_df[features].values, feature_names=map(str, features)) pred = bst.predict(dtest) if params["objective"] == "multi:softprob": col_dict = { prediction_column + "_" + str(key): value for (key, value) in enumerate(pred.T) } col_dict.update({prediction_column: pred.argmax(axis=1)}) else: col_dict = {prediction_column: pred} if apply_shap: import shap explainer = shap.TreeExplainer(bst) shap_values = explainer.shap_values(new_df[features]) shap_expected_value = explainer.expected_value if params["objective"] == "multi:softprob": shap_values_multiclass = { f"shap_values_{class_index}": list(value) for (class_index, value) in enumerate(shap_values) } shap_expected_value_multiclass = { f"shap_expected_value_{class_index}": np.repeat(expected_value, len(class_shap_values)) for (class_index, (expected_value, class_shap_values) ) in enumerate(zip(shap_expected_value, shap_values)) } shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass) else: shap_values = list(shap_values) shap_output = { "shap_values": shap_values, "shap_expected_value": np.repeat(shap_expected_value, len(shap_values)) } col_dict = merge(col_dict, shap_output) return new_df.assign(**col_dict) p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True) log = { 'xgb_classification_learner': { 'features': features, 'target': target, 'prediction_column': prediction_column, 'package': "xgboost", 'package_version': xgb.__version__, 'parameters': assoc(params, "num_estimators", num_estimators), 'feature_importance': bst.get_score(), 'training_samples': len(df) }, 'object': bst } return p, p(df), log
def target_categorizer(df: pd.DataFrame,
                       columns_to_categorize: List[str],
                       target_column: str,
                       smoothing: float = 1.0,
                       ignore_unseen: bool = True,
                       store_mapping: bool = False) -> LearnerReturnType:
    """
    Replaces categorical variables with the smoothed mean of the target variable by category.
    Uses a weighted average with the overall mean of the target variable for smoothing.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame that must contain `columns_to_categorize` and `target_column` columns.

    columns_to_categorize : list of str
        A list of categorical column names.

    target_column : str
        Target column name. Target can be binary or continuous.

    smoothing : float
        Weight given to the overall target mean against the target mean by category.
        The value must be greater than or equal to 0.

    ignore_unseen : bool
        If True, unseen values will be encoded as nan.
        If False, they will be replaced by the target mean.

    store_mapping : bool (default: False)
        Whether to store the feature value -> float dictionary in the log.
    """
    target_mean = df[target_column].mean()
    replace_unseen = nan if ignore_unseen else target_mean

    def categ_target_dict(column: str) -> Dict:
        column_agg = df.groupby(column)[target_column].agg(['count', 'mean'])
        column_target_mean = column_agg['mean']
        column_target_count = column_agg['count']

        smoothed_target_mean = (column_target_count * column_target_mean + smoothing * target_mean) / \
                               (column_target_count + smoothing)

        return smoothed_target_mean.to_dict()

    vec = {column: categ_target_dict(column) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("target_categorizer")

    log = {'target_categorizer': {
        'transformed_columns': columns_to_categorize,
        'target_column': target_column,
        'smoothing': smoothing,
        'ignore_unseen': ignore_unseen}}

    if store_mapping:
        log['target_categorizer']['mapping'] = vec

    return p, p(df), log
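
# A minimal usage sketch (toy data is hypothetical), with the smoothing
# arithmetic worked out. For a category with count n and mean m, the encoding
# is (n * m + smoothing * global_mean) / (n + smoothing):
def _example_target_categorizer() -> None:
    df = pd.DataFrame({"cat": ["a", "a", "a", "b"],
                       "target": [1.0, 1.0, 1.0, 0.0]})
    predict_fn, encoded_df, log = target_categorizer(df, columns_to_categorize=["cat"],
                                                     target_column="target", smoothing=1.0)
    # global mean = 0.75; "a": (3 * 1.0 + 1 * 0.75) / 4 = 0.9375
    assert encoded_df["cat"].iloc[0] == 0.9375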
def gp_regression_learner(df: pd.DataFrame,
                          features: List[str],
                          target: str,
                          kernel: kernels.Kernel = None,
                          alpha: float = 0.1,
                          extra_variance: Union[str, float] = "fit",
                          return_std: bool = False,
                          extra_params: Dict[str, Any] = None,
                          prediction_column: str = "prediction",
                          encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a Gaussian process regressor to the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be numerical and continuous, since this is a regression model.

    kernel : sklearn.gaussian_process.kernels
        The kernel specifying the covariance function of the GP. If None is passed,
        the kernel "1.0 * RBF(1.0)" is used as default. Note that the kernel's
        hyperparameters are optimized during fitting.

    alpha : float
        Value added to the diagonal of the kernel matrix during fitting. Larger values
        correspond to increased noise level in the observations. This can also prevent
        a potential numerical issue during fitting, by ensuring that the calculated
        values form a positive definite matrix.

    extra_variance : float
        The amount of extra variance to scale to the predictions in standard deviations.
        If left as the default "fit", uses the standard deviation of the target.

    return_std : bool
        If True, the standard deviation of the predictive distribution at the query
        points is returned along with the mean.

    extra_params : dict {"hyperparameter_name" : hyperparameter_value}, optional
        Other parameters for the GaussianProcessRegressor model. See the list in:
        http://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """
    params = extra_params if extra_params else {}
    params['alpha'] = alpha
    params['kernel'] = kernel

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    gp = GaussianProcessRegressor(**params)
    gp.fit(df[features], df[target])

    extra_variance = df[target].std() if extra_variance == "fit" else extra_variance if extra_variance else 1

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        if return_std:
            pred_mean, pred_std = gp.predict(new_df[features], return_std=True)
            pred_std *= extra_variance
            return new_df.assign(**{prediction_column: pred_mean,
                                    prediction_column + "_std": pred_std})
        else:
            return new_df.assign(**{prediction_column: gp.predict(new_df[features])})

    p.__doc__ = learner_pred_fn_docstring("gp_regression_learner")

    log = {'gp_regression_learner': {
        'features': features,
        'target': target,
        'parameters': merge(params, {'extra_variance': extra_variance,
                                     'return_std': return_std}),
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        'training_samples': len(df)},
        'object': gp}

    return p, p(df), log
def custom_supervised_model_learner(df: pd.DataFrame,
                                    features: List[str],
                                    target: str,
                                    model: Any,
                                    supervised_type: str,
                                    log: Dict[str, Dict],
                                    prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Fits a custom model to the dataset.
    Return the predict function, the predictions for the input dataset and a log
    describing the model.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.

    model : Object
        Machine learning model to be used for regression or classification.
        The model object must have a ".fit" attribute to train the data.
        For classification problems, it also needs a ".predict_proba" attribute.
        For regression problems, it needs a ".predict" attribute.

    supervised_type : str
        Type of supervised learning to be used.
        The options are: 'classification' or 'regression'.

    log : Dict[str, Dict]
        Log with additional information of the custom model used.
        It must start with just one element with the model name.

    prediction_column : str
        The name of the column with the predictions from the model.
        For classification problems, all probabilities will be added: for i in
        range(0, n_classes). For regression, just prediction_column will be added.
    """
    if len(log) != 1:
        raise ValueError("'log' dictionary must start with model name")
    if supervised_type not in ('classification', 'regression'):
        raise TypeError("supervised_type options are: 'classification' or 'regression'")
    if not hasattr(model, 'fit'):
        raise AttributeError("'model' object must have 'fit' attribute")
    if supervised_type == 'classification' and not hasattr(model, 'predict_proba'):
        raise AttributeError("'model' object for classification must have 'predict_proba' attribute")
    if supervised_type == 'regression' and not hasattr(model, 'predict'):
        raise AttributeError("'model' object for regression must have 'predict' attribute")

    model.fit(df[features].values, df[target].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        if supervised_type == 'classification':
            pred = model.predict_proba(new_df[features].values)
            col_dict = {}
            for (key, value) in enumerate(pred.T):
                col_dict.update({prediction_column + "_" + str(key): value})
        elif supervised_type == 'regression':
            col_dict = {prediction_column: model.predict(new_df[features].values)}

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("custom_supervised_model_learner")

    log["object"] = model

    return p, p(df), log
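
# A minimal usage sketch (toy data and model choice are hypothetical). Any
# object with the required fit/predict interface works, e.g. a scikit-learn
# regressor:
def _example_custom_supervised_model_learner() -> None:
    from sklearn.linear_model import LinearRegression
    df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [2.0, 4.0, 6.0]})
    predict_fn, pred_df, log = custom_supervised_model_learner(
        df, features=["x"], target="y", model=LinearRegression(),
        supervised_type='regression', log={"linear_regression": {}})
    assert abs(pred_df["prediction"].iloc[0] - 2.0) < 1e-6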