def _split_groups_and_values(
    X, groups, name="", min_value_cols=1, check_X=True, **kwargs
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Separate the grouping columns of ``X`` from the remaining value columns.

    :param X: ``pd.DataFrame`` or 2D array holding both the grouping and the value columns.
    :param groups: column name(s) for a DataFrame, positional index/indices otherwise.
    :param name: passed on to ``_data_format_checks``.
    :param min_value_cols: passed on to ``_shape_check``.
    :param check_X: when truthy, the value part is run through ``check_array``.
    :param kwargs: forwarded to ``_check_grouping_columns`` and, when ``check_X``
        is truthy, to ``check_array`` as well.
    :returns: tuple ``(X_group, X_value)``; ``X_group`` is whatever
        ``_check_grouping_columns`` returns, ``X_value`` holds the non-group columns.
    :raises ValueError: when selecting or removing the group columns fails with a
        ``KeyError`` or ``IndexError``.
    """
    _data_format_checks(X, name=name)
    _shape_check(X, min_value_cols)

    try:
        if isinstance(X, pd.DataFrame):
            grouping = X.loc[:, as_list(groups)]
            values = X.drop(columns=groups).values
        else:
            grouping = pd.DataFrame(X[:, as_list(groups)])
            # Indexing a range translates negative positions into positive ones
            # and raises IndexError for positions that do not exist
            column_positions = range(X.shape[1])
            drop_at = [column_positions[g] for g in as_list(groups)]
            values = np.delete(X, drop_at, axis=1)
    except (KeyError, IndexError):
        raise ValueError(f"Could not drop groups {groups} from columns of X")

    grouping = _check_grouping_columns(grouping, **kwargs)

    if not check_X:
        return grouping, values
    return grouping, check_array(values, **kwargs)
def fit(self, X, y=None):
    """
    Fit one estimator per group (or per hierarchical group level when shrinkage is used).

    The groups present in the training data are stored on the instance as well.

    :param X: training data containing both the grouping and the value columns.
    :param y: training targets.
    :return: Returns an instance of self.
    """
    X, y = self.__prepare_input_data(X, y)

    use_shrinkage = self.shrinkage is not None
    if use_shrinkage:
        self.__set_shrinkage_function()

    self.group_colnames_ = list(map(str, as_list(self.groups)))

    if self.value_columns is None:
        # Every column that is not a grouping column is treated as a value column
        self.value_colnames_ = [
            column for column in X.columns if column not in self.group_colnames_
        ]
    else:
        self.value_colnames_ = list(map(str, as_list(self.value_columns)))

    self.__validate(X, y)

    # All hierarchical subsets of the grouping columns
    self.group_colnames_hierarchical_ = expanding_list(self.group_colnames_, list)

    # A global fallback model is only fitted when no shrinkage is applied
    self.fallback_ = None
    if not use_shrinkage and self.use_global_model:
        global_features = X[self.value_colnames_]
        self.fallback_ = clone(self.estimator).fit(global_features, y)

    if use_shrinkage:
        # One set of estimators per hierarchy level, merged into a single dict
        self.estimators_ = {}
        for level_colnames in self.group_colnames_hierarchical_:
            level_estimators = self.__fit_grouped_estimator(
                X, y, self.value_colnames_, level_colnames
            )
            self.estimators_.update(level_estimators)
    else:
        self.estimators_ = self.__fit_grouped_estimator(
            X, y, self.value_colnames_, self.group_colnames_
        )

    self.groups_ = as_list(self.estimators_.keys())

    if use_shrinkage:
        self.shrinkage_factors_ = self.__get_shrinkage_factor(X)

    return self
def __get_shrinkage_factor(self, X_group):
    """
    Compute normalised shrinkage factors for every most granular group.

    :param X_group: ``pd.DataFrame`` whose columns are the grouping columns.
    :returns: dict mapping each group in ``self.groups_`` that is split on all
        grouping columns to the output of ``self.shrinkage_function_`` divided
        by its own sum.
    """
    group_columns = X_group.columns.to_list()
    n_levels = len(group_columns)
    group_sizes = X_group.groupby(group_columns).size()

    # Number of observations on each hierarchy level, per fully specified group
    hierarchical_counts = {}
    for grp in self.groups_:
        if len(as_list(grp)) != n_levels:
            # Not split on all grouping columns, so not a most granular group
            continue
        hierarchical_counts[grp] = [
            group_sizes[tuple(subgroup)].sum()
            for subgroup in expanding_list(grp, tuple)
        ]

    # Raw shrinkage factor of each hierarchy level, per fully specified group
    raw_factors = {}
    for grp, level_counts in hierarchical_counts.items():
        raw_factors[grp] = self.shrinkage_function_(level_counts, **self.shrinkage_kwargs)

    # Rescale so that the factors of a group sum to one
    normalised_factors = {}
    for grp, factors in raw_factors.items():
        normalised_factors[grp] = factors / factors.sum()

    return normalised_factors
def __set_shrinkage_function(self):
    """
    Resolve ``self.shrinkage`` into the callable stored as ``self.shrinkage_function_``.

    ``self.shrinkage`` may be the name of a predefined shrinkage function or a callable.

    :raises ValueError: if shrinkage is requested with a single group while
        ``use_global_model`` is False, if the name is not a predefined option,
        or if ``self.shrinkage`` is neither a string nor a callable.
    """
    single_group_without_global = (
        self.shrinkage
        and len(as_list(self.groups)) == 1
        and not self.use_global_model
    )
    if single_group_without_global:
        raise ValueError(
            "Cannot do shrinkage with a single group if use_global_model is False"
        )

    if isinstance(self.shrinkage, str):
        # Predefined shrinkage functions, looked up by name
        shrink_options = {
            "constant": constant_shrinkage,
            "relative": relative_shrinkage,
            "min_n_obs": min_n_obs_shrinkage,
        }
        try:
            self.shrinkage_function_ = shrink_options[self.shrinkage]
        except KeyError:
            raise ValueError(
                f"The specified shrinkage function {self.shrinkage} is not valid, "
                f"choose from {list(shrink_options.keys())} or supply a callable."
            )
        return

    if callable(self.shrinkage):
        # A user supplied function is checked before it is accepted
        self.__check_shrinkage_func()
        self.shrinkage_function_ = self.shrinkage
        return

    raise ValueError(
        "Invalid shrinkage specified. Should be either None (no shrinkage), str or callable."
    )
def fit(self, X, y):
    """
    Fit the model using X, y as training data. Will also learn the groups
    that exist within the dataset.

    :param X: training data; a ``np.ndarray`` is converted to a ``pd.DataFrame``
        whose column names are the stringified column positions.
    :param y: training targets, one per row of ``X``.
    :return: Returns an instance of self.
    :raises ValueError: if any of the grouping columns is not a column of ``X``.
    """
    check_X_y(X, y)
    # Temporary column that carries the target through the groupby below
    pred_col = 'the-column-that-i-want-to-predict-but-dont-have-the-name-for'
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[str(_) for _ in range(X.shape[1])])
    X = X.assign(**{pred_col: y})

    self.group_colnames_ = [str(_) for _ in as_list(self.groups)]
    if any(c not in X.columns for c in self.group_colnames_):
        raise ValueError(f"{self.group_colnames_} not in {X.columns}")

    # Compare the label by value: an identity check (``is not``) only works while
    # the column label happens to be the very same string object, and would let
    # the target column leak into the features otherwise
    self.X_colnames_ = [
        _ for _ in X.columns
        if _ not in self.group_colnames_ and _ != pred_col
    ]

    self.fallback_ = None
    if self.use_fallback:
        subset_x = X[self.X_colnames_]
        self.fallback_ = clone(self.estimator).fit(subset_x, y)

    self.groups_ = X[self.group_colnames_].drop_duplicates()

    # One freshly cloned estimator per group, keyed by the group value(s)
    self.estimators_ = (
        X
        .groupby(self.group_colnames_)
        .apply(lambda d: clone(self.estimator).fit(d[self.X_colnames_], d[pred_col]))
        .to_dict()
    )
    return self
def _add_lagged_numpy_columns(X, cols, lags, drop_na):
    """
    Append lagged copies of the selected columns to a matrix.

    :param X: the input ``np.ndarray``.
    :param cols: column index / indices.
    :param lags: iterable of lags; every selected column is shifted by ``-lag``
        positions for every lag, positions without a value are filled with NaN.
    :param drop_na: remove rows that contain NA values.
    :returns: ``np.ndarray`` with the concatenated lagged cols, cast back to the
        dtype of the input.
    :raises ValueError: if a column index is not an ``int``.
    :raises KeyError: if a column index is not smaller than the number of columns.
    """
    cols = as_list(cols)

    if not all(isinstance(col, int) for col in cols):
        raise ValueError("Matrix columns are indexed by integers")

    if not all(col < X.shape[1] for col in cols):
        raise KeyError("The column does not exist")

    # In integer-based ndarrays, NaN values are represented as
    # -9223372036854775808, so we convert back and forth from
    # original to float and back to original dtype.
    # The conversion must happen before shifting so that the NaN
    # fill value can be represented in the shifted columns.
    original_type = X.dtype
    X = np.asarray(X, dtype=float)

    # ``np.nan`` instead of the ``np.NaN`` alias, which NumPy 2.0 removed
    combos = (
        shift(X[:, col], -lag, cval=np.nan)
        for col in cols
        for lag in lags
    )
    answer = np.column_stack((X, *combos))

    # Remove rows that contain NA values when drop_na is truthy
    if drop_na:
        answer = answer[~np.isnan(answer).any(axis=1)]

    # Change dtype back to its original
    answer = np.asarray(answer, dtype=original_type)

    return answer
def _add_lagged_pandas_columns(df, cols, lags, drop_na):
    """
    Append lagged copies of the selected columns to a DataFrame.

    :param df: the input ``pd.DataFrame``.
    :param cols: column name(s).
    :param lags: iterable of lags; every selected column is shifted by ``-lag``
        rows for every lag and named ``<column><lag>``.
    :param drop_na: remove rows that contain NA values.
    :returns: ``pd.DataFrame`` with the concatenated lagged cols.
    :raises KeyError: if one of ``cols`` is not a column of ``df``.
    """
    cols = as_list(cols)

    # Indexes are not supported as pandas column names may be
    # integers themselves, introducing unexpected behaviour
    if not all(col in df.columns.values for col in cols):
        raise KeyError("The column does not exist")

    # An f-string builds the same name as ``col + str(lag)`` for string labels,
    # and also works for labels that are not strings (e.g. integer column names)
    combos = (
        df[col].shift(-lag).rename(f"{col}{lag}")
        for col in cols
        for lag in lags
    )

    answer = pd.concat([df, *combos], axis=1)

    # Remove rows that contain NA values when drop_na is truthy
    if drop_na:
        answer = answer.dropna()

    return answer
def __check_cols_exist(X, cols):
    """
    Verify that every requested column is present in ``X``.

    :param X: the data, already converted to a ``pd.DataFrame``.
    :param cols: column name(s) that must be present.
    :raises ValueError: if ``X`` has no columns at all, or if any of ``cols``
        is missing from the columns of ``X``.
    """
    n_rows, n_features = X.shape[0], X.shape[1]
    if n_features == 0:
        raise ValueError(f"0 feature(s) (shape=({n_rows}, 0)) while a minimum of 1 is required.")

    available = set(X.columns)
    missing = set(as_list(cols)).difference(available)
    if missing:
        raise ValueError(f'{missing} not in columns of X {available}')
def _check_coltype(self, X):
    """
    Check that every entry of ``self.columns`` can be used to address a column of ``X``.

    :param X: the input data, a ``np.ndarray`` or a ``pd.DataFrame``.
    :raises ValueError: for a string column combined with a numpy input, for a
        string column that is not in the DataFrame, or for an integer column
        outside the column range of ``X``.
    """
    for col in as_list(self.columns):
        if isinstance(col, str):
            # Names can only be resolved against a DataFrame
            if isinstance(X, np.ndarray):
                raise ValueError(f"column {col} is a string but datatype receive is numpy.")
            if isinstance(X, pd.DataFrame) and col not in X.columns:
                raise ValueError(f"column {col} is not in {X.columns}")
        if isinstance(col, int):
            # Positions are validated against the 2D view of the data
            n_columns = np.atleast_2d(np.array(X)).shape[1]
            if col not in range(n_columns):
                raise ValueError(f"column {col} is out of bounds for input shape {X.shape}")
def __check_group_cols_exist(self, X):
    """
    Verify that every column listed in ``self.groups`` can be found in ``X``.

    For a ``pd.DataFrame`` the column labels are used; for anything else the
    columns are addressed by position (a 1D input counts as a single column).

    :param X: the input data.
    :raises KeyError: if one or more grouping columns are not available in ``X``.
    """
    if isinstance(X, pd.DataFrame):
        x_cols = set(X.columns)
    elif X.ndim == 1:
        x_cols = set(range(1))
    else:
        x_cols = set(range(X.shape[1]))

    missing = set(as_list(self.groups)) - x_cols
    if missing:
        raise KeyError(f'{missing} not in columns of X ({x_cols})')
def fit(self, X, y=None):
    """
    Store the selected columns as a list and validate them against ``X``.

    The checks run in this order: the type of ``X``, the length of the column
    selection, and the presence of the column names in ``X``.

    :param X: ``pd.DataFrame`` on which we apply the column selection
    :param y: ``pd.Series`` labels for X. unused for column selection
    :returns: ``ColumnSelector`` object.
    """
    selected = as_list(self.columns)
    self.columns_ = selected

    # The checks are delegated to helper methods of this class
    self._check_X_for_type(X)
    self._check_column_length()
    self._check_column_names(X)

    return self
def fit(self, X, y=None):
    """
    Learn the projection required to make the dataset orthogonal to sensitive columns.

    :param X: the training data; the columns in ``self.columns`` are the sensitive ones.
    :param y: unused.
    :returns: the fitted instance, with ``col_ids_`` and ``projection_`` set.
    """
    self._check_coltype(X)

    # Integer entries are used as positions directly, anything else is looked up
    col_ids = []
    for v in as_list(self.columns):
        col_ids.append(v if isinstance(v, int) else self._col_idx(X, v))
    self.col_ids_ = col_ids

    X = check_array(X, estimator=self)
    X_fair = X.copy()
    v_vectors = self._make_v_vectors(X, self.col_ids_)

    # Gram-Schmidt process, but only on the sensitive attributes: the projection
    # on every v-vector is removed from every column in turn
    for i in range(X_fair.shape[1]):
        for v in v_vectors.T:
            X_fair[:, i] = X_fair[:, i] - vector_projection(X_fair[:, i], v)

    # We want to learn the matrix P in X P = X_fair, which is why X_fair
    # had to be created first; only the solution of lstsq is kept
    lstsq_result = np.linalg.lstsq(X, X_fair, rcond=None)
    self.projection_ = lstsq_result[0]
    return self
def __prepare_input_data(self, X, y=None):
    """
    Convert numpy inputs to pandas objects and add the global grouping column.

    :param X: the input data; a ``np.ndarray`` becomes a ``pd.DataFrame`` whose
        column names are the stringified column positions.
    :param y: optional targets; a 1D ``np.ndarray`` becomes a ``pd.Series``,
        an array with more dimensions becomes a ``pd.DataFrame`` with one
        named column per target. Other types are returned unchanged.
    :returns: ``(X, y)`` when ``y`` is given, otherwise only ``X``.
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[str(_) for _ in range(X.shape[1])])

    if self.shrinkage is not None and self.use_global_model:
        # A constant column acts as the top level of the group hierarchy
        global_col = "a-column-that-is-constant-for-all-data"
        X = X.assign(**{global_col: "global"})
        groups = as_list(self.groups)
        # Only prepend once: this method runs on every call, and prepending
        # unconditionally would duplicate the global column in ``self.groups``
        if global_col not in groups:
            self.groups = [global_col] + groups

    if y is not None:
        if isinstance(y, np.ndarray):
            pred_col = 'the-column-that-i-want-to-predict-but-dont-have-the-name-for'
            if y.ndim == 1:
                y = pd.Series(y, name=pred_col)
            else:
                # f-string instead of "_".join, which only accepts strings and
                # therefore failed on the integer column position
                cols = [f"{pred_col}_{i}" for i in range(y.shape[1])]
                y = pd.DataFrame(y, columns=cols)

        return X, y

    return X
def fit(self, X, y=None):
    """
    Fit one estimator per group (or per hierarchical group level when shrinkage is used).

    The groups present in the training data are stored on the instance as well.

    :param X: training data containing both the grouping and the value columns.
    :param y: training targets.
    :return: Returns an instance of self.
    """
    X_group, X_value = _split_groups_and_values(
        X,
        self.groups,
        min_value_cols=0,
        check_X=self.check_X,
        **self._check_kwargs,
    )
    X_group = self.__add_shrinkage_column(X_group)

    if y is not None:
        y = check_array(y, ensure_2d=False)

    use_shrinkage = self.shrinkage is not None
    if use_shrinkage:
        self.__set_shrinkage_function()

    # All hierarchical subsets of the grouping columns
    self.group_colnames_hierarchical_ = expanding_list(X_group.columns, list)

    # A global fallback model is only fitted when no shrinkage is applied
    self.fallback_ = None
    if not use_shrinkage and self.use_global_model:
        self.fallback_ = clone(self.estimator).fit(X_value, y)

    if use_shrinkage:
        fitted = self.__fit_shrinkage_groups(X_group, X_value, y)
    else:
        fitted = self.__fit_grouped_estimator(X_group, X_value, y)
    self.estimators_ = fitted

    self.groups_ = as_list(self.estimators_.keys())

    if use_shrinkage:
        self.shrinkage_factors_ = self.__get_shrinkage_factor(X_group)

    return self
def __validate(self, X, y=None):
    """
    Validate the input, used in both fit and predict.

    :param X: ``pd.DataFrame`` holding the grouping and the value columns.
    :param y: optional targets; None when called from predict.
    :raises ValueError: if shrinkage is requested with a single group while
        ``use_global_model`` is False.
    """
    if self.shrinkage and len(as_list(self.groups)) == 1 and not self.use_global_model:
        raise ValueError("Cannot do shrinkage with a single group if use_global_model is False")

    for required_columns in (self.value_colnames_, self.group_colnames_):
        self.__check_cols_exist(X, required_columns)

    # The value columns are split from the grouping columns and checked `regularly`
    X_data = X.loc[:, self.value_colnames_]
    has_values = X_data.shape[1] > 0
    has_target = y is not None

    # y can be None because __validate is used in predict, and X can have
    # no value columns if the estimator only uses y
    if has_target:
        if has_values:
            check_X_y(X_data, y, multi_output=True)
        else:
            check_array(y, ensure_2d=False)
    elif has_values:
        check_array(X_data)

    self.__check_missing_and_inf(X)
def add_lags(X, cols, lags, drop_na=True):
    """
    Appends lag column(s).

    :param X: ``pd.DataFrame`` or ``np.ndarray`` to which the lag columns are appended.
    :param cols: column name(s) or index (indices).
    :param lags: the amount of lag for each col; a single int or a list of ints.
    :param drop_na: remove rows that contain NA values.
    :returns: ``pd.DataFrame | np.ndarray`` with the lagged cols appended.
    :raises ValueError: if a lag is not an ``int`` or if ``X`` is neither a
        ``pd.DataFrame`` nor a ``np.ndarray``.

    :Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9]],
    ...                   columns=['a', 'b', 'c'],
    ...                   index=[1, 2, 3])

    >>> add_lags(df, 'a', [1]) # doctest: +NORMALIZE_WHITESPACE
       a  b  c   a1
    1  1  2  3  4.0
    2  4  5  6  7.0

    >>> add_lags(df, ['a', 'b'], 2) # doctest: +NORMALIZE_WHITESPACE
       a  b  c   a2   b2
    1  1  2  3  7.0  8.0

    >>> import numpy as np
    >>> X = np.array([[1, 2, 3],
    ...               [4, 5, 6],
    ...               [7, 8, 9]])

    >>> add_lags(X, 0, [1])
    array([[1, 2, 3, 4],
           [4, 5, 6, 7]])

    >>> add_lags(X, 1, [-1, 1])
    array([[4, 5, 6, 2, 8]])
    """
    # A single lag will be put in a list
    lags = as_list(lags)

    # Now we can iterate over the list to determine
    # whether it is a list of integers
    if not all(isinstance(x, int) for x in lags):
        raise ValueError("lags must be a list of type: " + str(int))

    # The keys of the allowed_inputs dict contain the allowed
    # types, and the values contain the associated handlers
    allowed_inputs = {
        pd.DataFrame: _add_lagged_pandas_columns,
        np.ndarray: _add_lagged_numpy_columns,
    }

    # Choose the correct handler based on the input class
    for allowed_input, handler in allowed_inputs.items():
        if isinstance(X, allowed_input):
            return handler(X, cols, lags, drop_na)

    # Otherwise, raise a ValueError with a single, readable message
    # (passing the list as a second argument rendered the message as a tuple)
    allowed_input_names = list(allowed_inputs)
    raise ValueError(f"X type should be one of: {allowed_input_names}")
def __init__(self, columns, alpha=1):
    """
    Store the constructor arguments.

    :param columns: the column(s) this object operates on, kept exactly as passed.
    :param alpha: stored unchanged as ``self.alpha``.
    """
    self.alpha = alpha
    self.columns = columns
    # sklearn does not allow `as_list` immediately because of cloning reasons,
    # so the list version is kept under a separate attribute
    self.cols = as_list(columns)
def test_as_list_strings():
    """A single string is wrapped in a list, a list of strings is returned as an equal list."""
    wrapped = as_list("test")
    assert wrapped == ["test"]

    unchanged = as_list(["test1", "test2"])
    assert unchanged == ["test1", "test2"]
def __init__(self, columns: list):
    """
    Store the columns as a list.

    :param columns: the column(s) to keep; converted with ``as_list``.
    """
    # if the columns parameter is not a list, make it into a list
    column_list = as_list(columns)
    self.columns = column_list
def test_as_list_ints():
    """A single int is wrapped in a list, a list of ints is returned as an equal list."""
    wrapped = as_list(123)
    assert wrapped == [123]

    unchanged = as_list([1, 2, 3])
    assert unchanged == [1, 2, 3]
def test_as_list_strings():
    """Check ``as_list`` on a single string and on a list of strings."""
    cases = (
        ('test', ['test']),
        (['test1', 'test2'], ['test1', 'test2']),
    )
    for given, expected in cases:
        assert as_list(given) == expected
def test_as_list_other():
    """Check ``as_list`` on a function object and on a ``range``."""

    def f():
        return 123

    # A function object ends up as the only element of the list
    wrapped = as_list(f)
    assert wrapped == [f]

    # A range is expanded into its elements
    expanded = as_list(range(1, 4))
    assert expanded == [1, 2, 3]