Esempio n. 1
0
def _split_groups_and_values(X,
                             groups,
                             name="",
                             min_value_cols=1,
                             check_X=True,
                             **kwargs) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Split ``X`` into its grouping columns and its value columns.

    :param X: ``pd.DataFrame``, or an object that is indexed positionally like a
        2-D ``np.ndarray``, holding both the grouping and the value columns.
    :param groups: column name(s) when ``X`` is a DataFrame, otherwise positional
        column index/indices. A single value or any iterable accepted by ``as_list``.
    :param name: forwarded to ``_data_format_checks``.
    :param min_value_cols: forwarded to ``_shape_check``.
    :param check_X: when truthy, the value columns are passed through ``check_array``.
    :param kwargs: forwarded to both ``_check_grouping_columns`` and ``check_array``.
    :return: tuple ``(X_group, X_value)``: the grouping columns as returned by
        ``_check_grouping_columns`` and the remaining columns as an array.
    :raises ValueError: if the grouping columns cannot be selected from ``X``.
    """
    _data_format_checks(X, name=name)
    _shape_check(X, min_value_cols)

    # Normalise once so that selecting and dropping always see the same labels
    # (a tuple of names would otherwise be selected as several columns but
    # dropped as one single label).
    group_cols = as_list(groups)

    try:
        if isinstance(X, pd.DataFrame):
            X_group = X.loc[:, group_cols]
            X_value = X.drop(columns=group_cols).values
        else:
            X_group = pd.DataFrame(X[:, group_cols])
            # Indexing a range turns negative positions into positive ones and
            # raises IndexError for positions that are out of bounds.
            pos_indexes = range(X.shape[1])
            X_value = np.delete(X, [pos_indexes[g] for g in group_cols],
                                axis=1)
    except (KeyError, IndexError) as err:
        raise ValueError(
            f"Could not drop groups {groups} from columns of X") from err

    X_group = _check_grouping_columns(X_group, **kwargs)

    if check_X:
        X_value = check_array(X_value, **kwargs)

    return X_group, X_value
Esempio n. 2
0
    def fit(self, X, y=None):
        """
        Train the estimator(s) per group on X and y, recording which groups occur in the data.

        :param X: array-like, shape=(n_samples, n_columns) training data, grouping columns included.
        :param y: array-like, shape=(n_samples,) training data.
        :return: Returns an instance of self.
        """
        X, y = self.__prepare_input_data(X, y)

        if self.shrinkage is not None:
            self.__set_shrinkage_function()

        # Column names are handled as strings from here on
        self.group_colnames_ = list(map(str, as_list(self.groups)))

        if self.value_columns is None:
            # No explicit value columns: use every column that is not a grouping column
            self.value_colnames_ = [
                name for name in X.columns if name not in self.group_colnames_
            ]
        else:
            self.value_colnames_ = list(map(str, as_list(self.value_columns)))
        self.__validate(X, y)

        # Every hierarchical subset of the grouping columns
        self.group_colnames_hierarchical_ = expanding_list(
            self.group_colnames_, list)

        self.fallback_ = None

        # Without shrinkage the global model is a separate estimator fitted on all rows
        if self.use_global_model and self.shrinkage is None:
            all_rows = X[self.value_colnames_]
            self.fallback_ = clone(self.estimator).fit(all_rows, y)

        if self.shrinkage is None:
            self.estimators_ = self.__fit_grouped_estimator(
                X, y, self.value_colnames_, self.group_colnames_)
        else:
            # With shrinkage, estimators are collected for every hierarchy level
            self.estimators_ = {}

            for level_colnames in self.group_colnames_hierarchical_:
                fitted_level = self.__fit_grouped_estimator(
                    X, y, self.value_colnames_, level_colnames)
                self.estimators_.update(fitted_level)

        self.groups_ = as_list(self.estimators_.keys())

        if self.shrinkage is not None:
            self.shrinkage_factors_ = self.__get_shrinkage_factor(X)

        return self
    def __get_shrinkage_factor(self, X_group):
        """Return, for every fully specified group, its array of shrinkage factors"""
        level_names = X_group.columns.to_list()
        n_levels = len(level_names)
        group_sizes = X_group.groupby(level_names).size()

        # Only groups that are split on all columns are considered. For each of
        # them, collect the number of observations at every hierarchy level.
        hierarchical_counts = {}
        for grp in self.groups_:
            if len(as_list(grp)) != n_levels:
                continue
            hierarchical_counts[grp] = [
                group_sizes[tuple(prefix)].sum()
                for prefix in expanding_list(grp, tuple)
            ]

        # Turn the observation counts of each group into shrinkage factors
        raw_factors = {}
        for grp, level_counts in hierarchical_counts.items():
            raw_factors[grp] = self.shrinkage_function_(
                level_counts, **self.shrinkage_kwargs)

        # Rescale so that the factors of each group sum to one
        normalised_factors = {}
        for grp, factors in raw_factors.items():
            normalised_factors[grp] = factors / factors.sum()

        return normalised_factors
    def __set_shrinkage_function(self):
        """
        Resolve ``self.shrinkage`` into the callable stored as ``self.shrinkage_function_``.

        :raises ValueError: if shrinkage is requested with a single group while
            ``use_global_model`` is False, if a string does not name a predefined
            shrinkage function, or if ``self.shrinkage`` is neither str nor callable.
        """
        if self.shrinkage:
            single_group = len(as_list(self.groups)) == 1
            if single_group and not self.use_global_model:
                raise ValueError(
                    "Cannot do shrinkage with a single group if use_global_model is False"
                )

        if isinstance(self.shrinkage, str):
            # Shrinkage functions that can be selected by name
            shrink_options = {
                "constant": constant_shrinkage,
                "relative": relative_shrinkage,
                "min_n_obs": min_n_obs_shrinkage,
            }

            try:
                self.shrinkage_function_ = shrink_options[self.shrinkage]
            except KeyError:
                raise ValueError(
                    f"The specified shrinkage function {self.shrinkage} is not valid, "
                    f"choose from {list(shrink_options.keys())} or supply a callable."
                )
            return

        if not callable(self.shrinkage):
            raise ValueError(
                "Invalid shrinkage specified. Should be either None (no shrinkage), str or callable."
            )

        # A user supplied callable is checked before it is accepted
        self.__check_shrinkage_func()
        self.shrinkage_function_ = self.shrinkage
Esempio n. 5
0
    def fit(self, X, y):
        """
        Fit the model using X, y as training data. Will also learn the groups
        that exist within the dataset.

        One clone of ``self.estimator`` is fitted per distinct combination of
        values in the grouping columns; when ``self.use_fallback`` is truthy an
        additional clone is fitted on all rows and stored as ``self.fallback_``.

        :param X: array-like, shape=(n_samples, n_columns) training data,
            including the grouping columns.
        :param y: array-like, shape=(n_samples,) training data.
        :return: Returns an instance of self.
        :raises ValueError: if a grouping column is not a column of X.
        """
        check_X_y(X, y)
        # Temporary column that carries y along with X through the groupby
        pred_col = 'the-column-that-i-want-to-predict-but-dont-have-the-name-for'
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=[str(_) for _ in range(X.shape[1])])
        X = X.assign(**{pred_col: y})

        self.group_colnames_ = [str(_) for _ in as_list(self.groups)]
        if any(c not in X.columns for c in self.group_colnames_):
            raise ValueError(f"{self.group_colnames_} not in {X.columns}")

        # Compare by value: ``is not`` tests object identity, which only excludes
        # the target column if the column index happens to hand back the very
        # same string object.
        self.X_colnames_ = [
            _ for _ in X.columns
            if _ not in self.group_colnames_ and _ != pred_col
        ]
        self.fallback_ = None
        if self.use_fallback:
            subset_x = X[self.X_colnames_]
            self.fallback_ = clone(self.estimator).fit(subset_x, y)

        self.groups_ = X[self.group_colnames_].drop_duplicates()

        # Maps each group key to the estimator fitted on the rows of that group
        self.estimators_ = (X
                            .groupby(self.group_colnames_)
                            .apply(lambda d: clone(self.estimator).fit(d[self.X_colnames_], d[pred_col]))
                            .to_dict())
        return self
Esempio n. 6
0
def _add_lagged_numpy_columns(X, cols, lags, drop_na):
    """
    Append lagged copies of the selected columns to a matrix.

    :param X: the input ``np.ndarray``.
    :param cols: column index / indices.
    :param lags: iterable with the lag(s) to apply to each of ``cols``.
    :param drop_na: remove rows that contain NA values.
    :returns: ``np.ndarray`` with the concatenated lagged cols.
    :raises ValueError: if a column index is not an ``int``.
    :raises KeyError: if a column index is not smaller than the number of columns.
    """

    cols = as_list(cols)

    if not all(isinstance(col, int) for col in cols):
        raise ValueError("Matrix columns are indexed by integers")

    if not all(col < X.shape[1] for col in cols):
        raise KeyError("The column does not exist")

    # In integer-based ndarrays, NaN values are represented as
    # -9223372036854775808, so we convert back and forth from
    # original to float and back to original dtype. The conversion
    # happens before shifting so that NaN can be used as fill value.
    original_type = X.dtype
    X = np.asarray(X, dtype=float)

    # ``np.nan`` instead of ``np.NaN``: the latter alias no longer exists in NumPy 2
    combos = [shift(X[:, col], -lag, cval=np.nan) for col in cols
              for lag in lags]
    answer = np.column_stack((X, *combos))

    # Remove rows that contain NA values when drop_na is truthy
    if drop_na:
        answer = answer[~np.isnan(answer).any(axis=1)]

    # Change dtype back to its original
    answer = np.asarray(answer, dtype=original_type)
    return answer
Esempio n. 7
0
def _add_lagged_pandas_columns(df, cols, lags, drop_na):
    """
    Append lagged copies of the selected columns to a DataFrame.

    Each new column is named after its source column followed by the lag,
    e.g. column ``a`` with lag ``1`` becomes ``a1``.

    :param df: the input ``pd.DataFrame``.
    :param cols: column name(s).
    :param lags: iterable with the lag(s) to apply to each of ``cols``.
    :param drop_na: remove rows that contain NA values.
    :returns: ``pd.DataFrame`` with the concatenated lagged cols.
    :raises KeyError: if one of ``cols`` is not a column of ``df``.
    """

    cols = as_list(cols)

    # Indexes are not supported as pandas column names may be
    # integers themselves, introducing unexpected behaviour
    existing_cols = df.columns.values
    if not all(col in existing_cols for col in cols):
        raise KeyError("The column does not exist")

    # Build the new name with an f-string so that non-string labels
    # (e.g. integer column names) do not fail on ``label + str``
    combos = (df[col].shift(-lag).rename(f"{col}{lag}") for col in cols
              for lag in lags)

    answer = pd.concat([df, *combos], axis=1)

    # Remove rows that contain NA values when drop_na is truthy
    if drop_na:
        answer = answer.dropna()

    return answer
Esempio n. 8
0
    def __check_cols_exist(X, cols):
        """Raise a ValueError if X has no columns or if any of ``cols`` is not a column of X"""
        if not X.shape[1]:
            raise ValueError(f"0 feature(s) (shape=({X.shape[0]}, 0)) while a minimum of 1 is required.")

        # ``.columns`` is available because X has been converted to a DataFrame
        x_cols = set(X.columns)
        diff = set(as_list(cols)).difference(x_cols)

        if diff:
            raise ValueError(f'{diff} not in columns of X {x_cols}')
Esempio n. 9
0
 def _check_coltype(self, X):
     """
     Check that every entry of ``self.columns`` can address a column of X.

     String entries require X to be something other than a numpy array and, for
     a DataFrame, must be one of its column names. Integer entries must be a
     valid non-negative position given the number of columns of X.

     :param X: the data to check against, e.g. ``pd.DataFrame``, ``np.ndarray`` or
         anything ``np.array`` can convert.
     :raises ValueError: if a column cannot be found in X.
     """
     n_cols = None  # determined lazily, and only once, when an int column shows up
     for col in as_list(self.columns):
         if isinstance(col, str):
             if isinstance(X, np.ndarray):
                 raise ValueError(f"column {col} is a string but datatype receive is numpy.")
             if isinstance(X, pd.DataFrame):
                 if col not in X.columns:
                     raise ValueError(f"column {col} is not in {X.columns}")
         if isinstance(col, int):
             if n_cols is None:
                 n_cols = np.atleast_2d(np.array(X)).shape[1]
             if col not in range(n_cols):
                 # np.shape also works for inputs without a ``.shape`` attribute
                 # (e.g. nested lists), which would otherwise hide this error
                 # behind an AttributeError
                 raise ValueError(f"column {col} is out of bounds for input shape {np.shape(X)}")
Esempio n. 10
0
    def __check_group_cols_exist(self, X):
        """Raise a KeyError if any of the configured grouping columns is not a column of X"""
        if not isinstance(X, pd.DataFrame):
            # Anything else is addressed by position; a 1-D input counts as one column
            ncols = X.shape[1] if X.ndim != 1 else 1
            x_cols = set(range(ncols))
        else:
            x_cols = set(X.columns)

        diff = set(as_list(self.groups)).difference(x_cols)
        if diff:
            raise KeyError(f'{diff} not in columns of X ({x_cols})')
Esempio n. 11
0
    def fit(self, X, y=None):
        """
        Validate the input: 1) is it a DataFrame, and 2) are the column names present in it

        The configured columns are stored in list form as ``self.columns_`` before
        the checks run.

        :param X: ``pd.DataFrame`` on which we apply the column selection
        :param y: ``pd.Series`` labels for X. unused for column selection
        :returns: ``ColumnSelector`` object.
        """
        column_list = as_list(self.columns)
        self.columns_ = column_list

        self._check_X_for_type(X)
        self._check_column_length()
        self._check_column_names(X)

        return self
Esempio n. 12
0
 def fit(self, X, y=None):
     """Fit the projection that makes the dataset orthogonal to the sensitive columns."""
     self._check_coltype(X)

     # Resolve every configured column to a positional index
     col_ids = []
     for v in as_list(self.columns):
         col_ids.append(v if isinstance(v, int) else self._col_idx(X, v))
     self.col_ids_ = col_ids

     X = check_array(X, estimator=self)
     X_fair = X.copy()
     v_vectors = self._make_v_vectors(X, self.col_ids_)

     # Gram-Schmidt process, but only with respect to the sensitive attributes
     for i in range(X_fair.shape[1]):
         for v in v_vectors.T:
             X_fair[:, i] = X_fair[:, i] - vector_projection(X_fair[:, i], v)

     # The matrix P with X P = X_fair is what gets learned, which is why
     # X_fair has to be constructed first
     solution = np.linalg.lstsq(X, X_fair, rcond=None)
     self.projection_ = solution[0]
     return self
Esempio n. 13
0
    def __prepare_input_data(self, X, y=None):
        """
        Bring X (and y, when given) into the pandas form used by the estimator.

        A numpy X becomes a DataFrame whose columns are named "0", "1", ...
        When shrinkage is used together with a global model, a constant column
        is added to X and registered as the first grouping column.
        A numpy y becomes a named Series (1-D) or a DataFrame (otherwise).

        :param X: ``pd.DataFrame`` or ``np.ndarray`` with the input data.
        :param y: optional target values.
        :return: ``X`` when ``y`` is None, otherwise the tuple ``(X, y)``.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=[str(_) for _ in range(X.shape[1])])

        if self.shrinkage is not None and self.use_global_model:
            global_col = "a-column-that-is-constant-for-all-data"
            X = X.assign(**{global_col: "global"})
            groups = as_list(self.groups)
            # Register the global column only once: this method runs on every
            # call that prepares data, and prepending unconditionally would
            # stack the same column in self.groups again and again.
            if global_col not in groups:
                self.groups = [global_col] + groups

        if y is None:
            return X

        if isinstance(y, np.ndarray):
            pred_col = 'the-column-that-i-want-to-predict-but-dont-have-the-name-for'
            if y.ndim == 1:
                y = pd.Series(y, name=pred_col)
            else:
                # One name per target column; the index is formatted into the
                # name because str.join only accepts strings
                cols = [f"{pred_col}_{i}" for i in range(y.shape[1])]
                y = pd.DataFrame(y, columns=cols)

        return X, y
Esempio n. 14
0
    def fit(self, X, y=None):
        """
        Train the estimator(s) per group on X and y, recording which groups occur in the data.

        :param X: array-like, shape=(n_samples, n_columns) training data, grouping columns included.
        :param y: array-like, shape=(n_samples,) training data.
        :return: Returns an instance of self.
        """

        X_group, X_value = _split_groups_and_values(
            X,
            self.groups,
            min_value_cols=0,
            check_X=self.check_X,
            **self._check_kwargs,
        )
        X_group = self.__add_shrinkage_column(X_group)

        if y is not None:
            y = check_array(y, ensure_2d=False)

        if self.shrinkage is not None:
            self.__set_shrinkage_function()

        # Every hierarchical subset of the grouping columns
        self.group_colnames_hierarchical_ = expanding_list(
            X_group.columns, list)

        self.fallback_ = None

        # Without shrinkage the global model is a separate estimator fitted on all rows
        if self.use_global_model and self.shrinkage is None:
            self.fallback_ = clone(self.estimator).fit(X_value, y)

        if self.shrinkage is None:
            self.estimators_ = self.__fit_grouped_estimator(
                X_group, X_value, y)
        else:
            self.estimators_ = self.__fit_shrinkage_groups(X_group, X_value, y)

        self.groups_ = as_list(self.estimators_.keys())

        if self.shrinkage is not None:
            self.shrinkage_factors_ = self.__get_shrinkage_factor(X_group)

        return self
Esempio n. 15
0
    def __validate(self, X, y=None):
        """Check the input; shared by fit and predict"""
        if self.shrinkage:
            single_group = len(as_list(self.groups)) == 1
            if single_group and not self.use_global_model:
                raise ValueError("Cannot do shrinkage with a single group if use_global_model is False")

        self.__check_cols_exist(X, self.value_colnames_)
        self.__check_cols_exist(X, self.group_colnames_)

        # The model data, separated from the grouping columns, gets the `regular` checks
        X_data = X.loc[:, self.value_colnames_]
        has_value_cols = X_data.shape[1] > 0

        # y is None when called from predict; X_data has no columns when the
        # estimator only uses y
        if y is None:
            if has_value_cols:
                check_array(X_data)
        elif has_value_cols:
            check_X_y(X_data, y, multi_output=True)
        else:
            check_array(y, ensure_2d=False)

        self.__check_missing_and_inf(X)
Esempio n. 16
0
def add_lags(X, cols, lags, drop_na=True):
    """
    Appends lag column(s).

    :param X: ``pd.DataFrame`` or ``np.ndarray`` holding the data.
    :param cols: column name(s) or index (indices).
    :param lags: the amount of lag for each col, a single ``int`` or a list of ``int``.
    :param drop_na: remove rows that contain NA values.
    :returns: ``pd.DataFrame | np.ndarray`` with the lagged cols appended.
    :raises ValueError: if a lag is not an ``int`` or if X is of an unsupported type.

    :Example:

    >>> import pandas as pd
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9]],
    ...                    columns=['a', 'b', 'c'],
    ...                    index=[1, 2, 3])

    >>> add_lags(df, 'a', [1]) # doctest: +NORMALIZE_WHITESPACE
       a  b  c  a1
    1  1  2  3  4.0
    2  4  5  6  7.0

    >>> add_lags(df, ['a', 'b'], 2) # doctest: +NORMALIZE_WHITESPACE
       a  b  c  a2   b2
    1  1  2  3  7.0  8.0

    >>> import numpy as np
    >>> X = np.array([[1, 2, 3],
    ...               [4, 5, 6],
    ...               [7, 8, 9]])

    >>> add_lags(X, 0, [1])
    array([[1, 2, 3, 4],
           [4, 5, 6, 7]])

    >>> add_lags(X, 1, [-1, 1])
    array([[4, 5, 6, 2, 8]])
    """

    # A single lag will be put in a list
    lags = as_list(lags)

    # Now we can iterate over the list to determine
    # whether it is a list of integers
    if not all(isinstance(x, int) for x in lags):
        raise ValueError("lags must be a list of type: " + str(int))

    # The keys of the allowed_inputs dict contain the allowed
    # types, and the values contain the associated handlers
    allowed_inputs = {
        pd.core.frame.DataFrame: _add_lagged_pandas_columns,
        np.ndarray: _add_lagged_numpy_columns,
    }

    # Choose the correct handler based on the input class
    for allowed_input, handler in allowed_inputs.items():
        if isinstance(X, allowed_input):
            return handler(X, cols, lags, drop_na)

    # Otherwise, raise a ValueError. The allowed types are formatted into the
    # message; passing them as a second argument would render the message as a tuple.
    allowed_input_names = list(allowed_inputs.keys())
    raise ValueError(f"X type should be one of: {allowed_input_names}")
Esempio n. 17
0
 def __init__(self, columns, alpha=1):
     self.alpha = alpha
     self.columns = columns
     # ``columns`` is stored untouched because of how sklearn clones estimators;
     # the list form is kept separately
     self.cols = as_list(columns)
Esempio n. 18
0
def test_as_list_strings():
    """A single string is wrapped in a list, a list of strings comes back as an equal list."""
    cases = [
        ("test", ["test"]),
        (["test1", "test2"], ["test1", "test2"]),
    ]
    for value, expected in cases:
        assert as_list(value) == expected
Esempio n. 19
0
 def __init__(self, columns: list):
     # a ``columns`` argument that is not a list yet is turned into one
     column_list = as_list(columns)
     self.columns = column_list
Esempio n. 20
0
def test_as_list_ints():
    """A single int is wrapped in a list, a list of ints comes back as an equal list."""
    cases = [
        (123, [123]),
        ([1, 2, 3], [1, 2, 3]),
    ]
    for value, expected in cases:
        assert as_list(value) == expected
Esempio n. 21
0
def test_as_list_strings():
    """as_list wraps a lone string and leaves the content of a list of strings as is."""
    wrapped = as_list('test')
    assert wrapped == ['test']

    names = ['test1', 'test2']
    assert as_list(list(names)) == names
Esempio n. 22
0
def test_as_list_other():
    """A function object is wrapped in a list, a range is expanded into its elements."""
    def return_number():
        return 123

    wrapped = as_list(return_number)
    assert wrapped == [return_number]

    expanded = as_list(range(1, 4))
    assert expanded == [1, 2, 3]