Example #1
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: Optional[List[str]] = None,
    feature_types: Optional[List[str]] = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
    import pandas as pd
    from pandas.api.types import is_sparse, is_categorical_dtype

    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in data.dtypes):
        _invalid_dataframe_dtype(data)

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
        for i, dtype in enumerate(data.dtypes):
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    # handle category codes.
    transformed = pd.DataFrame()
    if enable_categorical:
        for i, dtype in enumerate(data.dtypes):
            if is_categorical_dtype(dtype):
                # pandas uses -1 as default missing value for categorical data
                transformed[data.columns[i]] = (
                    data[data.columns[i]].cat.codes.astype(np.float32).replace(
                        -1.0, np.NaN))
            else:
                transformed[data.columns[i]] = data[data.columns[i]]
    else:
        transformed = data

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype = meta_type if meta_type else np.float32
    arr = transformed.values
    if meta_type:
        arr = arr.astype(meta_type)
    return arr, feature_names, feature_types
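
A minimal standalone sketch of the category-code handling above, using only pandas and NumPy (the data here is made up for illustration and is not part of the xgboost source):

import numpy as np
import pandas as pd

# pandas encodes a missing categorical value with the code -1;
# the snippet above casts the codes to float32 and maps -1 to NaN.
s = pd.Series(["a", "b", None, "a"], dtype="category")
codes = s.cat.codes.astype(np.float32).replace(-1.0, np.nan)
print(codes.tolist())  # [0.0, 1.0, nan, 0.0]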
Example #2
    def _maybe_pandas_data(self,
                           data,
                           feature_names,
                           feature_types,
                           meta=None,
                           meta_type=None):
        """Extract internal data from pd.DataFrame for DMatrix data"""
        if lazy_isinstance(data, 'pandas.core.series', 'Series'):
            dtype = meta_type if meta_type else 'float'
            return data.values.astype(dtype), feature_names, feature_types

        from pandas.api.types import is_sparse
        from pandas import MultiIndex, Int64Index

        data_dtypes = data.dtypes
        if not all(dtype.name in self.pandas_dtype_mapper or is_sparse(dtype)
                   for dtype in data_dtypes):
            bad_fields = [
                str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
                if dtype.name not in self.pandas_dtype_mapper
            ]

            msg = """DataFrame.dtypes for data must be int, float or bool.
                    Did not expect the data types in fields """
            raise ValueError(msg + ', '.join(bad_fields))

        if feature_names is None and meta is None:
            if isinstance(data.columns, MultiIndex):
                feature_names = [
                    ' '.join([str(x) for x in i]) for i in data.columns
                ]
            elif isinstance(data.columns, Int64Index):
                feature_names = list(map(str, data.columns))
            else:
                feature_names = data.columns.format()

        if feature_types is None and meta is None:
            feature_types = []
            for dtype in data_dtypes:
                if is_sparse(dtype):
                    feature_types.append(
                        self.pandas_dtype_mapper[dtype.subtype.name])
                else:
                    feature_types.append(self.pandas_dtype_mapper[dtype.name])

        if meta and len(data.columns) > 1:
            raise ValueError(
                'DataFrame for {meta} cannot have multiple columns'.format(
                    meta=meta))

        dtype = meta_type if meta_type else 'float'
        data = data.values.astype(dtype)

        return data, feature_names, feature_types
Example #3
def _transform_pandas_df(data,
                         enable_categorical,
                         feature_names=None,
                         feature_types=None,
                         meta=None,
                         meta_type=None):
    from pandas import MultiIndex, Int64Index, RangeIndex
    from pandas.api.types import is_sparse, is_categorical_dtype

    data_dtypes = data.dtypes
    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]

        msg = """DataFrame.dtypes for data must be int, float, bool or categorical.  When
                categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`."""
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (Int64Index, RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append('categorical')
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = data.values
    if meta_type:
        data = data.astype(meta_type)
    return data, feature_names, feature_types
Example #4
def _pandas_feature_info(
    data: DataFrame,
    meta: Optional[str],
    feature_names: FeatureNames,
    feature_types: FeatureTypes,
    enable_categorical: bool,
) -> Tuple[FeatureNames, FeatureTypes]:
    import pandas as pd
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])
    return feature_names, feature_types
Example #5
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
               or is_nullable_dtype(dtype) or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in data.dtypes):
        _invalid_dataframe_dtype(data)

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical)

    transformed = _pandas_cat_null(data)

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype: Union[Type[np.floating],
                 str] = meta_type if meta_type else np.float32
    arr: np.ndarray = transformed.values
    if meta_type:
        arr = arr.astype(dtype)
    return arr, feature_names, feature_types
Example #6
    def inner(series: pd.Series, state: dict, *args, **kwargs) -> bool:
        if pdt.is_sparse(series):
            dtype = series.dtype.subtype
        else:
            dtype = series.dtype
        state["dtype"] = dtype

        return fn(series, state, *args, **kwargs)
Example #7
def _transform_pandas_df(data,
                         feature_names=None,
                         feature_types=None,
                         meta=None,
                         meta_type=None):
    from pandas import MultiIndex, Int64Index
    from pandas.api.types import is_sparse
    data_dtypes = data.dtypes
    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]

        msg = """DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields """
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, Int64Index):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = np.ascontiguousarray(data.values, dtype=dtype)

    return data, feature_names, feature_types
Example #8
def _meta_from_pandas_series(data, name: str, dtype: Optional[str],
                             handle: ctypes.c_void_p) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype('float')
    from pandas.api.types import is_sparse
    if is_sparse(data):
        data = data.to_dense()
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
Example #9
    def test_isna_returns_copy(self, data_missing, na_func):
        result = pd.Series(data_missing)
        expected = result.copy()
        mask = getattr(result, na_func)()
        if is_sparse(mask):
            mask = np.array(mask)

        mask[:] = True
        self.assert_series_equal(result, expected)
Example #10
def _split_sparse(df):
    sparse_col = []
    non_sparse_col = []
    for col in df.columns:
        if is_sparse(df[col]):
            sparse_col.append(col)
        else:
            non_sparse_col.append(col)
    return df[sparse_col], df[non_sparse_col]
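
An illustrative call of the helper above, assuming `_split_sparse` and `from pandas.api.types import is_sparse` are both in scope; the input DataFrame is invented for the demo:

import pandas as pd

df = pd.DataFrame({
    "dense": [1.0, 2.0, 3.0],
    "sparse": pd.arrays.SparseArray([0.0, 0.0, 1.0]),
})
sparse_part, dense_part = _split_sparse(df)
print(list(sparse_part.columns))  # ['sparse']
print(list(dense_part.columns))   # ['dense']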
Example #11
def get_bad_pandas_dtypes(dtypes: list) -> list:
    # from lightgbm's python-package/lightgbm/basic.py
    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
                                                           and (not is_sparse(dtype)
                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
    return bad_indices
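
A hedged usage sketch for the helper above; `is_sparse` is assumed to be imported from `pandas.api.types`, and the DataFrame is invented for the demo:

import pandas as pd

df = pd.DataFrame({"ok": [1, 2], "text": ["a", "b"]})
# 'text' has dtype object, which is neither numeric nor sparse,
# so its positional index is reported as bad.
print(get_bad_pandas_dtypes(list(df.dtypes)))  # [1]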
Example #12
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp("1970-01-01", tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = s.cat.categories[:0]
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_100:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    out = pd.Series(data, name=s.name, index=idx)
    if PANDAS_GT_100:
        out.attrs = s.attrs
    return out
Example #13
def to_dense(dataframe, nonsparse=NONSPARSE_COLUMNS):
    """
    Convert a sparse CLICnet dataframe to dense.
    """
    nonsparse = nonsparse & set(dataframe.columns)
    saved_columns = {}
    for column in nonsparse:
        saved_columns[column] = dataframe.loc[:, column].tolist()
    dataframe = dataframe.drop(columns=nonsparse)
    if is_sparse(dataframe.iloc[:, 0]):
        ret = dataframe.sparse.to_dense()
    else:
        ret = dataframe
    for column, value in saved_columns.items():
        ret.loc[:, column] = value
    return ret
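
A possible call of `to_dense` above, passing `nonsparse` explicitly so the `NONSPARSE_COLUMNS` constant is not needed; `is_sparse` is still assumed to be imported from `pandas.api.types`, and the DataFrame is invented for the demo:

import pandas as pd

df = pd.DataFrame({
    "id": ["a", "b"],
    "x": pd.arrays.SparseArray([0.0, 1.0]),
    "y": pd.arrays.SparseArray([1.0, 0.0]),
})
# 'id' is set aside, the remaining sparse columns are densified,
# and 'id' is re-attached to the result.
dense = to_dense(df, nonsparse={"id"})
print(dense.dtypes.to_dict())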
Example #14
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats,
                              ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
Example #15
def check_user_df(ktk_cube_dataset_id, df, cube, existing_payload,
                  partition_on):
    """
    Check user-provided DataFrame for sanity.

    Parameters
    ----------
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    df: Optional[pandas.DataFrame]
        DataFrame to be passed to Kartothek.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    if df is None:
        return
    if not (isinstance(df, pd.DataFrame) or isinstance(df, dd.DataFrame)):
        raise TypeError(
            'Provided DataFrame is not a pandas.DataFrame or None, but is a "{t}"'
            .format(t=type(df).__name__))
    if any(is_sparse(dtype) for dtype in df.dtypes):
        raise TypeError("Sparse data is not supported.")

    # call this once since `df.columns` can be quite slow
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)
    dcols_present = set(cube.dimension_columns) & df_columns_set

    if len(df_columns) != len(df_columns_set):
        raise ValueError(
            'Duplicate columns found in dataset "{ktk_cube_dataset_id}": {df_columns}'
            .format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                df_columns=", ".join(df_columns),
            ))

    if ktk_cube_dataset_id == cube.seed_dataset:
        missing_dimension_columns = set(
            cube.dimension_columns) - df_columns_set
        if missing_dimension_columns:
            raise ValueError(
                'Missing dimension columns in seed data "{ktk_cube_dataset_id}": {missing_dimension_columns}'
                .format(
                    ktk_cube_dataset_id=ktk_cube_dataset_id,
                    missing_dimension_columns=", ".join(
                        sorted(missing_dimension_columns)),
                ))
    else:
        if len(dcols_present) == 0:
            raise ValueError(
                'Dataset "{ktk_cube_dataset_id}" must have at least 1 of the following dimension columns: {dims}'
                .format(
                    ktk_cube_dataset_id=ktk_cube_dataset_id,
                    dims=", ".join(cube.dimension_columns),
                ))

    missing_partition_columns = set(partition_on) - df_columns_set
    if missing_partition_columns:
        raise ValueError(
            'Missing partition columns in dataset "{ktk_cube_dataset_id}": {missing_partition_columns}'
            .format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                missing_partition_columns=", ".join(
                    sorted(missing_partition_columns)),
            ))

    # Factor this check out. All others can be performed on the dask.DataFrame.
    # This one can only be executed on a pandas DataFame
    if isinstance(df, pd.DataFrame):
        assert_dimesion_index_cols_notnull(
            ktk_cube_dataset_id=ktk_cube_dataset_id,
            df=df,
            cube=cube,
            partition_on=partition_on,
        )

    payload = get_payload_subset(df.columns, cube)
    payload_overlap = payload & existing_payload
    if payload_overlap:
        raise ValueError(
            'Payload written in "{ktk_cube_dataset_id}" is already present in cube: {payload_overlap}'
            .format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                payload_overlap=", ".join(sorted(payload_overlap)),
            ))

    unspecified_partition_columns = (df_columns_set - set(partition_on)) & set(
        cube.partition_columns)
    if unspecified_partition_columns:
        raise ValueError(
            f"Unspecified but provided partition columns in {ktk_cube_dataset_id}: "
            f"{', '.join(sorted(unspecified_partition_columns))}")
Example #16
    def inner(series: pd.Series, *args, **kwargs) -> bool:
        if pdt.is_sparse(series):
            return False
        return fn(series, *args, **kwargs)
Example #17
    def get_feat_type_from_columns(
        self,
        X: pd.DataFrame,
    ) -> typing.Dict[typing.Union[str, int], str]:
        """
        Returns a dictionary that maps pandas dataframe columns to a feature type.
        This feature type can be categorical or numerical

        Parameters
        ----------
            X: pd.DataFrame
                A set of features that are going to be validated (type and dimensionality
                checks) and a encoder fitted in the case the data needs encoding
        Returns
        -------
            feat_type:
                dictionary with column to feature type mapping
        """

        # Also, register the feature types for the estimator
        feat_type = {}

        # Make sure each column is a valid type
        for i, column in enumerate(X.columns):
            if is_sparse(X[column]):
                raise ValueError(
                    "Auto-sklearn does not yet support sparse pandas Series."
                    f" Please convert {column} to a dense format.")
            elif X[column].dtype.name in ['category', 'bool']:

                feat_type[column] = 'categorical'
            # Move away from np.issubdtype as it causes
            # TypeError: data type not understood in certain pandas types
            elif not is_numeric_dtype(X[column]):
                if X[column].dtype.name == 'object':
                    raise ValueError(
                        "Input Column {} has invalid type object. "
                        "Cast it to a valid dtype before using it in Auto-Sklearn. "
                        "Valid types are numerical, categorical or boolean. "
                        "You can cast it to a valid dtype using "
                        "pandas.Series.astype ."
                        "If working with string objects, the following "
                        "tutorial illustrates how to work with text data: "
                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html"
                        .format(  # noqa: E501
                            column, ))
                elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
                        X[column].dtype):
                    raise ValueError(
                        "Auto-sklearn does not support time and/or date datatype as given "
                        "in column {}. Please convert the time information to a numerical value "
                        "first. One example on how to do this can be found on "
                        "https://stats.stackexchange.com/questions/311494/".
                        format(column, ))
                else:
                    raise ValueError(
                        "Input Column {} has unsupported dtype {}. "
                        "Supported column types are categorical/bool/numerical dtypes. "
                        "Make sure your data is formatted in a correct way, "
                        "before feeding it to Auto-Sklearn.".format(
                            column,
                            X[column].dtype.name,
                        ))
            else:
                feat_type[column] = 'numerical'
        return feat_type
Example #18
def sparse_contains(series: pd.Series, state: dict) -> bool:
    return pdt.is_sparse(series)
Example #19
    def contains_op(cls, series: pd.Series) -> bool:
        return pdt.is_sparse(series)
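
A quick standalone check of the predicate used by the two snippets above, with `pandas.api.types` imported as `pdt` to match the code:

import pandas as pd
import pandas.api.types as pdt

dense = pd.Series([1, 2, 3])
sparse = pd.Series(pd.arrays.SparseArray([0, 0, 1]))
print(pdt.is_sparse(dense), pdt.is_sparse(sparse))  # False True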