def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: Optional[List[str]] = None,
    feature_types: Optional[List[str]] = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
    import pandas as pd
    from pandas.api.types import is_sparse, is_categorical_dtype

    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
        for i, dtype in enumerate(data.dtypes):
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    # handle category codes.
    transformed = pd.DataFrame()
    if enable_categorical:
        for i, dtype in enumerate(data.dtypes):
            if is_categorical_dtype(dtype):
                # pandas uses -1 as default missing value for categorical data
                transformed[data.columns[i]] = (
                    data[data.columns[i]]
                    .cat.codes.astype(np.float32)
                    .replace(-1.0, np.NaN)
                )
            else:
                transformed[data.columns[i]] = data[data.columns[i]]
    else:
        transformed = data

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype = meta_type if meta_type else np.float32
    arr = transformed.values
    if meta_type:
        arr = arr.astype(meta_type)
    return arr, feature_names, feature_types
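# --- Illustrative sketch (not part of the original module) ---
# Shows the category-code transform used above: pandas encodes missing
# categorical entries as code -1, which the function remaps to NaN so the
# consumer treats them as missing values.
import numpy as np
import pandas as pd

df = pd.DataFrame({"c": pd.Categorical(["a", "b", None, "a"])})
codes = df["c"].cat.codes.astype(np.float32).replace(-1.0, np.nan)
print(codes.tolist())  # [0.0, 1.0, nan, 0.0]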
def _maybe_pandas_data(self, data, feature_names, feature_types,
                       meta=None, meta_type=None):
    """Extract internal data from pd.DataFrame for DMatrix data"""
    if lazy_isinstance(data, 'pandas.core.series', 'Series'):
        dtype = meta_type if meta_type else 'float'
        return data.values.astype(dtype), feature_names, feature_types

    from pandas.api.types import is_sparse
    from pandas import MultiIndex, Int64Index

    data_dtypes = data.dtypes
    if not all(dtype.name in self.pandas_dtype_mapper or is_sparse(dtype)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in self.pandas_dtype_mapper
        ]
        msg = """DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields """
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i])
                for i in data.columns
            ]
        elif isinstance(data.columns, Int64Index):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(
                    self.pandas_dtype_mapper[dtype.subtype.name])
            else:
                feature_types.append(self.pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else 'float'
    data = data.values.astype(dtype)
    return data, feature_names, feature_types
def _transform_pandas_df(data, enable_categorical,
                         feature_names=None, feature_types=None,
                         meta=None, meta_type=None):
    from pandas import MultiIndex, Int64Index, RangeIndex
    from pandas.api.types import is_sparse, is_categorical_dtype

    data_dtypes = data.dtypes
    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]
        msg = """DataFrame.dtypes for data must be int, float, bool or categorical.
                When categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`."""
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i])
                for i in data.columns
            ]
        elif isinstance(data.columns, (Int64Index, RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append('categorical')
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = data.values
    if meta_type:
        data = data.astype(meta_type)
    return data, feature_names, feature_types
def _pandas_feature_info(
    data: DataFrame,
    meta: Optional[str],
    feature_names: FeatureNames,
    feature_types: FeatureTypes,
    enable_categorical: bool,
) -> Tuple[FeatureNames, FeatureTypes]:
    import pandas as pd
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])
    return feature_names, feature_types
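# --- Illustrative sketch (not part of the original module) ---
# Demonstrates the MultiIndex flattening in _pandas_feature_info: each column
# tuple is joined into a single space-separated feature name.
import pandas as pd

cols = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
print([" ".join(str(x) for x in i) for i in cols])  # ['a 1', 'b 2']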
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatureNames = None,
    feature_types: FeatureTypes = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
    from pandas.api.types import (
        is_sparse,
        is_categorical_dtype,
    )

    if not all(
        dtype.name in _pandas_dtype_mapper
        or is_sparse(dtype)
        or is_nullable_dtype(dtype)
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in data.dtypes
    ):
        _invalid_dataframe_dtype(data)

    feature_names, feature_types = _pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

    transformed = _pandas_cat_null(data)

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
    arr: np.ndarray = transformed.values
    if meta_type:
        arr = arr.astype(dtype)
    return arr, feature_names, feature_types
def inner(series: pd.Series, state: dict, *args, **kwargs) -> bool:
    if pdt.is_sparse(series):
        dtype = series.dtype.subtype
    else:
        dtype = series.dtype
    state["dtype"] = dtype
    return fn(series, state, *args, **kwargs)
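# --- Illustrative sketch (not part of the original library) ---
# A hedged guess at how a wrapper like `inner` above is typically produced: a
# decorator that resolves a sparse Series to its subtype before running the
# wrapped dtype check. The decorator name `sparse_aware` and the example check
# `is_float_series` are hypothetical.
import functools
import pandas as pd
import pandas.api.types as pdt

def sparse_aware(fn):
    @functools.wraps(fn)
    def inner(series: pd.Series, state: dict, *args, **kwargs) -> bool:
        # Unwrap Sparse[...] dtypes so the check sees the underlying subtype.
        dtype = series.dtype.subtype if pdt.is_sparse(series) else series.dtype
        state["dtype"] = dtype
        return fn(series, state, *args, **kwargs)
    return inner

@sparse_aware
def is_float_series(series: pd.Series, state: dict) -> bool:
    return pdt.is_float_dtype(state["dtype"])

s = pd.Series(pd.arrays.SparseArray([1.0, 0.0]))
print(is_float_series(s, {}))  # True: the sparse subtype is float64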
def _transform_pandas_df(data, feature_names=None, feature_types=None,
                         meta=None, meta_type=None):
    from pandas import MultiIndex, Int64Index
    from pandas.api.types import is_sparse

    data_dtypes = data.dtypes
    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]
        msg = """DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields """
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i])
                for i in data.columns
            ]
        elif isinstance(data.columns, Int64Index):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = np.ascontiguousarray(data.values, dtype=dtype)
    return data, feature_names, feature_types
def _meta_from_pandas_series(data, name: str, dtype: Optional[str],
                             handle: ctypes.c_void_p) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype('float')
    from pandas.api.types import is_sparse
    if is_sparse(data):
        data = data.to_dense()
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
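# --- Illustrative sketch (not part of the original module) ---
# Why the to_dense() call above is needed: .values on a Series with a Sparse
# dtype returns a pandas SparseArray, which astype() preserves, so is_sparse
# is still True after the conversion (on pandas versions where is_sparse
# is available).
import pandas as pd
from pandas.api.types import is_sparse

s = pd.Series(pd.arrays.SparseArray([1.0, 0.0, 2.0]))
arr = s.values.astype("float")
print(is_sparse(arr))    # True: still a SparseArray
print(arr.to_dense())    # plain ndarray: [1. 0. 2.]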
def test_isna_returns_copy(self, data_missing, na_func):
    result = pd.Series(data_missing)
    expected = result.copy()
    mask = getattr(result, na_func)()
    if is_sparse(mask):
        mask = np.array(mask)

    mask[:] = True
    self.assert_series_equal(result, expected)
def _split_sparse(df):
    from pandas.api.types import is_sparse

    sparse_col = []
    non_sparse_col = []
    for col in df.columns:
        if is_sparse(df[col]):
            sparse_col.append(col)
        else:
            non_sparse_col.append(col)
    return df[sparse_col], df[non_sparse_col]
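# --- Illustrative usage (assumes _split_sparse above is in scope) ---
# Split a mixed frame into its sparse and dense column blocks.
import pandas as pd

df = pd.DataFrame({
    "s": pd.arrays.SparseArray([0, 0, 1]),
    "d": [1.0, 2.0, 3.0],
})
sparse_part, dense_part = _split_sparse(df)
print(list(sparse_part.columns), list(dense_part.columns))  # ['s'] ['d']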
def get_bad_pandas_dtypes(dtypes: list) -> list:
    # from lightgbm's python-package/lightgbm/basic.py
    from pandas.api.types import is_sparse

    pandas_dtype_mapper = {
        'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
        'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
        'bool': 'int',
        'float16': 'float', 'float32': 'float', 'float64': 'float',
    }
    bad_indices = [
        i for i, dtype in enumerate(dtypes)
        if (dtype.name not in pandas_dtype_mapper
            and (not is_sparse(dtype)
                 or dtype.subtype.name not in pandas_dtype_mapper))
    ]
    return bad_indices
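# --- Illustrative usage (assumes get_bad_pandas_dtypes above is in scope) ---
# Columns with unmapped dtypes (here, the object column) are flagged by index.
import pandas as pd

df = pd.DataFrame({"ok": [1, 2], "bad": ["a", "b"]})
print(get_bad_pandas_dtypes(list(df.dtypes)))  # [1]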
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp("1970-01-01", tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = s.cat.categories[:0]
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period("2000", freq), pd.Period("2001", freq)]
    elif is_sparse(dtype):
        # The scalar is drawn from the sparse subtype on all supported
        # pandas versions.
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_100:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    out = pd.Series(data, name=s.name, index=idx)
    if PANDAS_GT_100:
        out.attrs = s.attrs
    return out
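# --- Illustrative sketch (not part of dask) ---
# What the sparse branch above builds on modern pandas (the PANDAS_GT_100
# path): a two-element extension array carrying the original Sparse dtype.
# The scalar 0.0 stands in for whatever _scalar_from_dtype returns for the
# subtype.
import numpy as np
import pandas as pd

dtype = pd.SparseDtype(np.float64)
entry = np.float64(0)  # stand-in for _scalar_from_dtype(dtype.subtype)
data = pd.array([entry, entry], dtype=dtype)
print(pd.Series(data).dtype)  # Sparse[float64, nan]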
def to_dense(dataframe, nonsparse=NONSPARSE_COLUMNS):
    """Convert a sparse CLICnet dataframe to dense."""
    from pandas.api.types import is_sparse

    nonsparse = nonsparse & set(dataframe.columns)
    saved_columns = {}
    for column in nonsparse:
        saved_columns[column] = dataframe.loc[:, column].tolist()
    dataframe = dataframe.drop(columns=nonsparse)
    if is_sparse(dataframe.iloc[:, 0]):
        ret = dataframe.sparse.to_dense()
    else:
        ret = dataframe
    for column, value in saved_columns.items():
        ret.loc[:, column] = value
    return ret
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # The scalar is drawn from the sparse subtype on all supported
        # pandas versions.
        entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)
    return pd.Series(data, name=s.name, index=idx)
def check_user_df(ktk_cube_dataset_id, df, cube, existing_payload, partition_on):
    """
    Check user-provided DataFrame for sanity.

    Parameters
    ----------
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    df: Optional[pandas.DataFrame]
        DataFrame to be passed to Kartothek.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    if df is None:
        return

    if not (isinstance(df, pd.DataFrame) or isinstance(df, dd.DataFrame)):
        raise TypeError(
            'Provided DataFrame is not a pandas.DataFrame or None, but is a "{t}"'
            .format(t=type(df).__name__))

    if any(is_sparse(dtype) for dtype in df.dtypes):
        raise TypeError("Sparse data is not supported.")

    # call this once since `df.columns` can be quite slow
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)
    dcols_present = set(cube.dimension_columns) & df_columns_set

    if len(df_columns) != len(df_columns_set):
        raise ValueError(
            'Duplicate columns found in dataset "{ktk_cube_dataset_id}": {df_columns}'
            .format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                df_columns=", ".join(df_columns),
            ))

    if ktk_cube_dataset_id == cube.seed_dataset:
        missing_dimension_columns = set(cube.dimension_columns) - df_columns_set
        if missing_dimension_columns:
            raise ValueError(
                'Missing dimension columns in seed data "{ktk_cube_dataset_id}": '
                '{missing_dimension_columns}'.format(
                    ktk_cube_dataset_id=ktk_cube_dataset_id,
                    missing_dimension_columns=", ".join(
                        sorted(missing_dimension_columns)),
                ))
    else:
        if len(dcols_present) == 0:
            raise ValueError(
                'Dataset "{ktk_cube_dataset_id}" must have at least 1 of the '
                'following dimension columns: {dims}'.format(
                    ktk_cube_dataset_id=ktk_cube_dataset_id,
                    dims=", ".join(cube.dimension_columns),
                ))

    missing_partition_columns = set(partition_on) - df_columns_set
    if missing_partition_columns:
        raise ValueError(
            'Missing partition columns in dataset "{ktk_cube_dataset_id}": '
            '{missing_partition_columns}'.format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                missing_partition_columns=", ".join(
                    sorted(missing_partition_columns)),
            ))

    # Factor this check out. All others can be performed on the dask.DataFrame.
    # This one can only be executed on a pandas DataFrame.
    if isinstance(df, pd.DataFrame):
        assert_dimesion_index_cols_notnull(
            ktk_cube_dataset_id=ktk_cube_dataset_id,
            df=df,
            cube=cube,
            partition_on=partition_on,
        )

    payload = get_payload_subset(df.columns, cube)
    payload_overlap = payload & existing_payload
    if payload_overlap:
        raise ValueError(
            'Payload written in "{ktk_cube_dataset_id}" is already present in '
            'cube: {payload_overlap}'.format(
                ktk_cube_dataset_id=ktk_cube_dataset_id,
                payload_overlap=", ".join(sorted(payload_overlap)),
            ))

    unspecified_partition_columns = (
        df_columns_set - set(partition_on)) & set(cube.partition_columns)
    if unspecified_partition_columns:
        raise ValueError(
            f"Unspecified but provided partition columns in {ktk_cube_dataset_id}: "
            f"{', '.join(sorted(unspecified_partition_columns))}")
def inner(series: pd.Series, *args, **kwargs) -> bool:
    if pdt.is_sparse(series):
        return False
    return fn(series, *args, **kwargs)
def get_feat_type_from_columns(
    self,
    X: pd.DataFrame,
) -> typing.Dict[typing.Union[str, int], str]:
    """
    Returns a dictionary that maps pandas dataframe columns to a feature type.
    This feature type can be categorical or numerical.

    Parameters
    ----------
    X: pd.DataFrame
        A set of features that are going to be validated (type and
        dimensionality checks) and an encoder fitted in the case the data
        needs encoding.

    Returns
    -------
    feat_type: dictionary with column to feature type mapping
    """
    # Also, register the feature types for the estimator
    feat_type = {}

    # Make sure each column is a valid type
    for i, column in enumerate(X.columns):
        if is_sparse(X[column]):
            raise ValueError(
                "Auto-sklearn does not yet support sparse pandas Series."
                f" Please convert {column} to a dense format."
            )
        elif X[column].dtype.name in ['category', 'bool']:
            feat_type[column] = 'categorical'
        # Move away from np.issubdtype as it causes
        # TypeError: data type not understood in certain pandas types
        elif not is_numeric_dtype(X[column]):
            if X[column].dtype.name == 'object':
                raise ValueError(
                    "Input Column {} has invalid type object. "
                    "Cast it to a valid dtype before using it in Auto-Sklearn. "
                    "Valid types are numerical, categorical or boolean. "
                    "You can cast it to a valid dtype using "
                    "pandas.Series.astype. "
                    "If working with string objects, the following "
                    "tutorial illustrates how to work with text data: "
                    "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html"  # noqa: E501
                    .format(column)
                )
            elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
                    X[column].dtype):
                raise ValueError(
                    "Auto-sklearn does not support time and/or date datatype as given "
                    "in column {}. Please convert the time information to a numerical value "
                    "first. One example on how to do this can be found on "
                    "https://stats.stackexchange.com/questions/311494/".format(column)
                )
            else:
                raise ValueError(
                    "Input Column {} has unsupported dtype {}. "
                    "Supported column types are categorical/bool/numerical dtypes. "
                    "Make sure your data is formatted in a correct way, "
                    "before feeding it to Auto-Sklearn.".format(
                        column, X[column].dtype.name)
                )
        else:
            feat_type[column] = 'numerical'
    return feat_type
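# --- Illustrative sketch (not part of auto-sklearn) ---
# The classification rule above, reduced to its core: 'category'/'bool'
# columns map to 'categorical', remaining numeric dtypes to 'numerical';
# anything else raises in the full implementation.
import pandas as pd
from pandas.api.types import is_numeric_dtype

X = pd.DataFrame({
    "a": pd.Categorical(["x", "y"]),
    "b": [True, False],
    "c": [0.5, 1.5],
})
feat_type = {}
for col in X.columns:
    if X[col].dtype.name in ("category", "bool"):
        feat_type[col] = "categorical"
    elif is_numeric_dtype(X[col]):
        feat_type[col] = "numerical"
print(feat_type)  # {'a': 'categorical', 'b': 'categorical', 'c': 'numerical'}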
def sparse_contains(series: pd.Series, state: dict) -> bool:
    return pdt.is_sparse(series)
def contains_op(cls, series: pd.Series) -> bool:
    return pdt.is_sparse(series)