def read_partition(
    fs, piece, columns, index, categories=(), partitions=(), **kwargs
):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        path = piece
        row_group = None
        partition_keys = []
    else:
        (path, row_group, partition_keys) = piece

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            path,
            engine="cudf",
            columns=columns,
            row_groups=row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]
            col = as_column(index2).as_frame().repeat(len(df))._data[None]
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(col.base_data, dtype=col.dtype),
                size=col.size,
                offset=col.offset,
                ordered=False,
            )

    return df
def array_to_series(array):
    if isinstance(array, pa.ChunkedArray):
        return Series._concat(
            [array_to_series(chunk) for chunk in array.chunks]
        )

    array_len = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    mask, data = buffers[0], buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.buffer import Buffer
        from cudf.core.column import build_categorical_column

        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        if mask is not None:
            mask = Buffer(mask)
        data = build_categorical_column(
            categories=categories, codes=codes, mask=mask
        )
    elif pa.types.is_string(array.type):
        import nvstrings

        offs, data = buffers[1], buffers[2]
        offs = offs[array.offset : array.offset + array_len + 1]
        data = None if data is None else data.device_ctypes_pointer.value
        mask = None if mask is None else mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            data,
            offs.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        data = data[array.offset : array.offset + len(array)]

    series = Series(data, dtype=dtype)

    if null_count > 0 and mask is not None and not series.nullable:
        return series.set_mask(mask, null_count)

    return series
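# Hypothetical usage sketch (not from the original source): converting a
# dictionary-encoded pyarrow array exercises the `build_categorical_column`
# branch of `array_to_series` above. Assumes the legacy cudf environment
# where this helper and its dependencies (`make_device_arrays`,
# `arrow_to_pandas_dtype`, `nvstrings`) are importable.
import pyarrow as pa

dict_arr = pa.array(["low", "high", "low", None]).dictionary_encode()
sr = array_to_series(dict_arr)  # cudf.Series with a categorical dtype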
def _copy_categories(self, other, include_index=True):
    """
    Utility that copies category information from `other` to `self`.
    """
    for name, col, other_col in zip(
        self._column_names, self._columns, other._columns
    ):
        if is_categorical_dtype(other_col) and not is_categorical_dtype(col):
            self._data[name] = build_categorical_column(
                categories=other_col.categories,
                codes=col,
                mask=col.mask,
                ordered=other_col.ordered,
            )
    if include_index:
        if self._index is not None:
            self._index._copy_categories(other._index)
    return self
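# Hypothetical illustration (not from the original source): given `other`
# with a categorical column and `self` holding the matching integer codes,
# `_copy_categories` re-wraps the codes as a categorical column in place.
# Assumes a cudf version where DataFrame inherits this private Frame method.
import cudf

other = cudf.DataFrame({"c": cudf.Series(["a", "b", "a"], dtype="category")})
codes = cudf.DataFrame({"c": other["c"].cat.codes})
codes._copy_categories(other)  # codes["c"] regains the category metadata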
def _categorical_scalar_broadcast_to(cat_scalar, size):
    if isinstance(cat_scalar, (cudf.Series, pd.Series)):
        cats = cat_scalar.cat.categories
        code = cat_scalar.cat.codes[0]
        ordered = cat_scalar.cat.ordered
    else:
        # handles pd.Categorical, cudf.categorical.CategoricalColumn
        cats = cat_scalar.categories
        code = cat_scalar.codes[0]
        ordered = cat_scalar.ordered

    cats = column.as_column(cats)
    codes = scalar_broadcast_to(code, size)

    return column.build_categorical_column(
        categories=cats,
        codes=codes,
        mask=codes.base_mask,
        size=codes.size,
        offset=codes.offset,
        ordered=ordered,
    )
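# Hypothetical usage sketch (not from the original source): broadcast the
# single value of a pandas Categorical to a length-4 CategoricalColumn.
# Assumes the helper above and the cudf `column` utilities it uses are in
# scope.
import pandas as pd

cat = pd.Categorical(["b"], categories=["a", "b", "c"], ordered=False)
col = _categorical_scalar_broadcast_to(cat, 4)
# col holds codes [1, 1, 1, 1] over categories ["a", "b", "c"]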
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: frame.columns.name or 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
       A  B variable value
    0  1  1        C   1.0
    1  1  3        C
    2  5  6        C   4.0
    3  1  1        D   2.0
    4  1  3        D   5.0
    5  5  6        D   6.0
    """
    assert col_level in (None,)

    # Arg cleaning
    import collections

    # id_vars
    if id_vars is not None:
        if not isinstance(id_vars, collections.abc.Sequence):
            id_vars = [id_vars]
        id_vars = list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'id_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing))
            )
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        if not isinstance(value_vars, collections.abc.Sequence):
            value_vars = [value_vars]
        value_vars = list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'value_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing))
            )
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError(
            "Categorical columns are not yet supported for this function"
        )

    # Check dtype homogeneity in value_vars
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap))
        )

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series([], dtype=A.dtype)

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(
            Series(cudautils.full(size=N, value=i, dtype=np.int8))
        )
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        build_categorical_column(
            categories=value_vars,
            codes=as_column(temp._column.base_data, dtype=temp._column.dtype),
            mask=temp._column.base_mask,
            size=temp._column.size,
            offset=temp._column.offset,
            ordered=False,
        )
    )

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars], index=None
    )

    return DataFrame(mdata)
def read_partition(
    fs, pieces, columns, index, categories=(), partitions=(), **kwargs
):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if not isinstance(pieces, list):
        pieces = [pieces]

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if len(pieces) > 1:

        paths = []
        rgs = []
        partition_keys = []
        for piece in pieces:
            if isinstance(piece, str):
                paths.append(piece)
                rgs.append(None)
            else:
                (path, row_group, partition_keys) = piece
                row_group = None if row_group == [None] else row_group
                paths.append(path)
                rgs.append(
                    [row_group]
                    if not isinstance(row_group, list)
                    else row_group
                )

        df = cudf.read_parquet(
            paths,
            engine="cudf",
            columns=columns,
            row_groups=rgs if rgs else None,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        # Single-piece read
        if isinstance(pieces[0], str):
            path = pieces[0]
            row_group = None
            partition_keys = []
        else:
            (path, row_group, partition_keys) = pieces[0]
            row_group = None if row_group == [None] else row_group

        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                path,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_groups=row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

    # Re-set "object" dtypes to align with pa schema
    set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None))

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])
    elif index is False and set(df.index.names).issubset(columns):
        # If index=False, we need to make sure all of the
        # names in `columns` are actually in `df.columns`
        df.reset_index(inplace=True)

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")

        for i, (name, index2) in enumerate(partition_keys):

            # Build the column from `codes` directly
            # (since the category is often a larger dtype)
            codes = (
                as_column(partitions[i].keys.index(index2))
                .as_frame()
                .repeat(len(df))
                ._data[None]
            )
            df[name] = build_categorical_column(
                categories=partitions[i].keys,
                codes=codes,
                size=codes.size,
                offset=codes.offset,
                ordered=False,
            )

    return df
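# Hypothetical call sketch (not from the original source): a two-piece read
# through the multi-piece branch above, with each piece given as a
# (path, row_group, partition_keys) tuple. Placeholder paths; assumes
# `fsspec` plus the `set_object_dtypes_from_pa_schema` helper are in scope.
import fsspec

fs = fsspec.filesystem("file")
pieces = [
    ("/data/part.0.parquet", [0], []),
    ("/data/part.1.parquet", [0], []),
]
df = read_partition(fs, pieces, columns=None, index=None)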
def read_partition(
    fs, piece, columns, index, categories=(), partitions=(), **kwargs
):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        # `piece` is a file-path string
        piece = pq.ParquetDatasetPiece(
            piece, open_file_func=partial(fs.open, mode="rb")
        )
    else:
        # `piece` = (path, row_group, partition_keys)
        (path, row_group, partition_keys) = piece
        piece = pq.ParquetDatasetPiece(
            path,
            row_group=row_group,
            partition_keys=partition_keys,
            open_file_func=partial(fs.open, mode="rb"),
        )

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            piece.path,
            engine="cudf",
            columns=columns,
            row_group=piece.row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(piece.path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and index[0] in df.columns:
        df = df.set_index(index[0])

    if len(piece.partition_keys) > 0:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(piece.partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]
            sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
            df[name] = build_categorical_column(
                categories=categories, codes=sr._column, ordered=False
            )

    return df
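# Hypothetical sketch (not from the original source), assuming a legacy
# pyarrow release that still ships `pq.ParquetDatasetPiece`: this mirrors
# the piece object the string branch above constructs from a bare path.
from functools import partial

import pyarrow.parquet as pq

piece = pq.ParquetDatasetPiece(
    "/data/part.0.parquet",  # placeholder path
    open_file_func=partial(open, mode="rb"),
)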
def _optimized_read_partition_remote(
    fs, pieces, columns, index, categories=(), partitions=(), **kwargs
):
    # This is a specialized version of `CudfEngine.read_partition`
    # for remote filesystems. This implementation is intended to
    # replace the upstream `read_partition` classmethod until
    # remote-filesystem handling is optimized in cudf/dask-cudf

    if columns is not None:
        columns = list(columns)
    if isinstance(index, list):
        columns += index

    # Check that this is a single-piece read on a non-local filesystem
    if not isinstance(pieces, list):
        pieces = [pieces]
    if len(pieces) > 1:
        raise ValueError(
            "The `_custom_read_partition` code path is not designed to "
            "handle a multi-element `pieces` argument."
        )
    if cudf.utils.ioutils._is_local_filesystem(fs):
        raise ValueError(
            "The `_custom_read_partition` code path is not intended "
            "for use on local filesystems."
        )

    # Unpack contents of the single piece
    if isinstance(pieces[0], str):
        path = pieces[0]
        row_group = None
        partition_keys = []
    else:
        (path, row_group, partition_keys) = pieces[0]

    # Call optimized read utility
    df = _optimized_read_remote(path, row_group, columns, fs, **kwargs)

    #
    # Code below is directly copied from cudf-21.08
    #

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])
    elif index is False and set(df.index.names).issubset(columns):
        # If index=False, we need to make sure all of the
        # names in `columns` are actually in `df.columns`
        df.reset_index(inplace=True)

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]
            col = as_column(index2).as_frame().repeat(len(df))._data[None]
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(col.base_data, dtype=col.dtype),
                size=col.size,
                offset=col.offset,
                ordered=False,
            )

    return df
def _parquet_to_frame(
    paths_or_buffers,
    *args,
    row_groups=None,
    partition_keys=None,
    partition_categories=None,
    **kwargs,
):
    # If this is not a partitioned read, only need
    # one call to `_read_parquet`
    if not partition_keys:
        return _read_parquet(
            paths_or_buffers,
            *args,
            row_groups=row_groups,
            **kwargs,
        )

    # For partitioned data, we need a distinct read for each
    # unique set of partition keys. Therefore, we start by
    # aggregating all paths with matching keys using a dict
    plan = {}
    for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)):
        rgs = row_groups[i] if row_groups else None
        tkeys = tuple(keys)
        if tkeys in plan:
            plan[tkeys][0].append(path)
            if rgs is not None:
                plan[tkeys][1].append(rgs)
        else:
            plan[tkeys] = ([path], None if rgs is None else [rgs])

    dfs = []
    for part_key, (key_paths, key_row_groups) in plan.items():
        # Add new DataFrame to our list
        dfs.append(
            _read_parquet(
                key_paths,
                *args,
                row_groups=key_row_groups,
                **kwargs,
            )
        )
        # Add partition columns to the last DataFrame
        for (name, value) in part_key:
            if partition_categories and name in partition_categories:
                # Build the categorical column from `codes`
                codes = as_column(
                    partition_categories[name].index(value),
                    length=len(dfs[-1]),
                )
                dfs[-1][name] = build_categorical_column(
                    categories=partition_categories[name],
                    codes=codes,
                    size=codes.size,
                    offset=codes.offset,
                    ordered=False,
                )
            else:
                # Not building categorical columns, so
                # `value` is already what we want
                dfs[-1][name] = as_column(value, length=len(dfs[-1]))

    # Concatenate dfs and return.
    # Assume we can ignore the index if it has no name.
    return (
        cudf.concat(dfs, ignore_index=dfs[-1].index.name is None)
        if len(dfs) > 1
        else dfs[0]
    )
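# Standalone illustration (not from the original source) of the "plan"
# aggregation above, simplified to paths only: pieces that share the same
# partition-key tuple are grouped so each unique key set needs one read.
partition_keys = [
    [("year", 2020)],
    [("year", 2021)],
    [("year", 2020)],
]
paths = ["a.parquet", "b.parquet", "c.parquet"]
plan = {}
for keys, path in zip(partition_keys, paths):
    plan.setdefault(tuple(keys), []).append(path)
assert plan == {
    (("year", 2020),): ["a.parquet", "c.parquet"],
    (("year", 2021),): ["b.parquet"],
}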
def _read_paths(
    cls,
    paths,
    fs,
    columns=None,
    row_groups=None,
    strings_to_categorical=None,
    partitions=None,
    partitioning=None,
    partition_keys=None,
    open_file_options=None,
    **kwargs,
):
    # Simplify row_groups if all None
    if row_groups == [None for path in paths]:
        row_groups = None

    with ExitStack() as stack:

        # Non-local filesystem handling
        paths_or_fobs = paths
        if not _is_local_filesystem(fs):
            paths_or_fobs = _open_remote_files(
                paths_or_fobs,
                fs,
                context_stack=stack,
                **_default_open_file_options(
                    open_file_options, columns, row_groups
                ),
            )

        # Use cudf to read in data
        df = cudf.read_parquet(
            paths_or_fobs,
            engine="cudf",
            columns=columns,
            row_groups=row_groups if row_groups else None,
            strings_to_categorical=strings_to_categorical,
            **kwargs,
        )

    if partitions and partition_keys is None:

        # Use `HivePartitioning` by default
        partitioning = partitioning or {"obj": pa_ds.HivePartitioning}
        ds = pa_ds.dataset(
            paths,
            filesystem=fs,
            format="parquet",
            partitioning=partitioning["obj"].discover(
                *partitioning.get("args", []),
                **partitioning.get("kwargs", {}),
            ),
        )
        frag = next(ds.get_fragments())
        if frag:
            # Extract hive-partition keys, and make sure they
            # are ordered the same as they are in `partitions`
            raw_keys = pa_ds._get_partition_keys(frag.partition_expression)
            partition_keys = [
                (hive_part.name, raw_keys[hive_part.name])
                for hive_part in partitions
            ]

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")

        for i, (name, index2) in enumerate(partition_keys):

            # Build the column from `codes` directly
            # (since the category is often a larger dtype)
            codes = as_column(
                partitions[i].keys.index(index2),
                length=len(df),
            )
            df[name] = build_categorical_column(
                categories=partitions[i].keys,
                codes=codes,
                size=codes.size,
                offset=codes.offset,
                ordered=False,
            )

    return df
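# Minimal sketch (not from the original source) of the hive-partition
# discovery used above when `partition_keys` is not supplied. Assumes a
# pyarrow release with the `pyarrow.dataset` API and a placeholder dataset
# root laid out as year=*/... directories.
import pyarrow.dataset as pa_ds

ds = pa_ds.dataset(
    "/data/table",  # placeholder path
    format="parquet",
    partitioning=pa_ds.HivePartitioning.discover(),
)
frag = next(ds.get_fragments())
# `_get_partition_keys` is the same private helper the code above relies on
keys = pa_ds._get_partition_keys(frag.partition_expression)  # e.g. {"year": 2020}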
def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.

    Use cut when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of x.
          The range of x is extended by .1% on each side to include the
          minimum and maximum values of x.
    right : bool, default True
        Indicates whether bins includes the rightmost edge or not.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same
        length as the resulting bins. If False, returns only integer
        indicators of the bins. If True, raises an error. When
        ordered=False, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not.
    precision : int, default 3
        The precision at which to store and display the bin labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True, the
        resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

    Returns
    -------
    out : CategoricalIndex
        An array-like object representing the respective bin for each value
        of x. The type depends on the value of labels.
    bins : numpy.ndarray or IntervalIndex
        The computed or specified bins. Only returned when retbins=True.
        For scalar or sequence bins, this is an ndarray with the computed
        bins. If duplicates=drop, bins will drop the non-unique bins. For
        an IntervalIndex bins, this is equal to bins.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
                      (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0],
                      (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category')

    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
                       (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0],
                       (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'),
     array([0.994, 3.   , 5.   , 7.   ]))

    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...          3, labels=["bad", "medium", "good"])
    CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'],
                     categories=['bad', 'medium', 'good'], ordered=True,
                     dtype='category')

    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...          labels=["B", "A", "B"], ordered=False)
    CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'],
                     ordered=False, dtype='category')

    >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3], dtype=int32)

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]),
    ...                 index=['a', 'b', 'c', 'd', 'e'])
    >>> cudf.cut(s, 3)
    """
    left_inclusive = False
    right_inclusive = True
    # saving the original input x for use in case it's a series
    orig_x = x
    old_bins = bins

    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: "
            "raise, drop"
        )

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )
        elif ordered and labels is not None:
            if len(set(labels)) != len(labels):
                raise ValueError(
                    "labels must be unique if ordered=True; "
                    "pass ordered=False for duplicate labels"
                )

    # bins can either be an int, sequence of scalars or an IntervalIndex
    if isinstance(bins, Sequence):
        if len(set(bins)) != len(bins):
            if duplicates == "raise":
                raise ValueError(
                    f"Bin edges must be unique: {repr(bins)}.\n"
                    "You can drop duplicate edges by setting the "
                    "'duplicates' kwarg"
                )
            elif duplicates == "drop":
                # get unique values but maintain list dtype
                bins = list(dict.fromkeys(bins))

    # if bins is an IntervalIndex we ignore the value of right
    elif isinstance(bins, (pd.IntervalIndex, cudf.IntervalIndex)):
        right = bins.closed == "right"

    # create bins if given an int or single scalar
    if not isinstance(bins, pd.IntervalIndex):
        if not isinstance(bins, Sequence):
            if isinstance(
                x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)
            ):
                mn = x.min()
                mx = x.max()
            else:
                mn = min(x)
                mx = max(x)
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

        # if right and include_lowest we adjust the first
        # bin edge to make sure it is included
        if right and include_lowest:
            bins[0] = bins[0] - 10 ** (-precision)

        # if right is false the last bin edge is not included
        if not right:
            right_edge = bins[-1]
            x = cupy.asarray(x)
            x[x == right_edge] = right_edge + 1

        # adjust bin edges decimal precision
        int_label_bins = np.around(bins, precision)

    # the input is a column of the values in the array x
    input_arr = as_column(x)

    # checking for the correct inclusivity values
    if right:
        closed = "right"
    else:
        closed = "left"
        left_inclusive = True

    if isinstance(bins, pd.IntervalIndex):
        interval_labels = bins
    elif labels is None:
        if duplicates == "drop" and len(bins) == 1 and len(old_bins) != 1:
            if right and include_lowest:
                old_bins[0] = old_bins[0] - 10 ** (-precision)
                interval_labels = interval_range(
                    old_bins[0], old_bins[1], periods=1, closed=closed
                )
            else:
                interval_labels = IntervalIndex.from_breaks(
                    old_bins, closed=closed
                )
        else:
            # get labels for categories
            interval_labels = IntervalIndex.from_breaks(
                int_label_bins, closed=closed
            )
    elif labels is not False:
        if not (is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )
        if ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; "
                "pass ordered=False for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
            if not ordered and len(set(labels)) != len(labels):
                interval_labels = cudf.CategoricalIndex(
                    labels, categories=None, ordered=False
                )
            else:
                interval_labels = (
                    labels if len(set(labels)) == len(labels) else None
                )

    if isinstance(bins, pd.IntervalIndex):
        # get the left and right edges of the bins as columns
        # we cannot typecast an IntervalIndex, so we need to
        # make the edges the same type as the input array
        left_edges = as_column(bins.left).astype(input_arr.dtype)
        right_edges = as_column(bins.right).astype(input_arr.dtype)
    else:
        # get the left and right edges of the bins as columns
        left_edges = as_column(bins[:-1], dtype="float64")
        right_edges = as_column(bins[1:], dtype="float64")
        # the input array must be changed to the same type as the edges
        input_arr = input_arr.astype(left_edges.dtype)

    # get the bin index for each value
    index_labels = cudf._lib.labeling.label_bins(
        input_arr, left_edges, left_inclusive, right_edges, right_inclusive
    )

    if labels is False:
        # if labels is false we return the index labels, we return them
        # as a series if we have a series input
        if isinstance(orig_x, (pd.Series, cudf.Series)):
            # need to run more tests but looks like in this case pandas
            # always returns a float64 dtype
            indx_arr_series = cudf.Series(index_labels, dtype="float64")
            # if retbins we return the bins as well
            if retbins:
                return indx_arr_series, bins
            else:
                return indx_arr_series
        elif retbins:
            return index_labels.values, bins
        else:
            return index_labels.values

    if labels is not None:
        if not ordered and len(set(labels)) != len(labels):
            # when we have duplicate labels and ordered is False, we
            # should allow duplicate categories. The categories are
            # returned in order
            new_data = [interval_labels[i][0] for i in index_labels.values]
            return cudf.CategoricalIndex(
                new_data, categories=sorted(set(labels)), ordered=False
            )

    col = build_categorical_column(
        categories=interval_labels,
        codes=index_labels,
        mask=index_labels.base_mask,
        offset=index_labels.offset,
        size=index_labels.size,
        ordered=ordered,
    )

    # we return a categorical index, as we don't have a Categorical method
    categorical_index = cudf.core.index.as_index(col)

    if isinstance(orig_x, (pd.Series, cudf.Series)):
        # if we have a series input we return a series output
        res_series = cudf.Series(categorical_index, index=orig_x.index)
        if retbins:
            return res_series, bins
        else:
            return res_series
    elif retbins:
        # if retbins is true we return the bins as well
        return categorical_index, bins
    else:
        return categorical_index
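# Usage sketch (not from the original source) of the Series branches at the
# end of `cut`: a Series input yields a Series output aligned to its index.
import numpy as np
import cudf

s = cudf.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"])
binned = cudf.cut(s, 3)  # Series with categorical (interval) dtype
codes, edges = cudf.cut(s, 3, labels=False, retbins=True)  # float64 codes + bin edges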