Example 1
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            path = piece
            row_group = None
            partition_keys = []
        else:
            (path, row_group, partition_keys) = piece

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                path,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_groups=row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and (index[0] in df.columns):
            df = df.set_index(index[0])
        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]

                col = as_column(index2).as_frame().repeat(len(df))._data[None]
                df[name] = build_categorical_column(
                    categories=categories,
                    codes=as_column(col.base_data, dtype=col.dtype),
                    size=col.size,
                    offset=col.offset,
                    ordered=False,
                )

        return df
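The pattern that recurs throughout these examples is the final block above: the scalar partition value becomes a column of repeated integer codes, which `build_categorical_column` then wraps with the category labels. Below is a minimal sketch of that idea, not taken from the source; it assumes the older cudf column API used in these examples (the `as_column(..., length=...)` form appears in Examples 9 and 10), and the column name and values are illustrative only.

import cudf
from cudf.core.column import as_column, build_categorical_column

df = cudf.DataFrame({"x": [1, 2, 3]})

categories = ["2019", "2020"]            # hive-partition values (illustrative)
code = categories.index("2020")          # integer code for this partition

codes = as_column(code, length=len(df))  # one code per row of the partition
df["year"] = build_categorical_column(   # wrap codes + categories into a column
    categories=categories,
    codes=codes,
    size=codes.size,
    offset=codes.offset,
    ordered=False,
)
# df["year"] now has a categorical dtype with categories ["2019", "2020"]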
Example 2
def array_to_series(array):
    if isinstance(array, pa.ChunkedArray):
        return Series._concat(
            [array_to_series(chunk) for chunk in array.chunks])

    array_len = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    mask, data = buffers[0], buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.column import build_categorical_column
        from cudf.core.buffer import Buffer

        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        if mask is not None:
            mask = Buffer(mask)
        data = build_categorical_column(categories=categories,
                                        codes=codes,
                                        mask=mask)

    elif pa.types.is_string(array.type):
        import nvstrings

        offs, data = buffers[1], buffers[2]
        offs = offs[array.offset:array.offset + array_len + 1]
        data = None if data is None else data.device_ctypes_pointer.value
        mask = None if mask is None else mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            data,
            offs.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            True,
        )
    elif data is not None:
        data = data[array.offset:array.offset + len(array)]

    series = Series(data, dtype=dtype)

    if null_count > 0 and mask is not None and not series.nullable:
        return series.set_mask(mask, null_count)

    return series
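Hypothetical usage of the helper above. It assumes the module context shown (helpers such as `make_device_arrays`, `arrow_to_pandas_dtype`, and `Series` come from the same file); a dictionary-encoded arrow array takes the categorical branch.

import pyarrow as pa

arr = pa.array(["a", "b", "a", "c"]).dictionary_encode()
sr = array_to_series(arr)   # expected: cudf.Series with a categorical dtype
print(sr.dtype)             # category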
Example 3
    def _copy_categories(self, other, include_index=True):
        """
        Utility that copies category information from `other`
        to `self`.
        """
        for name, col, other_col in zip(self._column_names, self._columns,
                                        other._columns):
            if is_categorical_dtype(
                    other_col) and not is_categorical_dtype(col):
                self._data[name] = build_categorical_column(
                    categories=other_col.categories,
                    codes=col,
                    mask=col.mask,
                    ordered=other_col.ordered,
                )
        if include_index:
            if self._index is not None:
                self._index._copy_categories(other._index)
        return self
Example 4
def _categorical_scalar_broadcast_to(cat_scalar, size):
    if isinstance(cat_scalar, (cudf.Series, pd.Series)):
        cats = cat_scalar.cat.categories
        code = cat_scalar.cat.codes[0]
        ordered = cat_scalar.cat.ordered
    else:
        # handles pd.Categorical, cudf.categorical.CategoricalColumn
        cats = cat_scalar.categories
        code = cat_scalar.codes[0]
        ordered = cat_scalar.ordered

    cats = column.as_column(cats)
    codes = scalar_broadcast_to(code, size)

    return column.build_categorical_column(
        categories=cats,
        codes=codes,
        mask=codes.base_mask,
        size=codes.size,
        offset=codes.offset,
        ordered=ordered,
    )
Example 5
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: frame.columns.name or 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have a multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B variable value
    0    1    1        C   1.0
    1    1    3        C
    2    5    6        C   4.0
    3    1    1        D   2.0
    4    1    3        D   5.0
    5    5    6        D   6.0
    """
    assert col_level in (None,)

    # Arg cleaning
    import collections

    # id_vars
    if id_vars is not None:
        if not isinstance(id_vars, collections.abc.Sequence):
            id_vars = [id_vars]
        id_vars = list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'id_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing))
            )
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        if not isinstance(value_vars, collections.abc.Sequence):
            value_vars = [value_vars]
        value_vars = list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'value_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing))
            )
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Raise early for dtypes that are not yet supported
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError(
            "Categorical columns are not yet supported for this function"
        )

    # Check dtype homogeneity in value_vars
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap))
        )

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series([], dtype=A.dtype)

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(Series(cudautils.full(size=N, value=i, dtype=np.int8)))
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        build_categorical_column(
            categories=value_vars,
            codes=as_column(temp._column.base_data, dtype=temp._column.dtype),
            mask=temp._column.base_mask,
            size=temp._column.size,
            offset=temp._column.offset,
            ordered=False,
        )
    )

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars], index=None
    )

    return DataFrame(mdata)
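A quick, hypothetical spot-check that the `variable` column produced by the `build_categorical_column` call above is categorical; this assumes the function is exposed as the public `cudf.melt`, and the frame mirrors the docstring example.

import numpy as np
import cudf

df = cudf.DataFrame({"A": [1, 1, 5], "B": [1, 3, 6],
                     "C": [1.0, np.nan, 4.0], "D": [2.0, 5.0, 6.0]})
out = cudf.melt(df, id_vars=["A", "B"], value_vars=["C", "D"])
print(out["variable"].dtype)   # category, with categories ['C', 'D']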
Example 6
    def read_partition(fs,
                       pieces,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if not isinstance(pieces, list):
            pieces = [pieces]

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if len(pieces) > 1:

            paths = []
            rgs = []
            partition_keys = []

            for piece in pieces:
                if isinstance(piece, str):
                    paths.append(piece)
                    rgs.append(None)
                else:
                    (path, row_group, partition_keys) = piece

                    row_group = None if row_group == [None] else row_group

                    paths.append(path)
                    rgs.append([row_group] if not isinstance(row_group, list)
                               else row_group)

            df = cudf.read_parquet(
                paths,
                engine="cudf",
                columns=columns,
                row_groups=rgs if rgs else None,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

        else:
            # Single-piece read
            if isinstance(pieces[0], str):
                path = pieces[0]
                row_group = None
                partition_keys = []
            else:
                (path, row_group, partition_keys) = pieces[0]
                row_group = None if row_group == [None] else row_group

            if cudf.utils.ioutils._is_local_filesystem(fs):
                df = cudf.read_parquet(
                    path,
                    engine="cudf",
                    columns=columns,
                    row_groups=row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )
            else:
                with fs.open(path, mode="rb") as f:
                    df = cudf.read_parquet(
                        f,
                        engine="cudf",
                        columns=columns,
                        row_groups=row_group,
                        strings_to_categorical=strings_to_cats,
                        **kwargs.get("read", {}),
                    )

        # Re-set "object" dtypes to align with the pa schema
        set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None))

        if index and (index[0] in df.columns):
            df = df.set_index(index[0])
        elif index is False and set(df.index.names).issubset(columns):
            # If index=False, we need to make sure all of the
            # names in `columns` are actually in `df.columns`
            df.reset_index(inplace=True)

        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")

            for i, (name, index2) in enumerate(partition_keys):

                # Build the column from `codes` directly
                # (since the category is often a larger dtype)
                codes = (as_column(
                    partitions[i].keys.index(index2)).as_frame().repeat(
                        len(df))._data[None])
                df[name] = build_categorical_column(
                    categories=partitions[i].keys,
                    codes=codes,
                    size=codes.size,
                    offset=codes.offset,
                    ordered=False,
                )

        return df
Example 7
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            # `piece` is a file-path string
            piece = pq.ParquetDatasetPiece(piece,
                                           open_file_func=partial(fs.open,
                                                                  mode="rb"))
        else:
            # `piece` = (path, row_group, partition_keys)
            (path, row_group, partition_keys) = piece
            piece = pq.ParquetDatasetPiece(
                path,
                row_group=row_group,
                partition_keys=partition_keys,
                open_file_func=partial(fs.open, mode="rb"),
            )

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                piece.path,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(piece.path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_group=piece.row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and index[0] in df.columns:
            df = df.set_index(index[0])

        if len(piece.partition_keys) > 0:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(piece.partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]
                sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
                df[name] = build_categorical_column(categories=categories,
                                                    codes=sr._column,
                                                    ordered=False)

        return df
Example 8
def _optimized_read_partition_remote(fs,
                                     pieces,
                                     columns,
                                     index,
                                     categories=(),
                                     partitions=(),
                                     **kwargs):
    # This is a specialized version of `CudfEngine.read_partition`
    # for remote filesystems. This implementation is intended to
    # replace the upstream `read_partition` classmethod until
    # remote-filesystem handling is optimized in cudf/dask-cudf

    if columns is not None:
        columns = list(columns)
    if isinstance(index, list):
        columns += index

    # Check that this is a single-piece read on a non-local filesystem
    if not isinstance(pieces, list):
        pieces = [pieces]
    if len(pieces) > 1:
        raise ValueError(
            "The `_custom_read_partition` code path is not designed to "
            "handle a multi-element `pieces` argument.")
    if cudf.utils.ioutils._is_local_filesystem(fs):
        raise ValueError(
            "The `_custom_read_partition` code path is not intended "
            "for use on local filesystems.")

    # Unpack contents of the single piece
    if isinstance(pieces[0], str):
        path = pieces[0]
        row_group = None
        partition_keys = []
    else:
        (path, row_group, partition_keys) = pieces[0]

    # Call optimized read utility
    df = _optimized_read_remote(path, row_group, columns, fs, **kwargs)

    #
    # Code below is directly copied from cudf-21.08
    #

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])
    elif index is False and set(df.index.names).issubset(columns):
        # If index=False, we need to make sure all of the
        # names in `columns` are actually in `df.columns`
        df.reset_index(inplace=True)

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]

            col = as_column(index2).as_frame().repeat(len(df))._data[None]
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(col.base_data, dtype=col.dtype),
                size=col.size,
                offset=col.offset,
                ordered=False,
            )

    return df
Example 9
def _parquet_to_frame(
    paths_or_buffers,
    *args,
    row_groups=None,
    partition_keys=None,
    partition_categories=None,
    **kwargs,
):

    # If this is not a partitioned read, only need
    # one call to `_read_parquet`
    if not partition_keys:
        return _read_parquet(
            paths_or_buffers,
            *args,
            row_groups=row_groups,
            **kwargs,
        )

    # For partitioned data, we need a distinct read for each
    # unique set of partition keys. Therefore, we start by
    # aggregating all paths with matching keys using a dict
    plan = {}
    for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)):
        rgs = row_groups[i] if row_groups else None
        tkeys = tuple(keys)
        if tkeys in plan:
            plan[tkeys][0].append(path)
            if rgs is not None:
                plan[tkeys][1].append(rgs)
        else:
            plan[tkeys] = ([path], None if rgs is None else [rgs])

    dfs = []
    for part_key, (key_paths, key_row_groups) in plan.items():
        # Add new DataFrame to our list
        dfs.append(
            _read_parquet(
                key_paths,
                *args,
                row_groups=key_row_groups,
                **kwargs,
            ))
        # Add partition columns to the last DataFrame
        for (name, value) in part_key:
            if partition_categories and name in partition_categories:
                # Build the categorical column from `codes`
                codes = as_column(
                    partition_categories[name].index(value),
                    length=len(dfs[-1]),
                )
                dfs[-1][name] = build_categorical_column(
                    categories=partition_categories[name],
                    codes=codes,
                    size=codes.size,
                    offset=codes.offset,
                    ordered=False,
                )
            else:
                # Not building categorical columns, so
                # `value` is already what we want
                dfs[-1][name] = as_column(value, length=len(dfs[-1]))

    # Concatenate dfs and return.
    # Assume we can ignore the index if it has no name.
    return (cudf.concat(dfs, ignore_index=dfs[-1].index.name is None)
            if len(dfs) > 1 else dfs[0])
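For illustration, hypothetical inputs and the `plan` dict the aggregation loop above would build from them; paths that share a partition-key tuple end up in one `_read_parquet` call.

# Hypothetical inputs (not from the source):
paths_or_buffers = ["a.parquet", "b.parquet", "c.parquet"]
partition_keys = [[("year", 2020)], [("year", 2020)], [("year", 2021)]]

# With row_groups=None, the loop above would produce:
# plan == {
#     (("year", 2020),): (["a.parquet", "b.parquet"], None),
#     (("year", 2021),): (["c.parquet"], None),
# }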
Example 10
    def _read_paths(
        cls,
        paths,
        fs,
        columns=None,
        row_groups=None,
        strings_to_categorical=None,
        partitions=None,
        partitioning=None,
        partition_keys=None,
        open_file_options=None,
        **kwargs,
    ):

        # Simplify row_groups if all None
        if row_groups == [None for path in paths]:
            row_groups = None

        with ExitStack() as stack:

            # Non-local filesystem handling
            paths_or_fobs = paths
            if not _is_local_filesystem(fs):
                paths_or_fobs = _open_remote_files(
                    paths_or_fobs,
                    fs,
                    context_stack=stack,
                    **_default_open_file_options(open_file_options, columns,
                                                 row_groups),
                )

            # Use cudf to read in data
            df = cudf.read_parquet(
                paths_or_fobs,
                engine="cudf",
                columns=columns,
                row_groups=row_groups if row_groups else None,
                strings_to_categorical=strings_to_categorical,
                **kwargs,
            )

        if partitions and partition_keys is None:

            # Use `HivePartitioning` by default
            partitioning = partitioning or {"obj": pa_ds.HivePartitioning}
            ds = pa_ds.dataset(
                paths,
                filesystem=fs,
                format="parquet",
                partitioning=partitioning["obj"].discover(
                    *partitioning.get("args", []),
                    **partitioning.get("kwargs", {}),
                ),
            )
            frag = next(ds.get_fragments())
            if frag:
                # Extract hive-partition keys, and make sure they
                # are ordered the same as they are in `partitions`
                raw_keys = pa_ds._get_partition_keys(frag.partition_expression)
                partition_keys = [(hive_part.name, raw_keys[hive_part.name])
                                  for hive_part in partitions]

        if partition_keys:
            if partitions is None:
                raise ValueError("Must pass partition sets")

            for i, (name, index2) in enumerate(partition_keys):

                # Build the column from `codes` directly
                # (since the category is often a larger dtype)
                codes = as_column(
                    partitions[i].keys.index(index2),
                    length=len(df),
                )
                df[name] = build_categorical_column(
                    categories=partitions[i].keys,
                    codes=codes,
                    size=codes.size,
                    offset=codes.offset,
                    ordered=False,
                )

        return df
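A stripped-down sketch of the hive-key discovery step used above. The path is illustrative and assumed to exist; `pa_ds._get_partition_keys` is the same private pyarrow helper the example relies on.

import pyarrow.dataset as pa_ds

# "data/year=2020/part.0.parquet" is an illustrative, assumed-to-exist path
ds = pa_ds.dataset(
    ["data/year=2020/part.0.parquet"],
    format="parquet",
    partitioning=pa_ds.HivePartitioning.discover(),
)
frag = next(ds.get_fragments())
print(pa_ds._get_partition_keys(frag.partition_expression))  # {'year': 2020}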
Example 11
def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.
    Use cut when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.
        * int : Defines the number of equal-width bins in the
        range of x. The range of x is extended by .1% on each
        side to include the minimum and maximum values of x.
    right : bool, default True
        Indicates whether bins includes the rightmost edge or not.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same
        length as the resulting bins. If False, returns only integer
        indicators of the bins. If True, raises an error. When ordered=False,
        labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).

    Returns
    -------
    out : CategoricalIndex
        An array-like object representing the respective bin for each value
        of x. The type depends on the value of labels.
    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when retbins=True.
        For scalar or sequence bins, this is an ndarray with the computed
        bins. If duplicates=drop, bins will drop non-unique bins. For
        an IntervalIndex bins, this is equal to bins.

    Examples
    --------
    Discretize into three equal-sized bins.
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
    ...         (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0],
    ...         (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category')
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
    ...         (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0],
    ...         (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'),
    array([0.994, 3.   , 5.   , 7.   ]))
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'],
    ...       categories=['bad', 'medium', 'good'],ordered=True,
    ...       dtype='category')
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...       labels=["B", "A", "B"], ordered=False)
    CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'],
    ...        ordered=False, dtype='category')
    >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3], dtype=int32)
    Passing a Series as an input returns a Series with categorical dtype:
    >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]),
    ...        index=['a', 'b', 'c', 'd', 'e'])
    >>> cudf.cut(s, 3)
    """
    left_inclusive = False
    right_inclusive = True
    # save the original input x in case it is a Series
    orig_x = x
    old_bins = bins

    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: "
            "raise, drop")

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")
        elif ordered and labels is not None:
            if len(set(labels)) != len(labels):
                raise ValueError("labels must be unique if ordered=True;"
                                 "pass ordered=False for duplicate labels")

    # bins can either be an int, a sequence of scalars, or an IntervalIndex
    if isinstance(bins, Sequence):
        if len(set(bins)) != len(bins):
            if duplicates == "raise":
                raise ValueError(
                    f"Bin edges must be unique: {repr(bins)}.\n"
                    f"You can drop duplicate edges by setting the 'duplicates'"
                    "kwarg")
            elif duplicates == "drop":
                # get unique values but maintain list dtype
                bins = list(dict.fromkeys(bins))

    # if bins is an IntervalIndex we ignore the value of right
    elif isinstance(bins, (pd.IntervalIndex, cudf.IntervalIndex)):
        right = bins.closed == "right"

    # create bins if given an int or single scalar
    if not isinstance(bins, pd.IntervalIndex):
        if not isinstance(bins, (Sequence)):
            if isinstance(x,
                          (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)):
                mn = x.min()
                mx = x.max()
            else:
                mn = min(x)
                mx = max(x)
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

        # if right and include lowest we adjust the first
        # bin edge to make sure it is included
        if right and include_lowest:
            bins[0] = bins[0] - 10**(-precision)

        # if right is false the last bin edge is not included
        if not right:
            right_edge = bins[-1]
            x = cupy.asarray(x)
            x[x == right_edge] = right_edge + 1

        # adjust bin edges decimal precision
        int_label_bins = np.around(bins, precision)

    # the input is converted to a column of the values in x
    input_arr = as_column(x)

    # checking for the correct inclusivity values
    if right:
        closed = "right"
    else:
        closed = "left"
        left_inclusive = True

    if isinstance(bins, pd.IntervalIndex):
        interval_labels = bins
    elif labels is None:
        if duplicates == "drop" and len(bins) == 1 and len(old_bins) != 1:
            if right and include_lowest:
                old_bins[0] = old_bins[0] - 10**(-precision)
                interval_labels = interval_range(old_bins[0],
                                                 old_bins[1],
                                                 periods=1,
                                                 closed=closed)
            else:
                interval_labels = IntervalIndex.from_breaks(old_bins,
                                                            closed=closed)
        else:
            # get labels for categories
            interval_labels = IntervalIndex.from_breaks(int_label_bins,
                                                        closed=closed)
    elif labels is not False:
        if not (is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")
        if ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False for"
                "duplicate labels")
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
            if not ordered and len(set(labels)) != len(labels):
                interval_labels = cudf.CategoricalIndex(labels,
                                                        categories=None,
                                                        ordered=False)
            else:
                interval_labels = (labels if len(set(labels)) == len(labels)
                                   else None)

    if isinstance(bins, pd.IntervalIndex):
        # get the left and right edges of the bins as columns
        # we cannot typecast an IntervalIndex, so we need to
        # make the edges the same type as the input array
        left_edges = as_column(bins.left).astype(input_arr.dtype)
        right_edges = as_column(bins.right).astype(input_arr.dtype)
    else:
        # get the left and right edges of the bins as columns
        left_edges = as_column(bins[:-1:], dtype="float64")
        right_edges = as_column(bins[+1::], dtype="float64")
        # the input arr must be changed to the same type as the edges
        input_arr = input_arr.astype(left_edges.dtype)
    # compute the bin index for each value in the input
    index_labels = cudf._lib.labeling.label_bins(input_arr, left_edges,
                                                 left_inclusive, right_edges,
                                                 right_inclusive)

    if labels is False:
        # if labels is False we return the integer bin labels,
        # as a Series if the input was a Series
        if isinstance(orig_x, (pd.Series, cudf.Series)):
            # need to run more tests but looks like in this case pandas
            # always returns a float64 dtype
            indx_arr_series = cudf.Series(index_labels, dtype="float64")
            # if retbins we return the bins as well
            if retbins:
                return indx_arr_series, bins
            else:
                return indx_arr_series
        elif retbins:
            return index_labels.values, bins
        else:
            return index_labels.values

    if labels is not None:
        if not ordered and len(set(labels)) != len(labels):
            # when we have duplicate labels and ordered is False, we
            # should allow duplicate categories. The categories are
            # returned in order
            new_data = [interval_labels[i][0] for i in index_labels.values]
            return cudf.CategoricalIndex(new_data,
                                         categories=sorted(set(labels)),
                                         ordered=False)

    col = build_categorical_column(
        categories=interval_labels,
        codes=index_labels,
        mask=index_labels.base_mask,
        offset=index_labels.offset,
        size=index_labels.size,
        ordered=ordered,
    )

    # we return a categorical index, as we don't have a Categorical method
    categorical_index = cudf.core.index.as_index(col)

    if isinstance(orig_x, (pd.Series, cudf.Series)):
        # if we have a series input we return a series output
        res_series = cudf.Series(categorical_index, index=orig_x.index)
        if retbins:
            return res_series, bins
        else:
            return res_series
    elif retbins:
        # if retbins is true we return the bins as well
        return categorical_index, bins
    else:
        return categorical_index
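A hypothetical spot-check of the final step: the labelled codes plus the interval categories come back as a CategoricalIndex. The attribute names follow the pandas-style API; availability may vary by cudf version.

import numpy as np
import cudf

out = cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
print(out.categories)   # the three computed intervals
print(out.codes)        # integer bin assignment for each input value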