Example #1
    def _is_monotonic_increasing(self) -> Series:
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        cond = SF.lit(True)
        has_not_null = SF.lit(True)
        for scol in self._internal.index_spark_columns[::-1]:
            data_type = self._internal.spark_type_for(scol)
            prev = F.lag(scol, 1).over(window)
            compare = MultiIndex._comparator_for_monotonic_increasing(
                data_type)
            # Since pandas 1.1.4, null values are not allowed at any level of a MultiIndex.
            # Therefore, we should check `has_not_null` across all the levels.
            has_not_null = has_not_null & scol.isNotNull()
            cond = F.when(scol.eqNullSafe(prev),
                          cond).otherwise(compare(scol, prev, Column.__gt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(
                self._internal.index_spark_columns),
            "__is_monotonic_increasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)])

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=self._internal.index_fields,
        )

        return first_series(DataFrame(internal))
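
A minimal usage sketch of the public property backed by this helper, assuming a running Spark session and `pyspark.pandas` imported as `ps`; the data and expected results are illustrative only:

import pyspark.pandas as ps

psmidx = ps.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
# The helper compares each row with the previous one, level by level from the last level.
print(psmidx.is_monotonic_increasing)  # expected: True (lexicographically increasing)

psmidx2 = ps.MultiIndex.from_tuples([("b", 1), ("a", 2)])
print(psmidx2.is_monotonic_increasing)  # expected: False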
Example #2
def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    spark_type = BooleanType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        scol = F.when(index_ops.spark.column.isNull(),
                      SF.lit(False)).otherwise(
                          index_ops.spark.column.cast(spark_type))
    return index_ops._with_new_scol(
        scol,
        field=index_ops._internal.data_fields[0].copy(dtype=dtype,
                                                      spark_type=spark_type))
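
A plain PySpark sketch of the null-handling pattern above, using the public `F.lit` in place of the internal `SF.lit`; the column name `v` and the data are hypothetical:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import BooleanType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1,), (0,), (None,)], ["v"])

# Nulls become False, everything else is cast to BooleanType, as in the non-extension branch.
casted = F.when(F.col("v").isNull(), F.lit(False)).otherwise(F.col("v").cast(BooleanType()))
sdf.select(casted.alias("v_bool")).show()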
Example #3
 def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     _sanitize_list_like(right)
     # Note that date subtraction casts arguments to integer. This is to mimic pandas'
     # behavior: pandas returns 'timedelta64[ns]' in days from date subtraction.
     msg = ("Note that there is a behavior difference of date subtraction. "
            "The date subtraction returns an integer in days, "
            "whereas pandas returns 'timedelta64[ns]'.")
     if isinstance(right, datetime.date) and not isinstance(
             right, datetime.datetime):
         warnings.warn(msg, UserWarning)
         return -column_op(F.datediff)(left, SF.lit(right)).astype("long")
     else:
         raise TypeError(
             "Date subtraction can only be applied to date series.")
Example #4
def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(BooleanType())
    else:
        scol = F.when(index_ops.spark.column.isNull(),
                      SF.lit(False)).otherwise(
                          index_ops.spark.column.cast(BooleanType()))
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
Example #5
 def floordiv(left: Column, right: Any) -> Column:
     return F.when(SF.lit(right is np.nan), np.nan).otherwise(
         F.when(
             SF.lit(right != 0) | SF.lit(right).isNull(), F.floor(left.__div__(right))
         ).otherwise(
             F.when(SF.lit(left == np.inf) | SF.lit(left == -np.inf), left).otherwise(
                 SF.lit(np.inf).__div__(left)
             )
         )
     )
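
A short sketch of ordinary floor division through the public operator (illustrative numbers); the special cases above only apply to division by zero and infinities:

import pyspark.pandas as ps

psser = ps.Series([7.0, -7.0])
print((psser // 2).to_list())  # expected: [3.0, -4.0], matching pandas' floor semantics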
Example #6
 def rsub(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
     # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
     # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
     msg = (
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.")
     if isinstance(right, datetime.datetime):
         warnings.warn(msg, UserWarning)
         return cast(
             IndexOpsLike,
             left.spark.transform(lambda scol: SF.lit(right).cast(
                 as_spark_type("long")) - scol.astype("long")),
         )
     else:
         raise TypeError(
             "datetime subtraction can only be applied to datetime series.")
Example #7
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
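
A hedged sketch of the non-extension branch above for a float Series: null and NaN become True, mirroring the fact that `bool(float('nan'))` is True in pandas; values and output are illustrative:

import numpy as np
import pyspark.pandas as ps

psser = ps.Series([0.0, 1.0, np.nan])
print(psser.astype(bool).to_list())  # expected: [False, True, True]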
Example #8
 def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
     # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
     msg = (
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.")
     if isinstance(right, datetime.datetime):
         warnings.warn(msg, UserWarning)
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 SF.lit(right).cast(LongType()) -
                 left.spark.column.cast(LongType()),
                 field=left._internal.data_fields[0].copy(
                     dtype=np.dtype("int64"), spark_type=LongType()),
             ),
         )
     else:
         raise TypeError(
             "Datetime subtraction can only be applied to datetime series.")
Example #9
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                    F.length(index_ops.spark.column) > 0
                )
            return index_ops._with_new_scol(
                scol,
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
            return _as_string_type(index_ops, dtype, null_str=null_str)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
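
A sketch of the string-to-bool branch above: non-empty strings become True, while empty strings and nulls become False in the non-extension case; values and output are illustrative:

import pyspark.pandas as ps

psser = ps.Series(["spark", "", None])
print(psser.astype(bool).to_list())  # expected: [True, False, False]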
Example #10
    def astype(self, index_ops: T_IndexOps, dtype: Union[str, type,
                                                         Dtype]) -> T_IndexOps:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(),
                              SF.lit(False)).otherwise(
                                  F.length(index_ops.spark.column) > 0)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #11
 def test_lit(self):
     self.assertTrue(
         spark_column_equals(SF.lit(np.int64(1)),
                             F.lit(1).astype(LongType())))
     self.assertTrue(
         spark_column_equals(SF.lit(np.int32(1)),
                             F.lit(1).astype(IntegerType())))
     self.assertTrue(
         spark_column_equals(SF.lit(np.int8(1)),
                             F.lit(1).astype(ByteType())))
     self.assertTrue(
         spark_column_equals(SF.lit(np.byte(1)),
                             F.lit(1).astype(ByteType())))
     self.assertTrue(
         spark_column_equals(SF.lit(np.float32(1)),
                             F.lit(float(1)).astype(FloatType())))
     self.assertTrue(spark_column_equals(SF.lit(1), F.lit(1)))
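
A hedged sketch of calling the helper directly; the import path follows the convention used in these snippets (`pyspark.pandas.spark.functions` as `SF`), but this is an internal API and may change between versions:

import numpy as np
from pyspark.pandas.spark import functions as SF

# SF.lit chooses a Spark type matching the numpy scalar's width,
# e.g. np.int8 -> ByteType, per the test assertions above.
col = SF.lit(np.int8(1))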
Example #12
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)
        if get_option("compute.eager_check"):
            if is_integer_dtype(dtype) and not isinstance(
                    dtype, extension_dtypes):
                if index_ops.hasnans:
                    raise ValueError(
                        "Cannot convert %s with missing values to integer" %
                        self.pretty_name)
            elif is_bool_dtype(dtype) and not isinstance(
                    dtype, extension_dtypes):
                if index_ops.hasnans:
                    raise ValueError(
                        "Cannot convert %s with missing values to bool" %
                        self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull()
                    | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #13
 def convert_arguments(*args):
     args = [
         SF.lit(inp) if not isinstance(inp, Column) else inp
         for inp in args
     ]
     return np_spark_map_func(*args)
Example #14
 def pow_func(left: Column, right: Any) -> Column:
     return (
         F.when(left == 1, left)
         .when(SF.lit(right) == 0, 1)
         .otherwise(Column.__pow__(left, right))
     )
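
A sketch of the corner cases this special-casing mirrors; per pandas/numpy semantics, 1 ** NaN and NaN ** 0 both evaluate to 1.0. Values and expected outputs are illustrative, not verified:

import numpy as np
import pyspark.pandas as ps

print((ps.Series([1.0, 4.0]) ** np.nan).to_list())  # expected: [1.0, nan]
print((ps.Series([np.nan]) ** 0).to_list())         # expected: [1.0]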
Example #15
 def rfloordiv(left: Column, right: Any) -> Column:
     return F.when(SF.lit(left == 0), SF.lit(np.inf).__div__(right)).otherwise(
         F.when(SF.lit(left) == np.nan, np.nan).otherwise(
             F.floor(SF.lit(right).__div__(left))
         )
     )
Example #16
 def rtruediv(left: Column, right: Any) -> Column:
     return F.when(left == 0, SF.lit(np.inf).__div__(right)).otherwise(
         SF.lit(right).__truediv__(left)
     )
Example #17
 def truediv(left: Column, right: Any) -> Column:
     return F.when(
         SF.lit(right != 0) | SF.lit(right).isNull(), left.__div__(right)
     ).otherwise(SF.lit(np.inf).__div__(left))
Example #18
 def rpow_func(left: Column, right: Any) -> Column:
     return F.when(SF.lit(right == 1), right).otherwise(Column.__rpow__(left, right))
Example #19
def align_diff_frames(
    resolve_func: Callable[
        ["DataFrame", List[Tuple], List[Tuple]], Iterator[Tuple["Series", Tuple]]
    ],
    this: "DataFrame",
    that: "DataFrame",
    fillna: bool = True,
    how: str = "full",
    preserve_order_column: bool = False,
) -> "DataFrame":
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, the option `compute.ops_on_diff_frames` currently has to be set to True.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from pyspark.pandas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> psdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> psdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(psdf, this_column_labels, that_column_labels):
        ...    psdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `psdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from psdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from psdf2.
        ...    new_series = (psdf[this_label] - psdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, psdf1, psdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, the non-common columns are left as they are.
    :param how: the join method. In addition, it affects how `resolve_func` resolves the column
        conflict.
        - full: `resolve_func` should resolve only common columns from the 'this' and 'that'
            DataFrames. For instance, if 'this' has columns A, B, C and 'that' has B, C, D,
            `this_columns` and `that_columns` in this function are both B, C.
        - left: `resolve_func` should resolve columns including the 'that' columns.
            For instance, if 'this' has columns A, B, C and 'that' has B, C, D, `this_columns` is
            B, C but `that_columns` is B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :return: Aligned DataFrame
    """
    from pyspark.pandas.frame import DataFrame

    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []  # type: List[Tuple]
    this_columns_to_apply = []  # type: List[Tuple]
    additional_that_columns = []  # type: List[Tuple]
    columns_to_keep = []  # type: List[Union[Series, Column]]
    column_labels_to_keep = []  # type: List[Tuple]

    for combined_label in combined_column_labels:
        for common_label in common_column_labels:
            if combined_label == tuple(["this", *common_label]):
                this_columns_to_apply.append(combined_label)
                break
            elif combined_label == tuple(["that", *common_label]):
                that_columns_to_apply.append(combined_label)
                break
        else:
            if how == "left" and combined_label in [
                tuple(["that", *label]) for label in that_column_labels
            ]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but pass
                # them later to `resolve_func`, which should resolve them.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_label)
            elif fillna:
                columns_to_keep.append(SF.lit(None).cast(DoubleType()).alias(str(combined_label)))
                column_labels_to_keep.append(combined_label)
            else:
                columns_to_keep.append(combined._psser_for(combined_label))
                column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Extract the columns to apply and process them in a batch, in case
    # the function adds new columns, for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        psser_set, column_labels_set = zip(
            *resolve_func(combined, this_columns_to_apply, that_columns_to_apply)
        )
        columns_applied = list(psser_set)  # type: List[Union[Series, Column]]
        column_labels_applied = list(column_labels_set)  # type: List[Tuple]
    else:
        columns_applied = []
        column_labels_applied = []

    applied = DataFrame(
        combined._internal.with_new_columns(
            columns_applied + columns_to_keep,
            column_labels=column_labels_applied + column_labels_to_keep,
        )
    )  # type: DataFrame

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in the order of the original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, add the remaining columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    psdf = applied[list(this_labels.values()) + list(other_labels.values())]
    psdf.columns = psdf.columns.droplevel()
    return psdf
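
For context, this alignment is what backs ordinary binary operations between two different DataFrames once `compute.ops_on_diff_frames` is enabled; a minimal sketch along the lines of the doctest above:

import pyspark.pandas as ps
from pyspark.pandas.config import set_option, reset_option

set_option("compute.ops_on_diff_frames", True)
psdf1 = ps.DataFrame({"a": [1, 2, 3]})
psdf2 = ps.DataFrame({"a": [1, 1, 1]})
# Internally the two frames are joined and their columns aligned, much as in align_diff_frames.
print((psdf1 - psdf2).sort_index())
reset_option("compute.ops_on_diff_frames")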
Example #20
 def rpow_func(left: Column, right: Any) -> Column:
     return (F.when(left.isNull(),
                    np.nan).when(SF.lit(right == 1), right).otherwise(
                        Column.__rpow__(left, right)))
Example #21
 def or_func(left: Column, right: Any) -> Column:
     if not isinstance(right, Column) and pd.isna(right):
         return SF.lit(False)
     else:
         scol = left | SF.lit(right)
         return F.when(left.isNull() | scol.isNull(), False).otherwise(scol)
Example #22
    def compute_hist(psdf, bins):
        # `psdf` is a pandas-on-Spark DataFrame; bin counts are computed for each of its columns.
        assert isinstance(bins, (np.ndarray, np.generic))

        sdf = psdf._internal.spark_frame
        scols = []
        input_column_names = []
        for label in psdf._internal.column_labels:
            input_column_name = name_like_string(label)
            input_column_names.append(input_column_name)
            scols.append(psdf._internal.spark_column_for(label).alias(input_column_name))
        sdf = sdf.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
            # create a Bucketizer to get the corresponding bin for each value
            bucketizer = Bucketizer(
                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
            )

            bucket_df = bucketizer.transform(sdf)

            if output_df is None:
                output_df = bucket_df.select(
                    SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                )
            else:
                output_df = output_df.union(
                    bucket_df.select(
                        SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
                    )
                )

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        result = (
            output_df.groupby("__group_id", "__bucket")
            .agg(F.count("*").alias("count"))
            .toPandas()
            .sort_values(by=["__group_id", "__bucket"])
        )

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        output_series = []
        for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
            current_bucket_result = result[result["__group_id"] == i]
            # generates a pandas DF with one row for each bin
            # we need this as some of the bins may be empty
            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
            # merges the bins with counts on it and fills remaining ones with zeros
            pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
                ["count"]
            ]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series
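
A standalone sketch of the Bucketizer step used above, in plain PySpark; the column name, splits, and data are hypothetical:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(0.5,), (1.5,), (2.5,), (3.5,)], ["values1"])

# Four bins: [0, 1), [1, 2), [2, 3), [3, 4]; handleInvalid="skip" drops NaN/out-of-range rows.
bucketizer = Bucketizer(
    splits=[0.0, 1.0, 2.0, 3.0, 4.0],
    inputCol="values1",
    outputCol="__values1_bucket",
    handleInvalid="skip",
)
bucketizer.transform(sdf).show()  # bucket indices 0.0, 1.0, 2.0, 3.0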
Example #23
    def set_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        rename: bool = False,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which will result in
        unused categories) or remove old categories (which results in values
        set to NaN). If `rename==True`, the categories will simply be renamed
        (fewer or more items than in the old categories will result in values
        set to NaN or in unused categories, respectively).

        This method can be used to perform more than one action of adding,
        removing, and reordering simultaneously and is therefore faster than
        performing the individual steps via the more specialised methods.

        On the other hand, this method does not do checks (e.g., whether the
        old categories are included in the new categories on a reorder), which
        can result in surprising changes, for example when using special string
        dtypes, which do not consider an S1 string equal to a single-character
        Python string.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, default False
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        rename : bool, default False
           Whether or not the new_categories should be considered as a rename
           of the old categories or as reordered categories.
        inplace : bool, default False
           Whether or not to reorder the categories in-place or return a copy
           of this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series with reordered categories or None if inplace.

        Raises
        ------
        ValueError
            If new_categories does not validate as categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
        0    NaN
        1      b
        2      b
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['b', 'c']

        >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1, 2, 3]

        >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1 < 2 < 3]
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in set_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
            )

        if ordered is None:
            ordered = self.ordered

        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
        scol = self._data.spark.column

        if rename:
            new_scol = (
                F.when(scol >= len(new_categories), SF.lit(-1).cast(self._data.spark.data_type))
                .otherwise(scol)
                .alias(self._data._internal.data_spark_column_names[0])
            )

            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                new_scol,
                field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
            )

            if inplace:
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return DataFrame(internal)._psser_for(self._data._column_label).copy()
        else:
            psser = self._data.astype(new_dtype)
            if inplace:
                internal = self._data._psdf._internal.with_new_spark_column(
                    self._data._column_label,
                    psser.spark.column,
                    field=psser._internal.data_fields[0],
                )
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                return psser