Python examples of pyspark.pandas.spark.functions.lit

Example #1

0

Show file

File: multi.py Project: zoelin7/spark

    def _is_monotonic_increasing(self) -> Series:
        """Return a boolean Series flagging, per row, the monotonic-increasing check.

        Every index level is compared with its value on the preceding row
        (in natural order) through a one-row lag window. A row is flagged
        true when none of its levels is null and either the lagged value is
        null or the nested level-by-level comparison holds.
        """
        prev_row_window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        ordered = SF.lit(True)
        no_null_level = SF.lit(True)
        # Iterate from the last level to the first so that the first level ends
        # up as the outermost comparison of the nested `when` expression.
        for level_col in reversed(self._internal.index_spark_columns):
            level_type = self._internal.spark_type_for(level_col)
            prev = F.lag(level_col, 1).over(prev_row_window)
            compare = MultiIndex._comparator_for_monotonic_increasing(level_type)
            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
            # Therefore, every level contributes to the not-null check.
            no_null_level = no_null_level & level_col.isNotNull()
            # Equal to the previous row on this level: defer to the inner levels;
            # otherwise this level decides through the comparator.
            ordered = F.when(level_col.eqNullSafe(prev), ordered).otherwise(
                compare(level_col, prev, Column.__gt__)
            )

        # `prev` is the lagged column of the last iteration, i.e. the first index
        # level; a row whose lagged value is null (such as the first row) passes.
        flag = no_null_level & (prev.isNull() | ordered)

        index_columns = self._internal.index_spark_columns
        flag_name = verify_temp_column_name(
            self._internal.spark_frame.select(index_columns),
            "__is_monotonic_increasing_cond__",
        )

        sdf = self._internal.spark_frame.select(index_columns + [flag.alias(flag_name)])

        resolved_index_columns = [
            scol_for(sdf, name) for name in self._internal.index_spark_column_names
        ]
        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=resolved_index_columns,
            index_names=self._internal.index_names,
            index_fields=self._internal.index_fields,
        )

        return first_series(DataFrame(internal))

Example #2

0

Show file

File: base.py Project: soumya1984/spark

def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`.

    For extension dtypes the column is cast directly. For any other dtype,
    null values are mapped to False and the remaining values are cast.
    """
    bool_type = BooleanType()
    bool_col = index_ops.spark.column.cast(bool_type)
    if not isinstance(dtype, extension_dtypes):
        # Replace nulls with False instead of letting them survive the cast.
        bool_col = F.when(index_ops.spark.column.isNull(),
                          SF.lit(False)).otherwise(bool_col)

    bool_field = index_ops._internal.data_fields[0].copy(dtype=dtype,
                                                         spark_type=bool_type)
    return index_ops._with_new_scol(bool_col, field=bool_field)

Example #3

0

Show file

File: date_ops.py Project: goncaloperes/Framework_Spark

 def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     """Reverse subtraction for date data: negated ``F.datediff(left, right)`` as long.

     Only a plain ``datetime.date`` (not a ``datetime.datetime``) is accepted
     for `right`; anything else raises ``TypeError``. A ``UserWarning`` about
     the difference from pandas is emitted before the result is returned.
     """
     _sanitize_list_like(right)

     is_plain_date = isinstance(right, datetime.date) and not isinstance(
         right, datetime.datetime)
     if not is_plain_date:
         raise TypeError(
             "Date subtraction can only be applied to date series.")

     # Note that date subtraction casts arguments to integer. This is to mimic pandas's
     # behaviors. pandas returns 'timedelta64[ns]' in days from date's subtraction.
     warnings.warn(
         "Note that there is a behavior difference of date subtraction. "
         "The date subtraction returns an integer in days, "
         "whereas pandas returns 'timedelta64[ns]'.",
         UserWarning,
     )
     day_diff = column_op(F.datediff)(left, SF.lit(right)).astype("long")
     return -day_diff

Example #4

0

Show file

File: base.py Project: wwjiang007/spark

def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`.

    For extension dtypes the column is cast directly. For any other dtype,
    null values are mapped to False and the remaining values are cast. The
    resulting column keeps the name of the first data column.
    """
    from pyspark.pandas.internal import InternalField

    source_col = index_ops.spark.column
    bool_col = source_col.cast(BooleanType())
    if not isinstance(dtype, extension_dtypes):
        # Replace nulls with False instead of letting them survive the cast.
        bool_col = F.when(source_col.isNull(),
                          SF.lit(False)).otherwise(bool_col)

    column_name = index_ops._internal.data_spark_column_names[0]
    return index_ops._with_new_scol(
        bool_col.alias(column_name),
        field=InternalField(dtype=dtype),
    )

Example #5

0

Show file

 def floordiv(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``left // right``.

     - `right` is the ``np.nan`` object: NaN.
     - `right` is non-zero or null: ``floor(left / right)``.
     - otherwise: `left` itself when it is +/-inf, else ``inf / left``.
     """
     right_is_nan = SF.lit(right is np.nan)
     divisor_usable = SF.lit(right != 0) | SF.lit(right).isNull()
     left_is_infinite = SF.lit(left == np.inf) | SF.lit(left == -np.inf)

     # Fallback used when the divisor is neither non-zero nor null.
     fallback = F.when(left_is_infinite, left).otherwise(
         SF.lit(np.inf).__div__(left)
     )
     floored = F.when(divisor_usable, F.floor(left.__div__(right))).otherwise(fallback)
     return F.when(right_is_nan, np.nan).otherwise(floored)

Example #6

0

Show file

File: datetime_ops.py Project: williamhyun/spark

 def rsub(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
     """Reverse subtraction for timestamp data: ``right - left`` on long-cast values.

     `right` must be a ``datetime.datetime``; anything else raises
     ``TypeError``. A ``UserWarning`` about the difference from pandas is
     emitted before the result is returned.
     """
     if not isinstance(right, datetime.datetime):
         raise TypeError(
             "datetime subtraction can only be applied to datetime series.")

     # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
     # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
     warnings.warn(
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.",
         UserWarning,
     )

     def subtract_from_literal(scol):
         # Both operands are cast to long before subtracting.
         return SF.lit(right).cast(as_spark_type("long")) - scol.astype("long")

     return cast(IndexOpsLike, left.spark.transform(subtract_from_literal))

Example #7

0

Show file

    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        """Cast `index_ops` to `dtype`, dispatching on the resolved Spark type.

        Categorical, string and all remaining targets are delegated to
        `_as_categorical_type`, `_as_string_type` and `_as_other_type`;
        the boolean target is built inline.
        """
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            source = index_ops.spark.column
            bool_col = source.cast(spark_type)
            if not isinstance(dtype, extension_dtypes):
                # For non-extension dtypes, null and NaN inputs map to True.
                bool_col = F.when(
                    source.isNull() | F.isnan(source),
                    SF.lit(True),
                ).otherwise(bool_col)
            column_name = index_ops._internal.data_spark_column_names[0]
            bool_field = index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type
            )
            return index_ops._with_new_scol(bool_col.alias(column_name), field=bool_field)

        if isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))

        return _as_other_type(index_ops, dtype, spark_type)

Example #8

0

Show file

 def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     """Reverse subtraction for timestamp data: ``right - left`` on long-cast values.

     `right` must be a ``datetime.datetime``; anything else raises
     ``TypeError``. The result is tagged with an ``int64`` / ``LongType``
     field, and a ``UserWarning`` about the difference from pandas is emitted.
     """
     if not isinstance(right, datetime.datetime):
         raise TypeError(
             "Datetime subtraction can only be applied to datetime series.")

     # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
     # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
     warnings.warn(
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.",
         UserWarning,
     )

     difference = SF.lit(right).cast(LongType()) - left.spark.column.cast(LongType())
     long_field = left._internal.data_fields[0].copy(
         dtype=np.dtype("int64"), spark_type=LongType())
     return cast(SeriesOrIndex, left._with_new_scol(difference, field=long_field))

Example #9

0

Show file

    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        """Cast string `index_ops` to `dtype`, dispatching on the resolved Spark type.

        Categorical, string and all remaining targets are delegated to
        `_as_categorical_type`, `_as_string_type` and `_as_other_type`;
        the boolean target is built inline.
        """
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            source = index_ops.spark.column
            if isinstance(dtype, extension_dtypes):
                bool_col = source.cast(spark_type)
            else:
                # Null maps to False; otherwise the value is True exactly when
                # the string has a positive length.
                bool_col = F.when(source.isNull(), SF.lit(False)).otherwise(
                    F.length(source) > 0
                )
            bool_field = index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type
            )
            return index_ops._with_new_scol(bool_col, field=bool_field)

        if isinstance(spark_type, StringType):
            # The textual stand-in for nulls depends on which ops class is in use.
            if isinstance(self, StringExtensionOps):
                null_str = str(pd.NA)
            else:
                null_str = str(None)
            return _as_string_type(index_ops, dtype, null_str=null_str)

        return _as_other_type(index_ops, dtype, spark_type)

Example #10

0

Show file

File: string_ops.py Project: williamhyun/spark

    def astype(self, index_ops: T_IndexOps, dtype: Union[str, type,
                                                         Dtype]) -> T_IndexOps:
        """Cast string `index_ops` to `dtype`, dispatching on the resolved Spark type.

        Categorical, string and all remaining targets are delegated to
        `_as_categorical_type`, `_as_string_type` and `_as_other_type`;
        the boolean target is built inline.
        """
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype)

        if not isinstance(spark_type, BooleanType):
            return _as_other_type(index_ops, dtype, spark_type)

        source = index_ops.spark.column
        if isinstance(dtype, extension_dtypes):
            bool_col = source.cast(spark_type)
        else:
            # Null maps to False; otherwise the value is True exactly when
            # the string has a positive length.
            bool_col = F.when(source.isNull(),
                              SF.lit(False)).otherwise(F.length(source) > 0)

        column_name = index_ops._internal.data_spark_column_names[0]
        return index_ops._with_new_scol(
            bool_col.alias(column_name),
            field=InternalField(dtype=dtype),
        )

Example #11

0

Show file

 def test_lit(self):
     """Check that `SF.lit` yields the expected Spark column for numpy and plain ints."""
     # Each pair is (input passed to SF.lit, column it is expected to equal).
     expectations = [
         (np.int64(1), F.lit(1).astype(LongType())),
         (np.int32(1), F.lit(1).astype(IntegerType())),
         (np.int8(1), F.lit(1).astype(ByteType())),
         (np.byte(1), F.lit(1).astype(ByteType())),
         (np.float32(1), F.lit(float(1)).astype(FloatType())),
         (1, F.lit(1)),
     ]
     for value, expected in expectations:
         self.assertTrue(spark_column_equals(SF.lit(value), expected))

Example #12

0

Show file

File: num_ops.py Project: zhengruifeng/spark

    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        """Cast `index_ops` to `dtype`, dispatching on the resolved Spark type.

        When the ``compute.eager_check`` option is on, casting data that has
        missing values to a non-extension integer or bool dtype raises
        ``ValueError``. Categorical, string and all remaining targets are
        delegated to `_as_categorical_type`, `_as_string_type` and
        `_as_other_type`; the boolean target is built inline.
        """
        dtype, spark_type = pandas_on_spark_type(dtype)

        # Extension dtypes are never subject to the eager missing-value check.
        if get_option("compute.eager_check") and not isinstance(
                dtype, extension_dtypes):
            if is_integer_dtype(dtype):
                if index_ops.hasnans:
                    raise ValueError(
                        "Cannot convert %s with missing values to integer" %
                        self.pretty_name)
            elif is_bool_dtype(dtype):
                if index_ops.hasnans:
                    raise ValueError(
                        "Cannot convert %s with missing values to bool" %
                        self.pretty_name)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            source = index_ops.spark.column
            bool_col = source.cast(spark_type)
            if not isinstance(dtype, extension_dtypes):
                # For non-extension dtypes, null and NaN inputs map to True.
                bool_col = F.when(
                    source.isNull() | F.isnan(source),
                    SF.lit(True),
                ).otherwise(bool_col)
            column_name = index_ops._internal.data_spark_column_names[0]
            bool_field = index_ops._internal.data_fields[0].copy(
                dtype=dtype, spark_type=spark_type)
            return index_ops._with_new_scol(bool_col.alias(column_name),
                                            field=bool_field)

        if isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))

        return _as_other_type(index_ops, dtype, spark_type)

Example #13

0

Show file

File: numpy_compat.py Project: zoelin7/spark

 def convert_arguments(*args):
     """Wrap every non-Column argument with `SF.lit`, then call `np_spark_map_func`."""
     columns = [
         arg if isinstance(arg, Column) else SF.lit(arg)
         for arg in args
     ]
     return np_spark_map_func(*columns)

Example #14

0

Show file

 def pow_func(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``left ** right``.

     Yields `left` where `left` equals 1, 1 where `right` equals 0, and
     ``Column.__pow__(left, right)`` otherwise.
     """
     base_is_one = left == 1
     exponent_is_zero = SF.lit(right) == 0
     power = Column.__pow__(left, right)
     return F.when(base_is_one, left).when(exponent_is_zero, 1).otherwise(power)

Example #15

0

Show file

 def rfloordiv(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``right // left``.

     Yields ``inf / right`` where `left` equals 0, NaN where `left` compares
     equal to NaN, and ``floor(right / left)`` otherwise.
     """
     floored = F.floor(SF.lit(right).__div__(left))
     non_zero_branch = F.when(SF.lit(left) == np.nan, np.nan).otherwise(floored)
     zero_branch = SF.lit(np.inf).__div__(right)
     return F.when(SF.lit(left == 0), zero_branch).otherwise(non_zero_branch)

Example #16

0

Show file

 def rtruediv(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``right / left``.

     Yields ``inf / right`` where `left` equals 0, and ``right / left``
     otherwise.
     """
     zero_branch = SF.lit(np.inf).__div__(right)
     quotient = SF.lit(right).__truediv__(left)
     return F.when(left == 0, zero_branch).otherwise(quotient)

Example #17

0

Show file

 def truediv(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``left / right``.

     Yields ``left / right`` where `right` is non-zero or null, and
     ``inf / left`` otherwise.
     """
     divisor_usable = SF.lit(right != 0) | SF.lit(right).isNull()
     quotient = left.__div__(right)
     fallback = SF.lit(np.inf).__div__(left)
     return F.when(divisor_usable, quotient).otherwise(fallback)

Example #18

0

Show file

 def rpow_func(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``right ** left``.

     Yields `right` where `right` equals 1, and
     ``Column.__rpow__(left, right)`` otherwise.
     """
     base_is_one = SF.lit(right == 1)
     power = Column.__rpow__(left, right)
     return F.when(base_is_one, right).otherwise(power)

Example #19

0

Show file

File: utils.py Project: jerqi/spark

def align_diff_frames(
    resolve_func: Callable[
        ["DataFrame", List[Tuple], List[Tuple]], Iterator[Tuple["Series", Tuple]]
    ],
    this: "DataFrame",
    that: "DataFrame",
    fillna: bool = True,
    how: str = "full",
    preserve_order_column: bool = False,
) -> "DataFrame":
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from pyspark.pandas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> psdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> psdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(psdf, this_column_labels, that_column_labels):
        ...    psdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `psdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from psdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from psdf2.
        ...    new_series = (psdf[this_label] - psdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, psdf1, psdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, it returns as are.
    :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
            'that_columns' in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including that columns.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :param preserve_order_column: forwarded as-is to `combine_frames`.
    :return: Aligned DataFrame
    """
    from pyspark.pandas.frame import DataFrame

    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    # Built once so that the per-label checks below are set lookups instead of
    # rebuilding and scanning a list for every combined label.
    that_column_label_set = set(that_column_labels)
    common_column_labels = set(this_column_labels).intersection(that_column_label_set)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []  # type: List[Tuple]
    this_columns_to_apply = []  # type: List[Tuple]
    additional_that_columns = []  # type: List[Tuple]
    columns_to_keep = []  # type: List[Union[Series, Column]]
    column_labels_to_keep = []  # type: List[Tuple]

    for combined_label in combined_column_labels:
        # A combined label is the original label prefixed with "this" or "that".
        side = combined_label[:1]
        original_label = tuple(combined_label[1:])
        is_common = original_label in common_column_labels

        if is_common and side == ("this",):
            this_columns_to_apply.append(combined_label)
        elif is_common and side == ("that",):
            that_columns_to_apply.append(combined_label)
        elif how == "left" and side == ("that",) and original_label in that_column_label_set:
            # In this case, we will drop `that_columns` in `columns_to_keep` but passes
            # it later to `func`. `func` should resolve it.
            # Note that adding this into a separate list (`additional_that_columns`)
            # is intentional so that `this_columns` and `that_columns` can be paired.
            additional_that_columns.append(combined_label)
        elif fillna:
            columns_to_keep.append(SF.lit(None).cast(DoubleType()).alias(str(combined_label)))
            column_labels_to_keep.append(combined_label)
        else:
            columns_to_keep.append(combined._psser_for(combined_label))
            column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        psser_set, column_labels_set = zip(
            *resolve_func(combined, this_columns_to_apply, that_columns_to_apply)
        )
        columns_applied = list(psser_set)  # type: List[Union[Series, Column]]
        column_labels_applied = list(column_labels_set)  # type: List[Tuple]
    else:
        columns_applied = []
        column_labels_applied = []

    applied = DataFrame(
        combined._internal.with_new_columns(
            columns_applied + columns_to_keep,
            column_labels=column_labels_applied + column_labels_to_keep,
        )
    )  # type: DataFrame

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    psdf = applied[list(this_labels.values()) + list(other_labels.values())]
    psdf.columns = psdf.columns.droplevel()
    return psdf

Example #20

0

Show file

File: num_ops.py Project: zhengruifeng/spark

 def rpow_func(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``right ** left``.

     Yields NaN where `left` is null, `right` where `right` equals 1, and
     ``Column.__rpow__(left, right)`` otherwise.
     """
     base_is_one = SF.lit(right == 1)
     power = Column.__rpow__(left, right)
     return F.when(left.isNull(), np.nan).when(base_is_one, right).otherwise(power)

Example #21

0

Show file

 def or_func(left: Column, right: Any) -> Column:
     """Build the Spark expression for ``left | right``.

     A non-Column `right` for which ``pd.isna`` is true yields a literal
     False. Otherwise the result is ``left | right``, with False wherever
     `left` or the combined value is null.
     """
     if isinstance(right, Column) or not pd.isna(right):
         combined = left | SF.lit(right)
         either_null = left.isNull() | combined.isNull()
         return F.when(either_null, False).otherwise(combined)
     return SF.lit(False)

Example #22

0

Show file

File: core.py Project: yangrong688/spark

    def compute_hist(psdf, bins):
        """Count, for every column of `psdf`, how many values fall into each bin.

        `bins` must be a numpy array (or numpy scalar type) of split points.
        Returns a list with one pandas Series per input column, named after
        that column, built from one row per bin; bins without any value are
        filled with 0.
        """
        assert isinstance(bins, (np.ndarray, np.generic))

        # Project the frame down to the data columns, named by their labels.
        labels = psdf._internal.column_labels
        input_column_names = [name_like_string(label) for label in labels]
        scols = [
            psdf._internal.spark_column_for(label).alias(name)
            for label, name in zip(labels, input_column_names)
        ]
        sdf = psdf._internal.spark_frame.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
            # A Bucketizer assigns each value of the column to its bin.
            bucketizer = Bucketizer(
                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
            )
            flat = bucketizer.transform(sdf).select(
                SF.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
            )
            # The first column starts the result; later columns are unioned onto it.
            output_df = flat if output_df is None else output_df.union(flat)

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        counts = output_df.groupby("__group_id", "__bucket").agg(F.count("*").alias("count"))
        result = counts.toPandas().sort_values(by=["__group_id", "__bucket"])

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        # One row per bin; needed because some of the bins may be empty.
        all_bins = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})

        output_series = []
        for group_id, input_column_name in enumerate(input_column_names):
            group_counts = result[result["__group_id"] == group_id]
            # Left-merge the counts onto the full bin list and zero-fill the gaps.
            merged = all_bins.merge(group_counts, how="left", on=["__bucket"]).fillna(0)
            pdf = merged[["count"]]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series

Example #23

0

Show file

File: categorical.py Project: zzzhy/spark

    def set_categories(
        self,
        new_categories: Union[pd.Index, List],
        ordered: Optional[bool] = None,
        rename: bool = False,
        inplace: bool = False,
    ) -> Optional["ps.Series"]:
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which will result in
        unused categories) or remove old categories (which results in values
        set to NaN). If `rename==True`, the categories will simply be renamed
        (fewer or more items than in the old categories will result in values
        set to NaN or in unused categories respectively).

        This method can be used to perform more than one action of adding,
        removing, and reordering simultaneously and is therefore faster than
        performing the individual steps via the more specialised methods.

        On the other hand, this method does not do checks (e.g., whether the
        old categories are included in the new categories on a reorder), which
        can result in surprising changes, for example when using special string
        dtypes, which do not consider an S1 string equal to a single-char
        Python string.

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : bool, optional
           Whether or not the categorical is treated as an ordered categorical.
           If not given, do not change the ordered information.
        rename : bool, default False
           Whether or not the new_categories should be considered as a rename
           of the old categories or as reordered categories.
        inplace : bool, default False
           Whether or not to reorder the categories in-place or return a copy
           of this categorical with reordered categories.

           .. deprecated:: 3.2.0

        Returns
        -------
        Series with reordered categories or None if inplace.

        Raises
        ------
        TypeError
            If new_categories is not list-like
        ValueError
            If new_categories does not validate as categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.set_categories(['b', 'c'])  # doctest: +SKIP
        0    NaN
        1      b
        2      b
        3      c
        4      c
        5      c
        dtype: category
        Categories (2, object): ['b', 'c']

        >>> s.cat.set_categories([1, 2, 3], rename=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1, 2, 3]

        >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True)  # doctest: +SKIP
        0    1
        1    2
        2    2
        3    3
        4    3
        5    3
        dtype: category
        Categories (3, int64): [1 < 2 < 3]
        """
        from pyspark.pandas.frame import DataFrame

        if inplace:
            warnings.warn(
                "The `inplace` parameter in set_categories is deprecated "
                "and will be removed in a future version.",
                FutureWarning,
            )

        if not is_list_like(new_categories):
            raise TypeError(
                "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories)
            )

        # Keep the current orderedness unless the caller states one explicitly.
        effective_ordered = self.ordered if ordered is None else ordered

        new_dtype = CategoricalDtype(new_categories, ordered=effective_ordered)
        scol = self._data.spark.column
        label = self._data._column_label

        if rename:
            # Values at or beyond the new category count are replaced with -1;
            # all other values are kept unchanged.
            out_of_range = scol >= len(new_categories)
            missing_code = SF.lit(-1).cast(self._data.spark.data_type)
            new_scol = (
                F.when(out_of_range, missing_code)
                .otherwise(scol)
                .alias(self._data._internal.data_spark_column_names[0])
            )

            internal = self._data._psdf._internal.with_new_spark_column(
                label,
                new_scol,
                field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
            )

            if not inplace:
                return DataFrame(internal)._psser_for(label).copy()
            self._data._psdf._update_internal_frame(internal)
            return None

        # Not a rename: the cast to the new categorical dtype does the work.
        psser = self._data.astype(new_dtype)
        if not inplace:
            return psser

        internal = self._data._psdf._internal.with_new_spark_column(
            label,
            psser.spark.column,
            field=psser._internal.data_fields[0],
        )
        self._data._psdf._update_internal_frame(internal)
        return None