Example #1
 def nan_to_null(self, index_ops: IndexOpsLike) -> IndexOpsLike:
     # Handle floating-point types specially: Spark's count treats NaN as a valid
     # value, whereas pandas' count does not include NaN.
     return index_ops._with_new_scol(
         F.nanvl(index_ops.spark.column, SF.lit(None)),
         field=index_ops._internal.data_fields[0].copy(nullable=True),
     )
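The effect of nan_to_null shows up through the public pandas-on-Spark API. A minimal sketch, assuming a running PySpark environment where pyspark.pandas is available:

    import pyspark.pandas as ps

    psser = ps.Series([1.0, float("nan"), None])
    # pandas semantics: NaN and None both count as missing, so only the
    # single valid value is counted, even though Spark's count includes NaN.
    print(psser.count())  # 1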
Example #2
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, _ = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
            return index_ops.copy()

        return _to_cat(index_ops).astype(dtype)
Example #3
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, _ = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
            return index_ops.copy()

        categories = cast(CategoricalDtype, index_ops.dtype).categories
        if len(categories) == 0:
            scol = SF.lit(None)
        else:
            kvs = chain(
                *[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)]
            )
            map_scol = F.create_map(*kvs)
            scol = map_scol.getItem(index_ops.spark.column)
        return index_ops._with_new_scol(scol).astype(dtype)
Example #4
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                # seems like a pandas bug?
                scol = F.when(index_ops.spark.column.isNull(),
                              str(pd.NaT)).otherwise(
                                  index_ops.spark.column.cast(spark_type))
            else:
                null_str = str(pd.NaT)
                casted = index_ops.spark.column.cast(spark_type)
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #5
 def isnull(self, index_ops: IndexOpsLike) -> IndexOpsLike:
     return index_ops._with_new_scol(
         index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
         field=index_ops._internal.data_fields[0].copy(
             dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
         ),
     )
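Together with nan_to_null above, this makes fractional isnull flag both SQL nulls and NaNs as missing, matching pandas. A quick sketch under the same assumptions:

    import pyspark.pandas as ps

    psser = ps.Series([1.0, None, float("nan")])
    # Both the null and the NaN are reported as missing.
    print(psser.isnull().to_list())  # [False, True, True]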
Example #6
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
            if index_ops.hasnans:
                raise ValueError(
                    "Cannot convert %s with missing values to integer" % self.pretty_name
                )

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    SF.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #7
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                scol = F.when(
                    index_ops.spark.column.isNotNull(),
                    F.when(index_ops.spark.column, "True").otherwise("False"),
                )
                nullable = index_ops.spark.nullable
            else:
                null_str = str(pd.NA) if isinstance(
                    self, BooleanExtensionOps) else str(None)
                casted = F.when(index_ops.spark.column,
                                "True").otherwise("False")
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
                nullable = False
            return index_ops._with_new_scol(
                scol,
                field=index_ops._internal.data_fields[0].copy(
                    dtype=dtype, spark_type=spark_type, nullable=nullable),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #8
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                scol = F.when(
                    index_ops.spark.column.isNotNull(),
                    F.when(index_ops.spark.column, "True").otherwise("False"),
                )
            else:
                null_str = str(None)
                casted = F.when(index_ops.spark.column,
                                "True").otherwise("False")
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #9
def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike:
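    """Map the integer codes of a categorical `index_ops` back to their category values."""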
    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        kvs = chain(*[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)])
        map_scol = F.create_map(*kvs)
        scol = map_scol[index_ops.spark.column]
    return index_ops._with_new_scol(scol)
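The F.create_map trick interleaves literal keys and values, then indexes the resulting map column with the code column. The same lookup in standalone PySpark, as a sketch that assumes an active SparkSession named spark:

    from itertools import chain

    from pyspark.sql import functions as F

    categories = ["a", "b", "c"]
    df = spark.createDataFrame([(0,), (2,), (1,)], ["code"])
    kvs = chain(*[(F.lit(code), F.lit(cat)) for code, cat in enumerate(categories)])
    # map_col[code_col] looks each code up in the literal code -> category map.
    df.select(F.create_map(*kvs)[F.col("code")].alias("category")).show()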
Example #10
 def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, str):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(F.concat(SF.lit(right), left.spark.column),
                                 field=left._internal.data_fields[0]),
         )
     else:
         raise TypeError("Addition can not be applied to given types.")
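Together with the matching add (Example #19 below), this makes elementwise concatenation work from either side of the operator. A sketch, assuming the pyspark.pandas API:

    import pyspark.pandas as ps

    psser = ps.Series(["a", "b"])
    # add: series + scalar appends; radd: scalar + series prepends.
    print((psser + "!").to_list())  # ['a!', 'b!']
    print(("<" + psser).to_list())  # ['<a', '<b']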
Example #11
 def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     _sanitize_list_like(right)
     if not is_valid_operand_for_numeric_arithmetic(right):
         raise TypeError(
             "Multiplication can not be applied to %s and the given type." % self.pretty_name
         )
     if isinstance(right, bool):
         return left.__and__(right)
     elif isinstance(right, numbers.Number):
         left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
         return left * right
     else:
         assert isinstance(right, IndexOpsMixin)
         if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BooleanType):
             return left.__and__(right)
         else:
             left = transform_boolean_operand_to_numeric(left, spark_type=right.spark.data_type)
             return left * right
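As in pandas, multiplying two booleans stays boolean while bool-by-number upcasts to a numeric result. A sketch, assuming the pyspark.pandas API:

    import pyspark.pandas as ps

    psser = ps.Series([True, False])
    # bool * bool degenerates to a logical AND, as in pandas.
    print((psser * True).to_list())  # [True, False]
    # bool * number casts the boolean operand to a numeric column first.
    print((psser * 2).to_list())     # [2, 0]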
Example #12
 def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, int):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 SF.repeat(left.spark.column, right), field=left._internal.data_fields[0]
             ),
         )
     else:
         raise TypeError("Multiplication can not be applied to given types.")
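SF.repeat gives int * str the pandas meaning of tiling each string element. A sketch, assuming the pyspark.pandas API:

    import pyspark.pandas as ps

    psser = ps.Series(["ab", "c"])
    # Reflected multiplication repeats each string element that many times.
    print((3 * psser).to_list())  # ['ababab', 'ccc']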
Example #13
 def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, bytes):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(F.concat(SF.lit(right),
                                          left.spark.column)))
     else:
         raise TypeError(
             "Concatenation can not be applied to %s and the given type." %
             self.pretty_name)
Example #14
 def radd(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     _sanitize_list_like(right)
     if isinstance(right, bool):
         return left.__or__(right)
     elif isinstance(right, numbers.Number):
         left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
         return right + left
     else:
         raise TypeError(
             "Addition can not be applied to %s and the given type." % self.pretty_name
         )
Example #15
 def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, bool):
         return left.__and__(right)
     elif isinstance(right, numbers.Number):
         left = left.spark.transform(
             lambda scol: scol.cast(as_spark_type(type(right))))
         return right * left
     else:
         raise TypeError(
             "Multiplication can not be applied to %s and the given type." %
             self.pretty_name)
Example #16
 def rmul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, bool):
         return left.__and__(right)
     elif isinstance(right, numbers.Number):
         left = transform_boolean_operand_to_numeric(
             left, spark_type=as_spark_type(type(right)))
         return right * left
     else:
         raise TypeError(
             "Multiplication can not be applied to %s and the given type." %
             self.pretty_name)
Example #17
    def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        if not is_valid_operand_for_numeric_arithmetic(right):
            raise TypeError(
                "Addition can not be applied to %s and the given type." %
                self.pretty_name)

        if isinstance(right, bool):
            return left.__or__(right)
        elif isinstance(right, numbers.Number):
            left = left.spark.transform(
                lambda scol: scol.cast(as_spark_type(type(right))))
            return left + right
        else:
            assert isinstance(right, IndexOpsMixin)
            if isinstance(right, IndexOpsMixin) and isinstance(
                    right.spark.data_type, BooleanType):
                return left.__or__(right)
            else:
                left = transform_boolean_operand_to_numeric(
                    left, right.spark.data_type)
                return left + right
Example #18
def _as_bool_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    spark_type = BooleanType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
            index_ops.spark.column.cast(spark_type)
        )
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
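The two branches can be exercised directly in PySpark. A sketch, assuming an active SparkSession named spark:

    from pyspark.sql import functions as F
    from pyspark.sql.types import BooleanType

    df = spark.createDataFrame([(1,), (0,), (None,)], ["v"])
    # Non-extension path: nulls collapse to False.
    plain = F.when(F.col("v").isNull(), F.lit(False)).otherwise(
        F.col("v").cast(BooleanType())
    )
    # Extension path: a bare cast keeps the null.
    nullable = F.col("v").cast(BooleanType())
    df.select(plain.alias("plain_bool"), nullable.alias("nullable_bool")).show()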
Example #19
 def add(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, str):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 F.concat(left.spark.column, SF.lit(right)), field=left._internal.data_fields[0]
             ),
         )
     elif isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, StringType):
         return column_op(F.concat)(left, right)
     else:
         raise TypeError("Addition can not be applied to given types.")
Example #20
 def rsub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     _sanitize_list_like(right)
     # Note that timestamp subtraction casts the arguments to integers. This mimics
     # pandas' behavior, except that pandas returns 'timedelta64[ns]' from a
     # 'datetime64[ns]' subtraction while the result here is in integer seconds.
     msg = (
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.")
     if isinstance(right, datetime.datetime):
         warnings.warn(msg, UserWarning)
         return cast(
             SeriesOrIndex,
             left._with_new_scol(
                 self._cast_spark_column_timestamp_to_long(SF.lit(right)) -
                 left.astype("long").spark.column,
                 field=left._internal.data_fields[0].copy(
                     dtype=np.dtype("int64"), spark_type=LongType()),
             ),
         )
     else:
         raise TypeError(
             "Datetime subtraction can only be applied to datetime series.")
Example #21
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            # Spark cannot cast binary to boolean directly, so cast the binary
            # column to str first and then cast the result to boolean.
            return index_ops.astype(str).astype(bool)
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #22
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, NumericType):
            from pyspark.pandas.internal import InternalField

            scol = self._cast_spark_column_timestamp_to_long(index_ops.spark.column).cast(
                spark_type
            )
            return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
        else:
            return super(DatetimeNTZOps, self).astype(index_ops, dtype)
Example #23
 def mul(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     if isinstance(right, int):
         return cast(
             SeriesOrIndex,
             left._with_new_scol(SF.repeat(left.spark.column, right),
                                 field=left._internal.data_fields[0]),
         )
     elif (isinstance(right, IndexOpsMixin)
           and isinstance(right.spark.data_type, IntegralType)
           and not isinstance(right.dtype, CategoricalDtype)):
         return column_op(SF.repeat)(left, right)
     else:
         raise TypeError(
             "Multiplication can not be applied to given types.")
Example #24
def _as_bool_type(index_ops: IndexOpsLike,
                  dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(BooleanType())
    else:
        scol = F.when(index_ops.spark.column.isNull(),
                      SF.lit(False)).otherwise(
                          index_ops.spark.column.cast(BooleanType()))
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
Example #25
def _as_string_type(
    index_ops: IndexOpsLike, dtype: Union[str, type, Dtype], *, null_str: str = str(None)
) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column. Note that `null_str` is for non-extension dtypes only.
    """
    spark_type = StringType()
    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(spark_type)
    else:
        casted = index_ops.spark.column.cast(spark_type)
        scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol, field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type)
    )
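For non-extension dtypes, null_str controls how missing values are rendered; fractional ops, for instance, pass str(np.nan) so that float nulls stringify as 'nan' (see Example #6). A sketch, assuming the pyspark.pandas API:

    import pyspark.pandas as ps

    psser = ps.Series([1.0, None])
    # The null is replaced by the literal null_str ("nan" here), as in pandas.
    print(psser.astype(str).to_list())  # ['1.0', 'nan']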
Example #26
def _as_other_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype],
                   spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to a `dtype` (`spark_type`) that needs no pre-processing.

    Destination types that need pre-processing: CategoricalDtype, BooleanType, and StringType.
    """
    from pyspark.pandas.internal import InternalField

    need_pre_process = (isinstance(dtype, CategoricalDtype)
                        or isinstance(spark_type, BooleanType)
                        or isinstance(spark_type, StringType))
    assert not need_pre_process, "Pre-processing is needed before the type casting."

    scol = index_ops.spark.column.cast(spark_type)
    return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
Example #27
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return index_ops._with_new_scol(
                index_ops.spark.column.isNotNull(),
                field=index_ops._internal.data_fields[0].copy(
                    dtype=np.dtype(bool),
                    spark_type=spark_type,
                    nullable=False),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
Example #28
def _as_string_type(index_ops: IndexOpsLike,
                    dtype: Union[str, type, Dtype],
                    *,
                    null_str: str = str(None)) -> IndexOpsLike:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column.
    """
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(StringType())
    else:
        casted = index_ops.spark.column.cast(StringType())
        scol = F.when(index_ops.spark.column.isNull(),
                      null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
Example #29
 def sub(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
     # Note that timestamp subtraction casts the arguments to integers. This mimics
     # pandas' behavior, except that pandas returns 'timedelta64[ns]' from a
     # 'datetime64[ns]' subtraction while the result here is in integer seconds.
     msg = (
         "Note that there is a behavior difference of timestamp subtraction. "
         "The timestamp subtraction returns an integer in seconds, "
         "whereas pandas returns 'timedelta64[ns]'.")
     if isinstance(right, IndexOpsMixin) and isinstance(
             right.spark.data_type, TimestampType):
         warnings.warn(msg, UserWarning)
         return left.astype("long") - right.astype("long")
     elif isinstance(right, datetime.datetime):
         warnings.warn(msg, UserWarning)
         return cast(
             SeriesOrIndex,
             left.spark.transform(lambda scol: scol.astype("long") - SF.lit(
                 right).cast(as_spark_type("long"))),
         )
     else:
         raise TypeError(
             "datetime subtraction can only be applied to datetime series.")
Example #30
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                    F.length(index_ops.spark.column) > 0
                )
            return index_ops._with_new_scol(
                scol,
                field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
            )
        elif isinstance(spark_type, StringType):
            null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
            return _as_string_type(index_ops, dtype, null_str=null_str)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
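For the non-extension bool target this follows Python string truthiness: non-empty strings are True, empty strings are False, and nulls also become False. A sketch, assuming the pyspark.pandas API:

    import pyspark.pandas as ps

    psser = ps.Series(["x", "", None])
    # Truthiness via F.length(...) > 0; nulls map to False for plain bool.
    print(psser.astype(bool).to_list())  # [True, False, False]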