def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            if isinstance(index_ops.spark.data_type, (FloatType, DoubleType)):
                # NaN and null are truthy for float/double columns, mirroring pandas.
                scol = F.when(
                    index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                    F.lit(True),
                ).otherwise(index_ops.spark.column.cast(spark_type))
            else:
                # DecimalType: nulls become False.
                scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise(
                    index_ops.spark.column.cast(spark_type)
                )
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
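# Rough usage sketch for the float-to-bool cast above (assumption: pyspark.pandas is
# imported as ps and a Spark session is available). NaN and null are treated as truthy
# for float/double columns, matching pandas; the output values are illustrative only.
import pyspark.pandas as ps

psser = ps.Series([0.0, 1.0, float("nan"), None])
psser.astype(bool)
# Expected (illustrative): False, True, True, True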
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            # Extension string dtype keeps missing values as nulls.
            scol = F.when(
                index_ops.spark.column.isNotNull(),
                F.when(index_ops.spark.column, "True").otherwise("False"),
            )
        else:
            # Plain string dtype renders missing values as str(None), i.e. "None".
            null_str = str(None)
            casted = F.when(index_ops.spark.column, "True").otherwise("False")
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
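# Hedged example of the bool-to-string path above (assumes pyspark.pandas as ps).
# With the plain object dtype, missing values render as the literal "None"; with the
# "string" extension dtype they are expected to stay missing instead.
import pyspark.pandas as ps

psser = ps.Series([True, False, None])
psser.astype(str)
# Expected (illustrative): "True", "False", "None"
psser.astype("string")
# Expected (illustrative): "True", "False", <NA>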
def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
    return index_ops._with_new_scol(
        index_ops.spark.column.isNull(),
        field=index_ops._internal.data_fields[0].copy(
            dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
        ),
    )
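# Minimal sketch of the behavior wired up above (assumes pyspark.pandas as ps):
# isnull() flags missing entries and always yields a non-nullable bool column.
import pyspark.pandas as ps

ps.Series(["a", None, "c"]).isnull()
# Expected (illustrative): False, True, False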
def mul(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
    if not is_valid_operand_for_numeric_arithmetic(right):
        raise TypeError(
            "Multiplication can not be applied to %s and the given type." % self.pretty_name
        )

    if isinstance(right, bool):
        return left.__and__(right)
    elif isinstance(right, numbers.Number):
        left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
        return left * right
    else:
        assert isinstance(right, IndexOpsMixin)
        if isinstance(right.spark.data_type, BooleanType):
            return left.__and__(right)
        else:
            left = transform_boolean_operand_to_numeric(left, right.spark.data_type)
            return left * right
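# Hedged illustration of the multiplication rules above (assumes pyspark.pandas as ps):
# bool * bool falls back to logical AND, while bool * number casts the booleans to
# the operand's numeric type first. Results shown are illustrative.
import pyspark.pandas as ps

psser = ps.Series([True, False, True])
psser * 3
# Expected (illustrative): 3, 0, 3
psser * True
# Expected (illustrative): True, False, True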
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype) and dtype.categories is None:
        return index_ops.copy()

    categories = index_ops.dtype.categories
    if len(categories) == 0:
        scol = F.lit(None)
    else:
        # Map each category code stored in the Spark column back to its category value.
        kvs = chain(
            *[(F.lit(code), F.lit(category)) for code, category in enumerate(categories)]
        )
        map_scol = F.create_map(*kvs)
        scol = map_scol.getItem(index_ops.spark.column)

    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0])
    ).astype(dtype)
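# Sketch of the code-to-category mapping above (assumes pyspark.pandas as ps).
# A categorical column is stored as integer codes, so casting away from 'category'
# rebuilds the original values through the generated map column.
import pyspark.pandas as ps

psser = ps.Series(["a", "b", "a"], dtype="category")
psser.astype(str)
# Expected (illustrative): "a", "b", "a"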
def radd(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
    if isinstance(right, bytes):
        return cast(
            IndexOpsLike, left._with_new_scol(F.concat(F.lit(right), left.spark.column))
        )
    else:
        raise TypeError(
            "Concatenation can not be applied to %s and the given type." % self.pretty_name
        )
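# Hedged example of the reflected bytes concatenation above (assumes pyspark.pandas
# as ps and a binary-typed series).
import pyspark.pandas as ps

psser = ps.Series([b"1", b"2"])
b"0" + psser
# Expected (illustrative): b"01", b"02"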
def rmul(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
    if isinstance(right, bool):
        return left.__and__(right)
    elif isinstance(right, numbers.Number):
        left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
        return right * left
    else:
        raise TypeError(
            "Multiplication can not be applied to %s and the given type." % self.pretty_name
        )
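# Reflected variant of the rule above (assumes pyspark.pandas as ps): only scalar
# left operands reach rmul, so anything other than a bool or a number is rejected.
import pyspark.pandas as ps

3 * ps.Series([True, False])
# Expected (illustrative): 3, 0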
def radd(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
    if isinstance(right, str):
        return cast(
            IndexOpsLike,
            left._with_new_scol(F.concat(F.lit(right), left.spark.column)),  # TODO: dtype?
        )
    else:
        raise TypeError(
            "string addition can only be applied to string series or literals."
        )
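# Hedged example of the reflected string concatenation above (assumes pyspark.pandas as ps).
import pyspark.pandas as ps

"prefix_" + ps.Series(["a", "b"])
# Expected (illustrative): "prefix_a", "prefix_b"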
def sub(self, left: T_IndexOps, right: Any) -> IndexOpsLike:
    # Note that timestamp subtraction casts its arguments to integer. This is to mimic
    # pandas' behavior; pandas returns 'timedelta64[ns]' from 'datetime64[ns]' subtraction.
    msg = (
        "Note that there is a behavior difference of timestamp subtraction. "
        "The timestamp subtraction returns an integer in seconds, "
        "whereas pandas returns 'timedelta64[ns]'."
    )
    if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, TimestampType):
        warnings.warn(msg, UserWarning)
        return left.astype("long") - right.astype("long")
    elif isinstance(right, datetime.datetime):
        warnings.warn(msg, UserWarning)
        return cast(
            IndexOpsLike,
            left.spark.transform(
                lambda scol: scol.astype("long") - F.lit(right).cast(as_spark_type("long"))
            ),
        )
    else:
        raise TypeError("datetime subtraction can only be applied to datetime series.")
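# Rough sketch of the timestamp subtraction above (assumes pyspark.pandas as ps).
# Unlike pandas, the result is the difference in whole seconds as an integer, and a
# UserWarning flags the behavior difference.
import datetime
import pyspark.pandas as ps

psser = ps.Series([datetime.datetime(2021, 1, 1, 0, 0, 10)])
psser - datetime.datetime(2021, 1, 1, 0, 0, 0)
# Expected (illustrative): 10  (seconds, as a long), with a UserWarning emitted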
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            # Non-empty strings are truthy; nulls become False.
            scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                F.length(index_ops.spark.column) > 0
            )
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype)
    else:
        return _as_other_type(index_ops, dtype, spark_type)
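# Illustration of the string-to-bool cast above (assumes pyspark.pandas as ps):
# empty strings are falsy, any non-empty string is truthy, and nulls become False.
import pyspark.pandas as ps

ps.Series(["", "x", None]).astype(bool)
# Expected (illustrative): False, True, False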
def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
    return index_ops._with_new_scol(index_ops.spark.column.isNull())