def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, NumericType):
        from pyspark.pandas.internal import InternalField

        # Convert the timestamp to epoch-based long values before casting,
        # so the numeric result is well-defined for every numeric target type.
        scol = self._cast_spark_column_timestamp_to_long(index_ops.spark.column).cast(spark_type)
        return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
    else:
        return super(DatetimeNTZOps, self).astype(index_ops, dtype)
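# Usage sketch (illustrative, not part of the source above; assumes pyspark is
# installed and a SparkSession can be started). A numeric astype on a datetime
# series goes through the timestamp-to-long conversion; note that Spark's cast
# yields epoch-based values rather than pandas' nanosecond integers.
import pandas as pd
import pyspark.pandas as ps

psser = ps.Series([pd.Timestamp("1970-01-01 00:00:01")])
print(psser.astype(int))  # epoch-based long, e.g. 1 for one second past the epoch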
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, _ = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None:
        return index_ops.copy()

    categories = cast(CategoricalDtype, index_ops.dtype).categories
    if len(categories) == 0:
        scol = SF.lit(None)
    else:
        # Build a literal map {code: category} and look each stored code up,
        # decoding the categorical back to its original values.
        kvs = chain(
            *[(SF.lit(code), SF.lit(category)) for code, category in enumerate(categories)]
        )
        map_scol = F.create_map(*kvs)
        scol = map_scol.getItem(index_ops.spark.column)
    return index_ops._with_new_scol(scol).astype(dtype)
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype) and dtype.categories is None:
        return index_ops.copy()

    categories = index_ops.dtype.categories
    if len(categories) == 0:
        scol = F.lit(None)
    else:
        kvs = chain(
            *[(F.lit(code), F.lit(category)) for code, category in enumerate(categories)]
        )
        map_scol = F.create_map(*kvs)
        scol = map_scol.getItem(index_ops.spark.column)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0])
    ).astype(dtype)
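# Usage sketch for the categorical decoding above (illustrative; assumes an
# active SparkSession): astype on a categorical series maps the internally
# stored integer codes back to category values through the literal map.
import pyspark.pandas as ps

psser = ps.Series(["a", "b", "a"], dtype="category")
print(psser.astype(str))  # codes 0/1 are decoded back to "a"/"b"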
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        # Non-null timestamps become True and nulls become False, so the
        # resulting column is never nullable.
        return index_ops._with_new_scol(
            index_ops.spark.column.isNotNull(),
            field=index_ops._internal.data_fields[0].copy(
                dtype=np.dtype(bool), spark_type=spark_type, nullable=False
            ),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(pd.NaT))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)

    if isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            # Mirror pandas: any non-empty string is truthy, nulls become False.
            scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise(
                F.length(index_ops.spark.column) > 0
            )
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype)
    else:
        return _as_other_type(index_ops, dtype, spark_type)
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)

    if isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            scol = F.when(index_ops.spark.column.isNull(), SF.lit(False)).otherwise(
                F.length(index_ops.spark.column) > 0
            )
        return index_ops._with_new_scol(
            scol,
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    elif isinstance(spark_type, StringType):
        # The extension ("string") dtype renders missing values as pd.NA,
        # while the plain object dtype renders them as None.
        null_str = str(pd.NA) if isinstance(self, StringExtensionOps) else str(None)
        return _as_string_type(index_ops, dtype, null_str=null_str)
    else:
        return _as_other_type(index_ops, dtype, spark_type)
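# Usage sketch (illustrative; assumes an active SparkSession): for plain
# non-extension dtypes the cast mirrors Python truthiness of strings, with
# nulls mapped to False.
import pyspark.pandas as ps

psser = ps.Series(["x", "", None])
print(psser.astype(bool))  # True, False, False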
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        if isinstance(dtype, extension_dtypes):
            scol = index_ops.spark.column.cast(spark_type)
        else:
            # Follow pandas semantics: NaN is truthy, so both nulls and NaNs
            # map to True when casting floats to plain bool.
            scol = F.when(
                index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
                SF.lit(True),
            ).otherwise(index_ops.spark.column.cast(spark_type))
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    elif isinstance(spark_type, StringType):
        return _as_string_type(index_ops, dtype, null_str=str(np.nan))
    else:
        return _as_other_type(index_ops, dtype, spark_type)
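# Usage sketch (illustrative; assumes an active SparkSession): note the NaN
# branch above, which follows pandas in treating NaN as truthy for plain bool.
import numpy as np
import pyspark.pandas as ps

psser = ps.Series([1.0, 0.0, np.nan])
print(psser.astype(bool))  # True, False, True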
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            # seems like a pandas bug?
            scol = F.when(index_ops.spark.column.isNull(), str(pd.NaT)).otherwise(
                index_ops.spark.column.cast(spark_type)
            )
        else:
            null_str = str(pd.NaT)
            casted = index_ops.spark.column.cast(spark_type)
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=index_ops._internal.data_fields[0].copy(dtype=dtype, spark_type=spark_type),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
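# Usage sketch (illustrative; assumes an active SparkSession): missing
# datetimes are rendered as the string "NaT" to match pandas' repr, while
# non-null values follow Spark's timestamp-to-string cast format.
import pandas as pd
import pyspark.pandas as ps

psser = ps.Series([pd.Timestamp("2022-01-01"), None])
print(psser.astype(str))  # e.g. "2022-01-01 00:00:00" and "NaT"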
def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    dtype, spark_type = pandas_on_spark_type(dtype)

    if isinstance(dtype, CategoricalDtype):
        return _as_categorical_type(index_ops, dtype, spark_type)
    elif isinstance(spark_type, BooleanType):
        return _as_bool_type(index_ops, dtype)
    elif isinstance(spark_type, StringType):
        if isinstance(dtype, extension_dtypes):
            # Extension dtypes keep nulls as nulls instead of stringifying them.
            scol = F.when(
                index_ops.spark.column.isNotNull(),
                F.when(index_ops.spark.column, "True").otherwise("False"),
            )
        else:
            null_str = str(None)
            casted = F.when(index_ops.spark.column, "True").otherwise("False")
            scol = F.when(index_ops.spark.column.isNull(), null_str).otherwise(casted)
        return index_ops._with_new_scol(
            scol.alias(index_ops._internal.data_spark_column_names[0]),
            field=InternalField(dtype=dtype),
        )
    else:
        return _as_other_type(index_ops, dtype, spark_type)
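# Usage sketch (illustrative; assumes an active SparkSession): booleans are
# stringified with Python-style capitalization rather than Spark's lowercase
# true/false, matching pandas output.
import pyspark.pandas as ps

psser = ps.Series([True, False])
print(psser.astype(str))  # "True", "False"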
def test_as_spark_type_koalas_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        pandas_on_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        pandas_on_spark_type(np.dtype("object"))
def test_as_spark_type_pandas_on_spark_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type))

        if isinstance(numpy_or_python_type, CategoricalDtype):
            # Nested CategoricalDtype is not yet supported.
            continue

        self.assertEqual(as_spark_type(List[numpy_or_python_type]), ArrayType(spark_type))
        self.assertEqual(
            pandas_on_spark_type(List[numpy_or_python_type]),
            (np.dtype("object"), ArrayType(spark_type)),
        )

        # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
            import numpy.typing as ntp

            self.assertEqual(
                as_spark_type(ntp.NDArray[numpy_or_python_type]), ArrayType(spark_type)
            )
            self.assertEqual(
                pandas_on_spark_type(ntp.NDArray[numpy_or_python_type]),
                (np.dtype("object"), ArrayType(spark_type)),
            )

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        pandas_on_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        pandas_on_spark_type(np.dtype("object"))
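# Quick check of the mappings exercised by the tests above (illustrative;
# import paths follow pyspark.pandas' typedef module):
from pyspark.pandas.typedef import as_spark_type, pandas_on_spark_type

print(as_spark_type(int))           # LongType()
print(pandas_on_spark_type(float))  # (dtype('float64'), DoubleType())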