Example #1
    def test_data_type_ops(self):
        _mock_spark_type = DataType()
        _mock_dtype = ExtensionDtype()
        _mappings = (
            (CategoricalDtype(), _mock_spark_type, CategoricalOps),
            (_mock_dtype, DecimalType(), DecimalOps),
            (_mock_dtype, FractionalType(), FractionalOps),
            (_mock_dtype, IntegralType(), IntegralOps),
            (_mock_dtype, StringType(), StringOps),
            (_mock_dtype, BooleanType(), BooleanOps),
            (_mock_dtype, TimestampType(), DatetimeOps),
            (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
            (_mock_dtype, DateType(), DateOps),
            (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
            (_mock_dtype, BinaryType(), BinaryOps),
            (_mock_dtype, ArrayType(StringType()), ArrayOps),
            (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
            (_mock_dtype, StructType(), StructOps),
            (_mock_dtype, NullType(), NullOps),
            (_mock_dtype, UserDefinedType(), UDTOps),
        )
        for _dtype, _spark_type, _ops in _mappings:
            self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

        _unknown_spark_type = _mock_spark_type
        self.assertRaises(TypeError, DataTypeOps, BooleanType(),
                          _unknown_spark_type)
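
For context, DataTypeOps here is the pandas-on-Spark factory that picks an Ops subclass from a (pandas dtype, Spark type) pair. A minimal standalone sketch of that dispatch (the import path is the pyspark.pandas one; the printed name is the expected outcome, not something the test above asserts):

    import pandas as pd
    from pyspark.sql.types import BooleanType
    from pyspark.pandas.data_type_ops.base import DataTypeOps

    # DataTypeOps.__new__ inspects the Spark type (and, for categoricals,
    # the pandas dtype) and returns an instance of the matching Ops subclass.
    ops = DataTypeOps(pd.Series([True]).dtype, BooleanType())
    print(type(ops).__name__)  # expected: BooleanOps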
Example #2
    def test_create_data_frame_to_pandas_timestamp_ntz(self):
        # SPARK-36626: Test TimestampNTZ in createDataFrame and toPandas
        with self.sql_conf({"spark.sql.session.timeZone": "America/Los_Angeles"}):
            origin = pd.DataFrame({"a": [datetime.datetime(2012, 2, 2, 2, 2, 2)]})
            df = self.spark.createDataFrame(
                origin, schema=StructType([StructField("a", TimestampNTZType(), True)]))
            df.selectExpr("assert_true('2012-02-02 02:02:02' == CAST(a AS STRING))").collect()

            pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
            assert_frame_equal(origin, pdf)
            assert_frame_equal(pdf, pdf_arrow)
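
The assertion above is the crux: a TIMESTAMP_NTZ column stores a wall-clock value, so the session time zone set via spark.sql.session.timeZone does not shift what CAST(a AS STRING) renders. A small standalone sketch of the same property (assumes a SparkSession named spark; the zone names are arbitrary):

    import datetime
    from pyspark.sql.types import StructType, StructField, TimestampNTZType

    df = spark.createDataFrame(
        [(datetime.datetime(2012, 2, 2, 2, 2, 2),)],
        StructType([StructField("a", TimestampNTZType())]),
    )
    for tz in ("America/Los_Angeles", "UTC", "Asia/Tokyo"):
        spark.conf.set("spark.sql.session.timeZone", tz)
        # The rendered string stays '2012-02-02 02:02:02' for every zone.
        df.selectExpr("CAST(a AS STRING)").show(truncate=False)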
Example #3
    def _to_pandas(self):
        from datetime import datetime, date, timedelta

        schema = (
            StructType()
            .add("a", IntegerType())
            .add("b", StringType())
            .add("c", BooleanType())
            .add("d", FloatType())
            .add("dt", DateType())
            .add("ts", TimestampType())
            .add("ts_ntz", TimestampNTZType())
            .add("dt_interval", DayTimeIntervalType())
        )
        data = [
            (
                1,
                "foo",
                True,
                3.0,
                date(1969, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                timedelta(days=1),
            ),
            (2, "foo", True, 5.0, None, None, None, None),
            (
                3,
                "bar",
                False,
                -1.0,
                date(2012, 3, 3),
                datetime(2012, 3, 3, 3, 3, 3),
                datetime(2012, 3, 3, 3, 3, 3),
                timedelta(hours=-1, milliseconds=421),
            ),
            (
                4,
                "bar",
                False,
                6.0,
                date(2100, 4, 4),
                datetime(2100, 4, 4, 4, 4, 4),
                datetime(2100, 4, 4, 4, 4, 4),
                timedelta(microseconds=123),
            ),
        ]
        df = self.spark.createDataFrame(data, schema)
        return df.toPandas()
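
As a hedged note on what the returned frame looks like: with the usual PySpark conversion, the ts and ts_ntz columns come back as datetime64[ns], dt_interval as timedelta64[ns], and dt as an object column of datetime.date values. A quick standalone check of the timestamp and interval mapping (assumes a SparkSession named spark):

    from datetime import datetime, timedelta
    from pyspark.sql.types import (
        StructType, StructField, TimestampNTZType, DayTimeIntervalType)

    pdf = spark.createDataFrame(
        [(datetime(1969, 1, 1, 1, 1, 1), timedelta(days=1))],
        StructType([
            StructField("ts_ntz", TimestampNTZType()),
            StructField("dt_interval", DayTimeIntervalType()),
        ]),
    ).toPandas()
    print(pdf.dtypes)  # expected: datetime64[ns] and timedelta64[ns]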
Example #4
    def test_udf_timestamp_ntz(self):
        # SPARK-36626: Test TimestampNTZ in Python UDF
        @udf(TimestampNTZType())
        def noop(x):
            assert x == datetime.datetime(1970, 1, 1, 0, 0)
            return x

        with self.sql_conf({"spark.sql.session.timeZone": "Pacific/Honolulu"}):
            df = self.spark.createDataFrame(
                [(datetime.datetime(1970, 1, 1, 0, 0),)], schema="dt timestamp_ntz"
            ).select(noop("dt").alias("dt"))

            df.selectExpr("assert_true('1970-01-01 00:00:00' == CAST(dt AS STRING))").collect()
            self.assertEqual(df.schema[0].dataType.typeName(), "timestamp_ntz")
            self.assertEqual(df.first()[0], datetime.datetime(1970, 1, 1, 0, 0))
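
The assert inside noop works because a TIMESTAMP_NTZ value reaches a Python UDF as a naive datetime.datetime (tzinfo is None), untouched by the session time zone. A standalone variation on the same idea (assumes a SparkSession named spark; add_hour is a made-up name for this sketch):

    import datetime
    from pyspark.sql.functions import udf
    from pyspark.sql.types import TimestampNTZType

    @udf(TimestampNTZType())
    def add_hour(x):
        assert x.tzinfo is None  # NTZ values arrive as naive datetimes
        return x + datetime.timedelta(hours=1)

    df = spark.createDataFrame(
        [(datetime.datetime(1970, 1, 1),)], "dt timestamp_ntz")
    df.select(add_hour("dt")).show(truncate=False)  # expected 1970-01-01 01:00:00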
Example #5
    def _to_pandas(self):
        from datetime import datetime, date

        schema = (
            StructType()
            .add("a", IntegerType())
            .add("b", StringType())
            .add("c", BooleanType())
            .add("d", FloatType())
            .add("dt", DateType())
            .add("ts", TimestampType())
            .add("ts_ntz", TimestampNTZType())
        )
        data = [
            (1, "foo", True, 3.0, date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1),
             datetime(1969, 1, 1, 1, 1, 1)),
            (2, "foo", True, 5.0, None, None, None),
            (3, "bar", False, -1.0, date(2012, 3, 3), datetime(2012, 3, 3, 3, 3, 3),
             datetime(2012, 3, 3, 3, 3, 3)),
            (4, "bar", False, 6.0, date(2100, 4, 4), datetime(2100, 4, 4, 4, 4, 4),
             datetime(2100, 4, 4, 4, 4, 4)),
        ]
        df = self.spark.createDataFrame(data, schema)
        return df.toPandas()
Example #6
    def test_toPandas_empty_df_arrow_enabled(self):
        # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
        # when arrow is enabled
        from datetime import date
        from decimal import Decimal

        schema = StructType([
            StructField("a", StringType(), True),
            StructField("a", IntegerType(), True),
            StructField("c", TimestampType(), True),
            StructField("d", NullType(), True),
            StructField("e", LongType(), True),
            StructField("f", FloatType(), True),
            StructField("g", DateType(), True),
            StructField("h", BinaryType(), True),
            StructField("i", DecimalType(38, 18), True),
            StructField("k", TimestampNTZType(), True),
            StructField("L", DayTimeIntervalType(0, 3), True),
        ])
        df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(),
                                        schema=schema)
        non_empty_df = self.spark.createDataFrame(
            [(
                "a",
                1,
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                None,
                10,
                0.2,
                date(1969, 1, 1),
                bytearray(b"a"),
                Decimal("2.0"),
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                datetime.timedelta(microseconds=123),
            )],
            schema=schema,
        )

        pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
        pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(
            non_empty_df)
        assert_frame_equal(pdf, pdf_arrow)
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
        self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
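
The comparison against non_empty_df is the whole point of SPARK-30537: with Arrow enabled, an empty frame should come back with the schema-derived dtypes rather than all-object columns. A minimal sketch of that behaviour (assumes a SparkSession named spark):

    from pyspark.sql.types import StructType, StructField, LongType

    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
    empty = spark.createDataFrame([], StructType([StructField("e", LongType())]))
    print(empty.toPandas().dtypes)  # expected: int64, not object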
Example #7
def from_arrow_type(at: "pa.DataType",
                    prefer_timestamp_ntz: bool = False) -> DataType:
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()  # type: DataType
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = MapType(from_arrow_type(at.key_type),
                             from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
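
A quick usage sketch of the function above (assumes pyarrow is installed); note that prefer_timestamp_ntz only matters for Arrow timestamps without a time zone:

    import pyarrow as pa

    print(from_arrow_type(pa.int32()))          # IntegerType()
    print(from_arrow_type(pa.timestamp("us")))  # TimestampType()
    print(from_arrow_type(pa.timestamp("us"),
                          prefer_timestamp_ntz=True))  # TimestampNTZType()
    print(from_arrow_type(pa.timestamp("us", tz="UTC"),
                          prefer_timestamp_ntz=True))  # TimestampType()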