def test_assert_true(self):
    from pyspark.sql.functions import assert_true

    df = self.spark.range(3)

    # assert_true returns NULL for every row where the condition holds.
    self.assertEqual(
        df.select(assert_true(df.id < 3)).toDF("val").collect(),
        [Row(val=None), Row(val=None), Row(val=None)],
    )

    with self.assertRaises(Py4JJavaError) as cm:
        df.select(assert_true(df.id < 2, "too big")).toDF("val").collect()
    self.assertIn("java.lang.RuntimeException", str(cm.exception))
    self.assertIn("too big", str(cm.exception))

    with self.assertRaises(Py4JJavaError) as cm:
        df.select(assert_true(df.id < 2, df.id * 1e6)).toDF("val").collect()
    self.assertIn("java.lang.RuntimeException", str(cm.exception))
    self.assertIn("2000000", str(cm.exception))

    with self.assertRaises(TypeError) as cm:
        df.select(assert_true(df.id < 2, 5))
    self.assertEqual(
        "errMsg should be a Column or a str, got <class 'int'>", str(cm.exception)
    )
def test_create_data_frame_to_pandas_day_time_internal(self):
    # SPARK-37279: Test DayTimeInterval in createDataFrame and toPandas
    origin = pd.DataFrame({"a": [datetime.timedelta(microseconds=123)]})
    df = self.spark.createDataFrame(origin)
    df.select(
        assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.a.cast("string"))
    ).collect()

    pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
    assert_frame_equal(origin, pdf)
    assert_frame_equal(pdf, pdf_arrow)
def test_udf_daytime_interval(self):
    # SPARK-37277: Support DayTimeIntervalType in Python UDF
    @udf(DayTimeIntervalType(DayTimeIntervalType.DAY, DayTimeIntervalType.SECOND))
    def noop(x):
        assert x == datetime.timedelta(microseconds=123)
        return x

    df = self.spark.createDataFrame(
        [(datetime.timedelta(microseconds=123),)], schema="td interval day to second"
    ).select(noop("td").alias("td"))

    df.select(
        assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.td.cast("string"))
    ).collect()
    self.assertEqual(df.schema[0].dataType.simpleString(), "interval day to second")
    self.assertEqual(df.first()[0], datetime.timedelta(microseconds=123))
def test_pandas_udf_day_time_interval_type(self):
    # SPARK-37277: Test DayTimeIntervalType in pandas UDF
    import pandas as pd

    @pandas_udf(DayTimeIntervalType(DayTimeIntervalType.DAY, DayTimeIntervalType.SECOND))
    def noop(s: pd.Series) -> pd.Series:
        assert s.iloc[0] == datetime.timedelta(microseconds=123)
        return s

    df = self.spark.createDataFrame(
        [(datetime.timedelta(microseconds=123),)], schema="td interval day to second"
    ).select(noop("td").alias("td"))

    df.select(
        assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.td.cast("string"))
    ).collect()
    self.assertEqual(df.schema[0].dataType.simpleString(), "interval day to second")
    self.assertEqual(df.first()[0], datetime.timedelta(microseconds=123))
def test_shiftrightunsigned(self):
    # The deprecated camelCase alias must produce the same result as the
    # snake_case function.
    self.spark.range(10).select(
        assert_true(shiftRightUnsigned(col("id"), 2) == shiftrightunsigned(col("id"), 2))
    ).collect()
def test_shiftleft(self):
    # The deprecated camelCase alias must produce the same result as the
    # snake_case function.
    self.spark.range(10).select(
        assert_true(shiftLeft(col("id"), 2) == shiftleft(col("id"), 2))
    ).collect()
def test_sum_distinct(self):
    # The deprecated camelCase alias must produce the same result as the
    # snake_case function.
    self.spark.range(10).select(
        assert_true(sum_distinct(col("id")) == sumDistinct(col("id")))
    ).collect()