def test_vectorized_udf_timestamps_respect_session_timezone(self):
    """Compare scalar pandas UDF timestamps with the session time zone ignored vs. honoured.

    The same two UDFs (an identity copy and one exposing ``Timestamp.value``)
    are evaluated with ``respectSessionTimeZone`` disabled and then enabled.
    The enabled run must equal the disabled run once the 3 hour offset used by
    the test has been added to the internal values.
    """
    schema = StructType([
        StructField("idx", LongType(), True),
        StructField("timestamp", TimestampType(), True)])
    rows = [
        (1, datetime(1969, 1, 1, 1, 1, 1)),
        (2, datetime(2012, 2, 2, 2, 2, 2)),
        (3, None),
        (4, datetime(2100, 3, 3, 3, 3, 3)),
    ]
    df = self.spark.createDataFrame(rows, schema=schema)

    f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())
    internal_value = pandas_udf(
        lambda series: series.apply(
            lambda stamp: stamp.value if stamp is not pd.NaT else None),
        LongType())

    def add_udf_columns(frame):
        # Attach the copied timestamp and its internal integer value.
        with_copy = frame.withColumn("tscopy", f_timestamp_copy(col("timestamp")))
        return with_copy.withColumn("internal_value", internal_value(col("timestamp")))

    timezone = "America/New_York"

    ignore_session_tz = {
        "spark.sql.execution.pandas.respectSessionTimeZone": False,
        "spark.sql.session.timeZone": timezone}
    with self.sql_conf(ignore_session_tz):
        df_la = add_udf_columns(df)
        result_la = df_la.select(col("idx"), col("internal_value")).collect()
        # Correct result_la by adjusting 3 hours difference between Los Angeles and New York
        diff = 3 * 60 * 60 * 1000 * 1000 * 1000
        result_la_corrected = df_la.select(
            col("idx"), col("tscopy"), col("internal_value") + diff).collect()

    respect_session_tz = {
        "spark.sql.execution.pandas.respectSessionTimeZone": True,
        "spark.sql.session.timeZone": timezone}
    with self.sql_conf(respect_session_tz):
        df_ny = add_udf_columns(df)
        result_ny = df_ny.select(
            col("idx"), col("tscopy"), col("internal_value")).collect()

        self.assertNotEqual(result_ny, result_la)
        self.assertEqual(result_ny, result_la_corrected)
def test_vectorized_udf_chained(self):
    """Chaining a ``+1`` pandas UDF into a ``-1`` pandas UDF must return the input rows."""
    from pyspark.sql.functions import pandas_udf, col

    df = self.spark.range(10)
    f = pandas_udf(lambda x: x + 1, LongType())
    g = pandas_udf(lambda x: x - 1, LongType())
    res = df.select(g(f(col('id'))))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_udf_wrong_arg(self):
    """Verify that malformed ``pandas_udf`` declarations raise the expected errors.

    Each case is expected to fail while the UDF is being declared; none of the
    functions defined below is ever called by this test.
    """
    with QuietTest(self.sc):
        # Return type string that cannot be parsed.
        with self.assertRaises(ParseException):
            @pandas_udf('blah')
            def foo(x):
                return x

        # Function type given without any return type.
        with self.assertRaisesRegexp(ValueError, 'Invalid returnType.*None'):
            @pandas_udf(functionType=PandasUDFType.SCALAR)
            def foo(x):
                return x

        # Function type that is not a PandasUDFType value.
        with self.assertRaisesRegexp(ValueError, 'Invalid functionType'):
            @pandas_udf('double', 100)
            def foo(x):
                return x

        # Zero-argument functions, both in call form and decorator form.
        with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'):
            pandas_udf(lambda: 1, LongType(), PandasUDFType.SCALAR)
        with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'):
            @pandas_udf(LongType(), PandasUDFType.SCALAR)
            def zero_with_type():
                return 1

        # A PandasUDFType value passed where the return type belongs.
        with self.assertRaisesRegexp(TypeError, 'Invalid returnType'):
            @pandas_udf(returnType=PandasUDFType.GROUPED_MAP)
            def foo(df):
                return df

        # Grouped map declared with a 'double' return type.
        with self.assertRaisesRegexp(TypeError, 'Invalid returnType'):
            @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP)
            def foo(df):
                return df

        # Grouped map function taking three arguments.
        with self.assertRaisesRegexp(ValueError, 'Invalid function'):
            @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP)
            def foo(k, v, w):
                return k
def test_vectorized_udf_unsupported_types(self):
    """Declaring a scalar pandas UDF with a MapType return type must raise NotImplementedError."""
    from pyspark.sql.functions import pandas_udf

    expected_message = 'Invalid returnType.*scalar Pandas UDF.*MapType'
    map_type = MapType(StringType(), IntegerType())
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            pandas_udf(lambda x: x, map_type)
def test_vectorized_udf_wrong_return_type(self):
    """A scalar pandas UDF declared with ``MapType(LongType(), LongType())`` must be rejected."""
    from pyspark.sql.functions import pandas_udf

    expected_message = 'Invalid returnType.*scalar Pandas UDF.*MapType'
    map_type = MapType(LongType(), LongType())
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            pandas_udf(lambda x: x * 1.0, map_type)
def test_vectorized_udf_struct_type(self):
    """Exercise scalar pandas UDFs that return, and that receive, struct columns.

    The struct return type is declared both as a ``StructType`` and as a DDL
    string. Passing a struct column through an identity UDF is expected to
    fail on pyarrow older than 0.10.0 and to succeed otherwise.
    """
    df = self.spark.range(10)
    return_type = StructType([
        StructField('id', LongType()),
        StructField('str', StringType())])

    def func(id):
        return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

    struct_col = struct(col('id'), col('id').cast('string').alias('str'))
    expected = df.select(struct_col.alias('struct')).collect()

    # Return type given as a StructType instance.
    f = pandas_udf(func, returnType=return_type)
    actual = df.select(f(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    # Return type given as a DDL-formatted string.
    g = pandas_udf(func, 'id: long, str: string')
    actual = df.select(g(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    # Struct column used as the UDF input.
    struct_f = pandas_udf(lambda x: x, return_type)
    actual = df.select(
        struct_f(struct(col('id'), col('id').cast('string').alias('str'))))

    if LooseVersion(pa.__version__) >= LooseVersion("0.10.0"):
        self.assertEqual(expected, actual.collect())
    else:
        with QuietTest(self.sc):
            from py4j.protocol import Py4JJavaError
            with self.assertRaisesRegexp(
                    Py4JJavaError,
                    'Unsupported type in conversion from Arrow'):
                self.assertEqual(expected, actual.collect())
def test_wrong_return_type(self):
    """A grouped map pandas UDF whose schema contains a map column must be rejected."""
    expected_message = 'Invalid returnType.*grouped map Pandas UDF.*MapType'
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            pandas_udf(
                lambda pdf: pdf,
                'id long, v map<int, int>',
                PandasUDFType.GROUPED_MAP)
def test_vectorized_udf_unsupported_types(self):
    """Scalar pandas UDFs must reject MapType and array-of-struct return types."""
    map_type = MapType(StringType(), IntegerType())
    array_of_struct = ArrayType(StructType([StructField('a', IntegerType())]))

    with QuietTest(self.sc):
        with self.assertRaisesRegexp(
                NotImplementedError,
                'Invalid returnType.*scalar Pandas UDF.*MapType'):
            pandas_udf(lambda x: x, map_type)
        with self.assertRaisesRegexp(
                NotImplementedError,
                'Invalid returnType.*scalar Pandas UDF.*ArrayType.StructType'):
            pandas_udf(lambda x: x, array_of_struct)
def test_vectorized_udf_complex(self):
    """Multi-column pandas UDFs must agree with the equivalent SQL expressions."""
    df = self.spark.range(10).select(
        col('id').cast('int').alias('a'),
        col('id').cast('int').alias('b'),
        col('id').cast('double').alias('c'))

    add = pandas_udf(lambda x, y: x + y, IntegerType())
    power2 = pandas_udf(lambda x: 2 ** x, IntegerType())
    mul = pandas_udf(lambda x, y: x * y, DoubleType())

    res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c')))
    expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected.collect(), res.collect())
def test_stopiteration_in_udf(self):
    """A StopIteration raised inside user code must surface as a Py4JJavaError.

    The same expectation is checked for a plain UDF, a scalar pandas UDF,
    grouped map pandas UDFs (one- and two-argument functions) and a grouped
    aggregate pandas UDF.
    """
    from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
    from py4j.protocol import Py4JJavaError

    def foo(x):
        raise StopIteration()

    def foofoo(x, y):
        raise StopIteration()

    exc_message = "Caught StopIteration thrown from user's code; failing the task"
    df = self.spark.range(0, 100)

    # plain udf (test for SPARK-23754)
    plain = df.withColumn('v', udf(foo)('id'))
    self.assertRaisesRegexp(Py4JJavaError, exc_message, plain.collect)

    # pandas scalar udf
    scalar = df.withColumn(
        'v', pandas_udf(foo, 'double', PandasUDFType.SCALAR)('id'))
    self.assertRaisesRegexp(Py4JJavaError, exc_message, scalar.collect)

    # pandas grouped map
    grouped_map_one_arg = df.groupBy('id').apply(
        pandas_udf(foo, df.schema, PandasUDFType.GROUPED_MAP))
    self.assertRaisesRegexp(Py4JJavaError, exc_message, grouped_map_one_arg.collect)

    grouped_map_two_args = df.groupBy('id').apply(
        pandas_udf(foofoo, df.schema, PandasUDFType.GROUPED_MAP))
    self.assertRaisesRegexp(Py4JJavaError, exc_message, grouped_map_two_args.collect)

    # pandas grouped agg
    grouped_agg = df.groupBy('id').agg(
        pandas_udf(foo, 'double', PandasUDFType.GROUPED_AGG)('id'))
    self.assertRaisesRegexp(Py4JJavaError, exc_message, grouped_agg.collect)
def test_mixed_scalar_udfs_followed_by_grouby_apply(self):
    """Mix a plain UDF and a scalar pandas UDF, then sum everything in one grouped map UDF.

    With v1 = 0..9, v2 = v1 + 1 and v3 = v1 + 2 the grand total is
    45 + 55 + 65 = 165.
    """
    df = self.spark.range(0, 10).toDF('v1')
    df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \
        .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1']))

    result = df.groupby() \
        .apply(pandas_udf(lambda x: pd.DataFrame([x.sum().sum()]),
                          'sum int',
                          PandasUDFType.GROUPED_MAP))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(result.collect()[0]['sum'], 165)
def test_unsupported_types(self):
    """Grouped map pandas UDFs must reject schemas containing an unsupported field type."""
    common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*'
    id_field = StructField('id', LongType(), True)
    rejected_fields = [
        StructField('map', MapType(StringType(), IntegerType())),
        StructField('arr_ts', ArrayType(TimestampType())),
        StructField('null', NullType()),
        StructField('struct', StructType([StructField('l', LongType())])),
    ]

    for rejected in rejected_fields:
        # Each schema pairs the 'id' field with exactly one rejected field.
        schema = StructType([StructField('id', LongType(), True), rejected])
        with QuietTest(self.sc):
            with self.assertRaisesRegexp(NotImplementedError, common_err_msg):
                pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
def test_vectorized_udf_basic(self):
    """Identity pandas UDFs over each basic column type must return the input unchanged."""
    from pyspark.sql.functions import pandas_udf, col, array

    df = self.spark.range(10).select(
        col('id').cast('string').alias('str'),
        col('id').cast('int').alias('int'),
        col('id').alias('long'),
        col('id').cast('float').alias('float'),
        col('id').cast('double').alias('double'),
        col('id').cast('decimal').alias('decimal'),
        col('id').cast('boolean').alias('bool'),
        array(col('id')).alias('array_long'))

    def f(x):
        # Identity function shared by every UDF below.
        return x

    str_f = pandas_udf(f, StringType())
    int_f = pandas_udf(f, IntegerType())
    long_f = pandas_udf(f, LongType())
    float_f = pandas_udf(f, FloatType())
    double_f = pandas_udf(f, DoubleType())
    decimal_f = pandas_udf(f, DecimalType())
    bool_f = pandas_udf(f, BooleanType())
    array_long_f = pandas_udf(f, ArrayType(LongType()))

    res = df.select(str_f(col('str')), int_f(col('int')),
                    long_f(col('long')), float_f(col('float')),
                    double_f(col('double')), decimal_f('decimal'),
                    bool_f(col('bool')), array_long_f('array_long'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_binary(self):
    """Binary columns with nulls: rejected on pyarrow < 0.10.0, round-tripped otherwise."""
    if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        with QuietTest(self.sc):
            with self.assertRaisesRegexp(
                    NotImplementedError,
                    'Invalid returnType.*scalar Pandas UDF.*BinaryType'):
                pandas_udf(lambda x: x, BinaryType())
    else:
        data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)]
        schema = StructType().add("binary", BinaryType())
        df = self.spark.createDataFrame(data, schema)
        str_f = pandas_udf(lambda x: x, BinaryType())
        res = df.select(str_f(col('binary')))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_nested_struct(self):
    """A scalar pandas UDF declared with a struct-inside-struct return type must be rejected."""
    inner_struct = StructType([
        StructField('foo', StringType()),
        StructField('bar', FloatType()),
    ])
    nested_type = StructType([
        StructField('id', IntegerType()),
        StructField('nested', inner_struct),
    ])

    with QuietTest(self.sc):
        with self.assertRaisesRegexp(
                Exception,
                'Invalid returnType with scalar Pandas UDFs'):
            pandas_udf(lambda x: x, returnType=nested_type)
def test_pandas_udf_nested_arrays(self):
    """A scalar pandas UDF may return an array of arrays of strings."""
    nested_string_array = ArrayType(ArrayType(StringType()))
    tokenize = pandas_udf(
        lambda s: s.apply(lambda text: [text.split(' ')]),
        nested_string_array)
    self.assertEqual(tokenize.returnType, ArrayType(ArrayType(StringType())))

    df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"])
    result = df.select(tokenize("vals").alias("hi"))

    expected_rows = [
        Row(hi=[[u'hi', u'boo']]),
        Row(hi=[[u'bye', u'boo']]),
    ]
    self.assertEqual(expected_rows, result.collect())
def scalar_pandas_udf_example(spark):
    """Demonstrate a scalar pandas UDF that multiplies two columns element-wise.

    The function is first run on a local pandas Series and the result printed,
    then applied through Spark to a one-column DataFrame and shown.

    :param spark: an existing SparkSession used to create the DataFrame.
    """
    # NOTE(review): the marker below looks like a documentation-extraction
    # delimiter; a matching "off" marker is not visible here - confirm before
    # moving code around it.
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
def test_vectorized_udf_dates(self):
    """Date columns, including a null, must survive a scalar pandas UDF unchanged.

    The values are checked twice: inside a UDF (``check_data`` returns a
    message for every mismatching row, or None) and on the driver after
    collecting the result.
    """
    schema = StructType().add("idx", LongType()).add("date", DateType())
    data = [(0, date(1969, 1, 1),),
            (1, date(2012, 2, 2),),
            (2, None,),
            (3, date(2100, 4, 4),),
            (4, date(2262, 4, 12),)]
    df = self.spark.createDataFrame(data, schema=schema)

    date_copy = pandas_udf(lambda t: t, returnType=DateType())
    df = df.withColumn("date_copy", date_copy(col("date")))

    @pandas_udf(returnType=StringType())
    def check_data(idx, date, date_copy):
        msgs = []
        is_equal = date.isnull()
        for i in range(len(idx)):
            # A row is fine when both sides are null or the dates match.
            if (is_equal[i] and data[idx[i]][1] is None) or \
                    date[i] == data[idx[i]][1]:
                msgs.append(None)
            else:
                msgs.append(
                    "date values are not equal (date='%s': data[%d][1]='%s')"
                    % (date[i], idx[i], data[idx[i]][1]))
        return pd.Series(msgs)

    result = df.withColumn("check_data",
                           check_data(col("idx"), col("date"), col("date_copy"))).collect()

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(data), len(result))
    for expected_row, row in zip(data, result):
        self.assertEqual(expected_row[1], row[1])  # "date" col
        self.assertEqual(expected_row[1], row[2])  # "date_copy" col
        self.assertIsNone(row[3])  # "check_data" col
def test_vectorized_udf_null_float(self):
    """A float column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(3.0,), (5.0,), (-1.0,), (None,)]
    schema = StructType().add("float", FloatType())
    df = self.spark.createDataFrame(data, schema)
    float_f = pandas_udf(lambda x: x, FloatType())
    res = df.select(float_f(col('float')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_int(self):
    """An int column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(None,), (2,), (3,), (4,)]
    schema = StructType().add("int", IntegerType())
    df = self.spark.createDataFrame(data, schema)
    int_f = pandas_udf(lambda x: x, IntegerType())
    res = df.select(int_f(col('int')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_decimal(self):
    """A decimal(38, 18) column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)]
    schema = StructType().add("decimal", DecimalType(38, 18))
    df = self.spark.createDataFrame(data, schema)
    decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18))
    res = df.select(decimal_f(col('decimal')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_boolean(self):
    """A boolean column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(True,), (True,), (None,), (False,)]
    schema = StructType().add("bool", BooleanType())
    df = self.spark.createDataFrame(data, schema)
    bool_f = pandas_udf(lambda x: x, BooleanType())
    res = df.select(bool_f(col('bool')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_byte(self):
    """A byte column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(None,), (2,), (3,), (4,)]
    schema = StructType().add("byte", ByteType())
    df = self.spark.createDataFrame(data, schema)
    byte_f = pandas_udf(lambda x: x, ByteType())
    res = df.select(byte_f(col('byte')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_array(self):
    """An array<int> column containing nulls must pass through an identity pandas UDF unchanged."""
    data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)]
    array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
    df = self.spark.createDataFrame(data, schema=array_schema)
    array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()))
    result = df.select(array_f(col('array')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), result.collect())
def test_vectorized_udf_null_string(self):
    """A string column containing a null must pass through an identity pandas UDF unchanged."""
    data = [("foo",), (None,), ("bar",), ("bar",)]
    schema = StructType().add("str", StringType())
    df = self.spark.createDataFrame(data, schema)
    str_f = pandas_udf(lambda x: x, StringType())
    res = df.select(str_f(col('str')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_string_in_udf(self):
    """Converting ids to strings inside a pandas UDF must match ``cast('string')``."""
    import pandas as pd

    df = self.spark.range(10)
    str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
    actual = df.select(str_f(col('id')))
    expected = df.select(col('id').cast('string'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected.collect(), actual.collect())
def test_manual(self):
    """Compare grouped aggregate pandas UDF results against a hand-written table.

    Besides the fixture's sum and mean UDFs, a third UDF reuses the mean
    function with an array return type and is applied to ``array(v)``.
    """
    df = self.data
    sum_udf = self.pandas_agg_sum_udf
    mean_udf = self.pandas_agg_mean_udf
    mean_arr_udf = pandas_udf(
        self.pandas_agg_mean_udf.func,
        ArrayType(self.pandas_agg_mean_udf.returnType),
        self.pandas_agg_mean_udf.evalType)

    aggregated = df.groupby('id').agg(
        sum_udf(df.v),
        mean_udf(df.v),
        mean_arr_udf(array(df.v)))
    result1 = aggregated.sort('id')

    # Rows run from [0, 245.0, 24.5, [24.5]] up to [9, 335.0, 33.5, [33.5]].
    expected_rows = [
        [i, 245.0 + 10 * i, 24.5 + i, [24.5 + i]]
        for i in range(10)
    ]
    expected1 = self.spark.createDataFrame(
        expected_rows,
        ['id', 'sum(v)', 'avg(v)', 'avg(array(v))'])

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_register_grouped_map_udf(self):
    """Registering a grouped map pandas UDF as a SQL function must raise ValueError."""
    foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP)
    expected_message = (
        'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*')
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(ValueError, expected_message):
            self.spark.catalog.registerFunction("foo_udf", foo_udf)
def test_vectorized_udf_null_double(self):
    """A double column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(3.0,), (5.0,), (-1.0,), (None,)]
    schema = StructType().add("double", DoubleType())
    df = self.spark.createDataFrame(data, schema)
    double_f = pandas_udf(lambda x: x, DoubleType())
    res = df.select(double_f(col('double')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_vectorized_udf_null_long(self):
    """A long column containing a null must pass through an identity pandas UDF unchanged."""
    data = [(None,), (2,), (3,), (4,)]
    schema = StructType().add("long", LongType())
    df = self.spark.createDataFrame(data, schema)
    long_f = pandas_udf(lambda x: x, LongType())
    res = df.select(long_f(col('long')))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(df.collect(), res.collect())
def test_register_nondeterministic_vectorized_udf_basic(self):
    """A nondeterministic scalar pandas UDF must keep its flags when registered for SQL.

    ``random.randint(6, 6)`` always yields 6, so the SQL call with argument 1
    is expected to return 7.
    """
    scalar_eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF

    random_pandas_udf = pandas_udf(
        lambda x: random.randint(6, 6) + x, IntegerType()).asNondeterministic()
    self.assertEqual(random_pandas_udf.deterministic, False)
    self.assertEqual(random_pandas_udf.evalType, scalar_eval_type)

    nondeterministic_pandas_udf = self.spark.catalog.registerFunction(
        "randomPandasUDF", random_pandas_udf)
    self.assertEqual(nondeterministic_pandas_udf.deterministic, False)
    self.assertEqual(nondeterministic_pandas_udf.evalType, scalar_eval_type)

    rows = self.spark.sql("SELECT randomPandasUDF(1)").collect()
    [row] = rows  # exactly one row is expected
    self.assertEqual(row[0], 7)
def test_vectorized_udf_struct_type(self):
    """Check struct-returning and struct-consuming scalar pandas UDFs.

    The return type is given once as a ``StructType`` and once as a DDL
    string. Feeding a struct column into an identity UDF is expected to raise
    on pyarrow older than 0.10.0 and to round-trip otherwise.
    """
    import pandas as pd
    import pyarrow as pa

    df = self.spark.range(10)
    return_type = StructType([
        StructField('id', LongType()),
        StructField('str', StringType()),
    ])

    def func(id):
        return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

    expected_struct = struct(col('id'), col('id').cast('string').alias('str'))
    expected = df.select(expected_struct.alias('struct')).collect()

    # StructType instance as the return type.
    f = pandas_udf(func, returnType=return_type)
    actual = df.select(f(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    # DDL-formatted string as the return type.
    g = pandas_udf(func, 'id: long, str: string')
    actual = df.select(g(col('id')).alias('struct')).collect()
    self.assertEqual(expected, actual)

    # Struct column as the UDF input.
    struct_f = pandas_udf(lambda x: x, return_type)
    input_struct = struct(col('id'), col('id').cast('string').alias('str'))
    actual = df.select(struct_f(input_struct))

    if LooseVersion(pa.__version__) >= LooseVersion("0.10.0"):
        self.assertEqual(expected, actual.collect())
    else:
        with QuietTest(self.sc):
            from py4j.protocol import Py4JJavaError
            with self.assertRaisesRegexp(
                    Py4JJavaError,
                    'Unsupported type in conversion from Arrow'):
                self.assertEqual(expected, actual.collect())
def test_timestamp_dst(self):
    """Timestamps around a DST change must survive an identity grouped map pandas UDF."""
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
    dt = [datetime.datetime(2015, 11, 1, hour, 30) for hour in (0, 1, 2)]
    df = self.spark.createDataFrame(dt, 'timestamp').toDF('time')

    foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP)
    grouped = df.groupby('time').apply(foo_udf)
    result = grouped.sort('time')

    self.assertPandasEqual(df.toPandas(), result.toPandas())
def map_from_array(theArray, theDelim):
    """Build a map column from an array column of delimiter-separated entries.

    Every entry of every array is split on ``theDelim``; the first piece
    becomes a key and the second piece the matching value.

    :param theArray: array column passed to both pandas UDFs.
    :param theDelim: delimiter string passed to ``str.split`` for each entry.
    :return: the result of ``f.map_from_arrays`` over the extracted keys and values.
    """
    def pull_key_val(x, d, kind):
        # Select which side of the split to keep.
        if kind == "key":
            index = 0
        elif kind == "val":
            index = 1
        else:
            # The original raised a bare string, which is itself a TypeError
            # on Python 3; raise a real exception instead.
            raise ValueError("Bad input")
        return [i.split(d)[index] for i in x]

    pull_key_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "key")),
        ArrayType(StringType()))
    pull_val_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "val")),
        ArrayType(StringType()))

    return f.map_from_arrays(pull_key_udf(theArray), pull_val_udf(theArray))
def test_coerce(self):
    """An identity grouped map UDF declared with ``v double`` must yield ``v`` as float64."""
    df = self.data
    foo = pandas_udf(lambda pdf: pdf, 'id long, v double', PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(foo).sort('id').toPandas()

    # Build the expectation with plain pandas, then cast v to float64.
    local_groups = df.toPandas().groupby('id')
    expected = local_groups.apply(foo.func).reset_index(drop=True)
    expected = expected.assign(v=expected.v.astype('float64'))

    assert_frame_equal(expected, result, check_column_type=_check_column_type)
def test_datatype_string(self):
    """A grouped map UDF whose schema is a DDL string must match plain pandas."""
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    df = self.data
    foo_udf = pandas_udf(
        lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id),
        'id long, v int, v1 double, v2 long',
        PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(foo_udf).sort('id').toPandas()

    # Same function applied group-wise with plain pandas.
    local_groups = df.toPandas().groupby('id')
    expected = local_groups.apply(foo_udf.func).reset_index(drop=True)

    self.assertPandasEqual(expected, result)
def test_datatype_string(self):
    """A grouped map UDF whose schema is a DDL string must match plain pandas."""
    df = self.data
    foo_udf = pandas_udf(
        lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id),
        'id long, v int, v1 double, v2 long',
        PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(foo_udf).sort('id').toPandas()

    # Same function applied group-wise with plain pandas.
    local_groups = df.toPandas().groupby('id')
    expected = local_groups.apply(foo_udf.func).reset_index(drop=True)

    assert_frame_equal(expected, result, check_column_type=_check_column_type)
def pandas_udf_func(func=None, args=None):
    """Wrap ``func`` in a pandas UDF that applies it to every element of a series.

    :param func: callable handed to ``Series.apply``.
    :param args: extra positional arguments forwarded to ``func``; ``None`` or
        ``(None, )`` means no extra arguments.
    :return: the UDF built by ``F.pandas_udf``.

    NOTE(review): ``func_return_type`` is not defined in this function; it is
    presumably provided by an enclosing scope - confirm.
    """
    # TODO: Get the column type, so is not necessary to pass the return type as param.

    def to_serie(value):
        # Apply the function over the whole series
        no_extra_args = args is None or args == (None, )
        if no_extra_args:
            return value.apply(func)
        return value.apply(func, args=args)

    return F.pandas_udf(to_serie, func_return_type)
def testArrowPandas(spark):
    """Walk through Arrow-backed pandas conversion, a scalar pandas UDF and a grouped map UDF.

    :param spark: an existing SparkSession; its Arrow setting is switched on here.
    """
    # Enable Arrow-based columnar data transfers
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # Generate a Pandas DataFrame
    pdf = pd.DataFrame(np.random.rand(100, 3))

    # Create a Spark DataFrame from a Pandas DataFrame using Arrow
    df = spark.createDataFrame(pdf)
    df.show()

    # Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
    result_pdf = df.select("*").toPandas()
    log.info(f'pandas-- {result_pdf}')

    # Scalar test
    # http://spark.apache.org/docs/latest/sql-programming-guide.html#scalar
    def multiply_func(x, y):
        return x * y

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    log.info(f'multiply_func(x, x) \n x = {x} \n {multiply_func(x, x)}')
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))
    # df.show() prints the frame and returns None, so it is called on its own
    # instead of being interpolated into the log message.
    log.info('df before')
    df.show()

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()

    # groupmap
    # http://spark.apache.org/docs/latest/sql-programming-guide.html#grouped-map
    df = spark.createDataFrame(
        [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
        ("id", "v"))

    @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
    def substract_mean(pdf):
        # pdf is a pandas.DataFrame not spark dataframe, can not use show
        v = pdf.v
        log.info(f'grouped {pdf}')
        return pdf.assign(v=v - v.mean())

    df.groupby("id").apply(substract_mean).show()
def test_unsupported_types(self):
    """Grouped map pandas UDFs must reject schemas containing an unsupported field type.

    ``BinaryType`` is added to the rejected list only when the installed
    pyarrow is older than 0.10.0.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*'
    unsupported_types = [
        StructField('map', MapType(StringType(), IntegerType())),
        StructField('arr_ts', ArrayType(TimestampType())),
        StructField('null', NullType()),
    ]

    # TODO: Remove this if-statement once minimum pyarrow version is 0.10.0
    if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
        unsupported_types.append(StructField('bin', BinaryType()))

    for rejected in unsupported_types:
        # Each schema pairs the 'id' field with exactly one rejected field.
        schema = StructType([StructField('id', LongType(), True), rejected])
        with QuietTest(self.sc):
            with self.assertRaisesRegexp(NotImplementedError, common_err_msg):
                pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
def test_register_vectorized_udf_basic(self):
    """A grouped aggregate pandas UDF must keep its eval type when registered and work in SQL."""
    agg_eval_type = PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF

    sum_pandas_udf = pandas_udf(
        lambda v: v.sum(), "integer", PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)
    self.assertEqual(sum_pandas_udf.evalType, agg_eval_type)

    group_agg_pandas_udf = self.spark.udf.register("sum_pandas_udf", sum_pandas_udf)
    self.assertEqual(group_agg_pandas_udf.evalType, agg_eval_type)

    q = "SELECT sum_pandas_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2"
    # Group v2=0 sums to 3 + 2 = 5; group v2=1 sums to 1.
    actual = sorted(row[0] for row in self.spark.sql(q).collect())
    expected = [1, 5]
    self.assertEqual(actual, expected)
def test_coerce(self):
    """An identity grouped map UDF declared with ``v double`` must yield ``v`` as float64."""
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    df = self.data
    foo = pandas_udf(lambda pdf: pdf, 'id long, v double', PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(foo).sort('id').toPandas()

    # Build the expectation with plain pandas, then cast v to float64.
    local_groups = df.toPandas().groupby('id')
    expected = local_groups.apply(foo.func).reset_index(drop=True)
    expected = expected.assign(v=expected.v.astype('float64'))

    self.assertPandasEqual(expected, result)
def test_register_vectorized_udf_basic(self):
    """A registered scalar pandas UDF must work from both the DataFrame API and SQL."""
    df = self.spark.range(10).select(
        col('id').cast('int').alias('a'),
        col('id').cast('int').alias('b'))

    original_add = pandas_udf(lambda x, y: x + y, IntegerType())
    self.assertEqual(original_add.deterministic, True)
    self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)

    new_add = self.spark.catalog.registerFunction("add1", original_add)
    res1 = df.select(new_add(col('a'), col('b')))
    res2 = self.spark.sql(
        "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
    expected = df.select(expr('a + b'))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected.collect(), res1.collect())
    self.assertEqual(expected.collect(), res2.collect())
def test_timestamp_dst(self):
    """Timestamps around a DST change must survive an identity grouped map pandas UDF."""
    # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
    dt = [datetime.datetime(2015, 11, 1, hour, 30) for hour in (0, 1, 2)]
    df = self.spark.createDataFrame(dt, 'timestamp').toDF('time')

    foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP)
    grouped = df.groupby('time').apply(foo_udf)
    result = grouped.sort('time')

    assert_frame_equal(
        df.toPandas(), result.toPandas(), check_column_type=_check_column_type)
def test_vectorized_udf_timestamps_respect_session_timezone(self):
    """Compare scalar pandas UDF timestamps with the session time zone ignored vs. honoured.

    The same two UDFs (an identity copy and one exposing ``Timestamp.value``)
    are evaluated with ``respectSessionTimeZone`` disabled and then enabled.
    The enabled run must equal the disabled run once the 3 hour offset used by
    the test has been added to the internal values.
    """
    import pandas as pd

    schema = StructType([
        StructField("idx", LongType(), True),
        StructField("timestamp", TimestampType(), True)])
    rows = [
        (1, datetime(1969, 1, 1, 1, 1, 1)),
        (2, datetime(2012, 2, 2, 2, 2, 2)),
        (3, None),
        (4, datetime(2100, 3, 3, 3, 3, 3)),
    ]
    df = self.spark.createDataFrame(rows, schema=schema)

    f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())
    internal_value = pandas_udf(
        lambda series: series.apply(
            lambda stamp: stamp.value if stamp is not pd.NaT else None),
        LongType())

    def add_udf_columns(frame):
        # Attach the copied timestamp and its internal integer value.
        with_copy = frame.withColumn("tscopy", f_timestamp_copy(col("timestamp")))
        return with_copy.withColumn("internal_value", internal_value(col("timestamp")))

    timezone = "America/New_York"

    ignore_session_tz = {
        "spark.sql.execution.pandas.respectSessionTimeZone": False,
        "spark.sql.session.timeZone": timezone}
    with self.sql_conf(ignore_session_tz):
        df_la = add_udf_columns(df)
        result_la = df_la.select(col("idx"), col("internal_value")).collect()
        # Correct result_la by adjusting 3 hours difference between Los Angeles and New York
        diff = 3 * 60 * 60 * 1000 * 1000 * 1000
        result_la_corrected = df_la.select(
            col("idx"), col("tscopy"), col("internal_value") + diff).collect()

    respect_session_tz = {
        "spark.sql.execution.pandas.respectSessionTimeZone": True,
        "spark.sql.session.timeZone": timezone}
    with self.sql_conf(respect_session_tz):
        df_ny = add_udf_columns(df)
        result_ny = df_ny.select(
            col("idx"), col("tscopy"), col("internal_value")).collect()

        self.assertNotEqual(result_ny, result_la)
        self.assertEqual(result_ny, result_la_corrected)
def reduce_block_matrix(self, response: str) -> DataFrame:
    """
    Applies the fitted model to the starting reduced block matrix.

    With response = "linear" the output comes from ``apply_model``; with
    response = "sigmoid" it comes from ``apply_logistic_model``.

    Args:
        response : String specifying what transformation to apply ("linear" or "sigmoid")

    Returns:
        Spark DataFrame containing the result of the transformation.

    Raises:
        ValueError: if ``response`` is neither "linear" nor "sigmoid".
    """
    transform_key_pattern = ['sample_block', 'label']

    # Reject unknown responses before building any UDF.
    if response not in ('linear', 'sigmoid'):
        raise ValueError(
            f'response must be either "linear" or "sigmoid", received "{response}"'
        )

    if response == 'linear':
        warnings.warn('Ignoring any covariates for linear response')
        # An empty DataFrame is passed in the covariate position.
        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(
                key, transform_key_pattern, pdf, self._label_df,
                self.sample_blocks, self._alphas, pd.DataFrame({})),
            reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)
        join_type = 'inner'
    else:
        transform_udf = pandas_udf(
            lambda key, pdf: apply_logistic_model(
                key, transform_key_pattern, pdf, self._label_df,
                self.sample_blocks, self._alphas, self._std_cov_df),
            logistic_reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)
        join_type = 'right'

    return apply_model_df(
        self.reduced_block_df, self.model_df, self.cv_df,
        transform_udf, transform_key_pattern, join_type)
def generate_udf(spec: "rikai.spark.sql.codegen.base.ModelSpec"):
    """Construct a UDF to run pytorch model.

    Reads ``device``, ``num_workers`` and ``batch_size`` from ``spec.options``.
    When ``num_workers`` is not given it defaults to the smaller of the CPU
    count and ``DEFAULT_NUM_WORKERS``.

    Parameters
    ----------
    spec : ModelSpec
        the model specifications object

    Returns
    -------
    A Spark Pandas UDF.
    """
    use_gpu = spec.options.get("device", "cpu") == "gpu"

    # os.cpu_count() returns None when the count cannot be determined, and
    # min(None, int) raises TypeError on Python 3; fall back to the default.
    cpu_count = os.cpu_count()
    if cpu_count is None:
        default_num_workers = DEFAULT_NUM_WORKERS
    else:
        default_num_workers = min(cpu_count, DEFAULT_NUM_WORKERS)
    num_workers = int(spec.options.get("num_workers", default_num_workers))

    batch_size = int(spec.options.get("batch_size", DEFAULT_BATCH_SIZE))

    schema = spec.schema
    # A struct schema makes the UDF yield DataFrames instead of Series.
    should_return_df = isinstance(schema, StructType)
    return_type = (Iterator[pd.DataFrame]
                   if should_return_df else Iterator[pd.Series])

    def torch_inference_udf(iter: Iterator[pd.DataFrame], ) -> return_type:
        device = torch.device("cuda" if use_gpu else "cpu")
        model = spec.load_model()
        model.to(device)
        model.eval()

        with torch.no_grad():
            for series in iter:
                dataset = PandasDataset(series, transform=spec.pre_processing)
                results = []
                for batch in DataLoader(
                        dataset,
                        batch_size=batch_size,
                        num_workers=num_workers,
                ):
                    batch = batch.to(device)
                    predictions = model(batch)
                    if spec.post_processing:
                        predictions = spec.post_processing(predictions)
                    results.extend(predictions)
                # One output object is produced per incoming item.
                if should_return_df:
                    yield pd.DataFrame(results)
                else:
                    yield pd.Series(results)

    return pandas_udf(torch_inference_udf, returnType=schema)
def test_unsupported_types(self):
    """Grouped aggregate pandas UDFs must reject nested-array, struct and map return types."""
    from pyspark.sql.types import DoubleType, MapType
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    expected_message = 'not supported'

    # Array of arrays of timestamps.
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            pandas_udf(
                lambda x: x,
                ArrayType(ArrayType(TimestampType())),
                PandasUDFType.GROUPED_AGG)

    # Struct given as a DDL string.
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            @pandas_udf('mean double, std double', PandasUDFType.GROUPED_AGG)
            def mean_and_std_udf(v):
                return v.mean(), v.std()

    # Map type.
    with QuietTest(self.sc):
        with self.assertRaisesRegexp(NotImplementedError, expected_message):
            @pandas_udf(MapType(DoubleType(), DoubleType()), PandasUDFType.GROUPED_AGG)
            def mean_and_std_udf(v):
                return {v.mean(): v.std()}
def test_wrong_args(self):
    """``groupby(...).apply`` must reject anything that is not a grouped map pandas UDF."""
    df = self.data
    grouped = df.groupby('id')
    invalid_udf = 'Invalid udf'

    with QuietTest(self.sc):
        # A bare Python function.
        with self.assertRaisesRegexp(ValueError, invalid_udf):
            df.groupby('id').apply(lambda x: x)
        # A plain (non-pandas) UDF.
        with self.assertRaisesRegexp(ValueError, invalid_udf):
            df.groupby('id').apply(udf(lambda x: x, DoubleType()))
        # Column expressions.
        with self.assertRaisesRegexp(ValueError, invalid_udf):
            df.groupby('id').apply(sum(df.v))
        with self.assertRaisesRegexp(ValueError, invalid_udf):
            df.groupby('id').apply(df.v + 1)
        # A zero-argument pandas UDF.
        with self.assertRaisesRegexp(ValueError, 'Invalid function'):
            df.groupby('id').apply(
                pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())])))
        # Pandas UDFs that are not of the grouped map type.
        with self.assertRaisesRegexp(ValueError, invalid_udf):
            df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType()))
        with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'):
            df.groupby('id').apply(
                pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR))
def pandas_udf(self):
    """Decode the stored function and wrap it with the module-level ``pandas_udf``.

    :return: the UDF built from the decoded function, ``getReturnType()`` and
        ``self.functionType``.
    :raises ValueError: if the decoded object is not callable.
    """
    encoded_function = self.getFunction()
    return_type = self.getReturnType()
    function_type = self.functionType

    decoded = self.decode_function(encoded_function)
    if not callable(decoded):
        raise ValueError("Decoded function parameter is not callable.")

    # Inside this method the bare name resolves to the module-level function,
    # not to this method.
    return pandas_udf(
        f=decoded,
        returnType=return_type,
        functionType=function_type,
    )
def fit(self) -> DataFrame:
    """
    Fits a ridge reducer model, represented by a Spark DataFrame containing coefficients for each of the ridge alpha
    parameters, for each block in the starting matrix, for each label in the target labels.

    The result is also stored on ``self.model_df``.

    Returns:
        Spark DataFrame containing the model resulting from the fitting routine.
    """
    map_key_pattern = ['header_block', 'sample_block']
    reduce_key_pattern = ['header_block', 'header']

    # Group by label as well when the block matrix carries a label column.
    if 'label' in self.block_df.columns:
        map_key_pattern.append('label')
        reduce_key_pattern.append('label')

    map_udf = pandas_udf(
        lambda key, pdf: map_normal_eqn(
            key, map_key_pattern, pdf, self._std_label_df,
            self.sample_blocks, self._std_cov_df),
        normal_eqn_struct,
        PandasUDFType.GROUPED_MAP)
    reduce_udf = pandas_udf(
        lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
        normal_eqn_struct,
        PandasUDFType.GROUPED_MAP)
    model_udf = pandas_udf(
        lambda key, pdf: solve_normal_eqn(
            key, map_key_pattern, pdf, self._std_label_df,
            self._alphas, self._std_cov_df),
        model_struct,
        PandasUDFType.GROUPED_MAP)

    record_hls_event('wgrRidgeReduceFit')

    # Three grouped-map stages: map, reduce, then solve.
    mapped = self.block_df.groupBy(map_key_pattern).apply(map_udf)
    reduced = mapped.groupBy(reduce_key_pattern).apply(reduce_udf)
    self.model_df = reduced.groupBy(map_key_pattern).apply(model_udf)

    return self.model_df
def test_vectorized_udf_struct_type(self):
    """A scalar pandas UDF returning a DataFrame yields a struct column.

    The return type is given once as a ``StructType`` and once as a DDL
    string; both must produce the same rows as the equivalent ``struct``
    column expression.
    """
    import pandas as pd

    df = self.spark.range(10)

    def func(ids):
        # One output column per struct field.
        return pd.DataFrame({'id': ids, 'str': ids.apply(unicode)})

    struct_schema = StructType(
        [StructField('id', LongType()),
         StructField('str', StringType())])

    expected_column = struct(
        col('id'), col('id').cast('string').alias('str')).alias('struct')
    expected = df.select(expected_column).collect()

    for return_type in (struct_schema, 'id: long, str: string'):
        struct_udf = pandas_udf(func, returnType=return_type)
        actual = df.select(struct_udf(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    # Generates random 0/1 payloads, encodes each row into a pair of binary
    # shares ("a" and "b") with `udf.encode`, and writes them as JSON to
    # `output`, partitioned by server id.
    # NOTE(review): documented with comments rather than a docstring in case
    # this function is registered as a click command, where a docstring would
    # become help text -- confirm.

    # n_rows * n_data random bits; the modulo folds the ids so that every
    # row_id collects n_data of them.
    bits = spark_session().range(n_rows * n_data).select(
        (F.col("id") % n_rows).alias("row_id"),
        F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
    )
    payloads = bits.groupBy("row_id").agg(
        F.collect_list("payload").alias("payload"))

    encode_shares = F.pandas_udf(
        partial(
            udf.encode,
            batch_id,
            n_data,
            public_key_hex_internal,
            public_key_hex_external,
        ),
        returnType="a: binary, b: binary",
    )
    encoded = payloads.select(encode_shares("payload").alias("shares"))

    # repeat this data `scale` times
    repeated = encoded.withColumn(
        "_repeat", F.explode(F.array_repeat(F.lit(0), scale))
    ).drop("_repeat")

    # Every output row gets its own freshly generated UUID string as id.
    random_id = F.udf(lambda: str(uuid4()), returnType="string")
    shares = repeated.withColumn("id", random_id())

    # A single row is enough for a size estimate, since the configuration is
    # the same for all rows.
    row = shares.first()
    row_size = len(b64encode(row.shares.a)) + len(str(uuid4()))
    dataset_estimate_mb = row_size * n_rows * scale * 1.0 / 10**6
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # try to be efficient without caching by repartitioning
    shares_by_server = F.map_from_arrays(
        F.array(F.lit("a"), F.lit("b")),
        F.array("shares.a", "shares.b"),
    )
    repartitioned = (
        shares.withColumn("shares", shares_by_server)
        .repartitionByRange(num_partitions, "id")
        .select("id", F.explode("shares").alias("server_id", "payload"))
    )
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
def pandas_udf_func(attr=None, func=None):
    """Wrap ``func`` in a pandas UDF that applies it across a whole series.

    Args:
        attr: extra value handed to ``func`` as its single additional
            positional argument (``None`` is passed through as ``None``).
        func: callable applied through the input's ``apply`` method.

    Returns:
        The object produced by ``F.pandas_udf`` for the wrapper, using
        ``func_return_type`` as the return type.
    """
    # TODO: Get the column type, so is not necessary to pass the return type as param.
    # NOTE(review): `func_return_type` is not defined inside this function; it
    # is presumably supplied by an enclosing scope -- confirm.

    def to_serie(value):
        # `apply` expects the extra arguments as a tuple.
        extra_args = (None,) if attr is None else (attr,)
        return value.apply(func, args=extra_args)

    return F.pandas_udf(to_serie, func_return_type)
def test_type_annotation(self):
    """Regression test for SPARK-23569: type-hinted functions work as UDFs."""
    # The annotated function is compiled through 'exec' because annotation
    # syntax is a compile-time error on lower Python versions.  A dedicated
    # dictionary is used as the namespace so the current 'locals()' is left
    # untouched.
    #
    # Hyukjin: testing syntax that only exists in higher Python versions this
    # way is ugly and should not be encouraged; it was the last resort
    # available at the time.
    namespace = {}
    annotated_source = (
        "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col")
    exec(annotated_source, namespace)

    single_row = self.spark.range(1)
    noop_udf = pandas_udf(f=namespace['noop'], returnType='bigint')
    df = single_row.select(noop_udf('id'))
    self.assertEqual(df.first()[0], 0)
def test_array_type_correct(self):
    """An identity grouped-map UDF must round-trip an array column.

    The Spark result is compared with applying the same Python function
    through pandas ``groupby``.
    """
    with_array = self.data.withColumn("arr", array(col("id")))
    df = with_array.repartition(1, "id")

    output_schema = (
        StructType()
        .add('id', LongType())
        .add('v', IntegerType())
        .add('arr', ArrayType(LongType())))

    identity_udf = pandas_udf(
        lambda pdf: pdf,
        output_schema,
        PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(identity_udf).sort('id').toPandas()

    # Reference result: the wrapped Python function applied by pandas.
    pandas_groups = df.toPandas().groupby('id')
    expected = pandas_groups.apply(identity_udf.func).reset_index(drop=True)

    self.assertPandasEqual(expected, result)
def test_vectorized_udf_dates(self):
    """Date columns must pass through scalar pandas UDFs unchanged.

    A date column (including a null) is copied by an identity pandas UDF.
    A second UDF compares, on the worker side, the values it receives with
    the original Python data and returns a message for every mismatch.
    """
    schema = StructType().add("idx", LongType()).add("date", DateType())
    data = [
        (0, date(1969, 1, 1),),
        (1, date(2012, 2, 2),),
        (2, None,),
        (3, date(2100, 4, 4),),
        # NOTE(review): presumably chosen because it lies just past
        # pandas.Timestamp.max (2262-04-11) -- confirm.
        (4, date(2262, 4, 12),),
    ]
    df = self.spark.createDataFrame(data, schema=schema)

    date_copy = pandas_udf(lambda t: t, returnType=DateType())
    df = df.withColumn("date_copy", date_copy(col("date")))

    # `date_copy` is accepted but not inspected; the copy is checked on the
    # driver side below.
    @pandas_udf(returnType=StringType())
    def check_data(idx, date, date_copy):
        import pandas as pd
        msgs = []
        is_equal = date.isnull()
        for i in range(len(idx)):
            expected = data[idx[i]][1]
            # A null input matches only a None in the source data.
            if (is_equal[i] and expected is None) or date[i] == expected:
                msgs.append(None)
            else:
                msgs.append(
                    "date values are not equal (date='%s': data[%d][1]='%s')"
                    % (date[i], idx[i], expected))
        return pd.Series(msgs)

    result = df.withColumn(
        "check_data",
        check_data(col("idx"), col("date"), col("date_copy"))).collect()

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(len(data), len(result))
    for expected_row, actual_row in zip(data, result):
        self.assertEqual(expected_row[1], actual_row[1])  # "date" col
        self.assertEqual(expected_row[1], actual_row[2])  # "date_copy" col
        self.assertIsNone(actual_row[3])  # "check_data" col
def verify2(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Verify a batch of SNIPs"""
    # Reads three JSON inputs, joins them on "id", runs `udf.verify2` over
    # the base64-decoded payloads and writes the rows with a non-null result
    # to `output` as JSON.
    click.echo("Running verify2")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)

    def decoded(frame, column_name):
        # Keep the id and expose the base64-decoded payload under a new name.
        return frame.select("id", F.unbase64("payload").alias(column_name))

    joined = (
        decoded(shares, "shares")
        .join(decoded(internal, "internal"), on="id")
        .join(decoded(external, "external"), on="id")
    )

    # `shared_secret` arrives base64-encoded and is decoded once here.
    verify_payload = F.pandas_udf(
        partial(
            udf.verify2,
            batch_id,
            n_data,
            server_id,
            private_key_hex,
            b64decode(shared_secret),
            public_key_hex_internal,
            public_key_hex_external,
        ),
        returnType="binary",
    )
    df = joined.select(
        "id",
        verify_payload("shares", "internal", "external").alias("payload"),
    )

    valid = df.where("payload is not null")
    valid.write.json(output, mode="overwrite")
def transform(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        modeldf: DataFrame,
        covdf: pd.DataFrame = pd.DataFrame({})) -> DataFrame:
    """
    Applies a fitted reducer model to a block matrix, producing the reduced
    block matrix.

    Args:
        blockdf : Spark DataFrame representing the beginning block matrix
        labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
        sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
        modeldf : Spark DataFrame produced by the RidgeReducer fit method, representing the reducer model
        covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
            ensemble (optional).

    Returns:
        Spark DataFrame representing the reduced block matrix
    """
    # NOTE(review): the default `covdf` is a single DataFrame created when the
    # function is defined and reused by every call that omits the argument.
    validate_inputs(labeldf, covdf)

    # Both cases start from the same right join of the blocks onto the model.
    joined = blockdf.drop('sort_key').join(
        modeldf, ['header_block', 'sample_block', 'header'], 'right')

    transform_key_pattern = ['header_block', 'sample_block']
    if 'label' in blockdf.columns:
        transform_key_pattern.append('label')
        # Where 'label' is null, fall back to the first entry of 'labels'.
        joined = joined.withColumn(
            'label', f.coalesce(f.col('label'), f.col('labels').getItem(0)))

    transform_udf = pandas_udf(
        lambda key, pdf: apply_model(
            key, transform_key_pattern, pdf, labeldf, sample_blocks,
            self.alphas, covdf),
        reduced_matrix_struct,
        PandasUDFType.GROUPED_MAP)

    record_hls_event('wgrRidgeReduceTransform')

    grouped = joined.groupBy(transform_key_pattern)
    return grouped.apply(transform_udf)
def transform(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        modeldf: DataFrame,
        cvdf: DataFrame,
        covdf: pd.DataFrame = pd.DataFrame({})) -> pd.DataFrame:
    """
    Produces predictions for the labels in ``labeldf`` by applying a fitted
    RidgeRegression model to the starting block matrix.

    Args:
        blockdf : Spark DataFrame representing the beginning block matrix X
        labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
        sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
        modeldf : Spark DataFrame produced by the RidgeRegression fit method, representing the reducer model
        cvdf : Spark DataFrame produced by the RidgeRegression fit method, containing the results of the
            cross validation routine.
        covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
            ensemble (optional).

    Returns:
        Pandas DataFrame containing prediction y_hat values. The shape and order match labeldf such that the
        rows are indexed by sample ID and the columns by label. The column types are float64.
    """
    # NOTE(review): the default `covdf` is a single DataFrame created when the
    # function is defined and reused by every call that omits the argument.
    validate_inputs(labeldf, covdf)

    transform_key_pattern = ['sample_block', 'label']
    transform_udf = pandas_udf(
        lambda key, pdf: apply_model(
            key, transform_key_pattern, pdf, labeldf, sample_blocks,
            self.alphas, covdf),
        reduced_matrix_struct,
        PandasUDFType.GROUPED_MAP)

    # Right-join the blocks onto the model; header_block is dropped on both sides.
    joined = blockdf.drop('header_block', 'sort_key').join(
        modeldf.drop('header_block'), ['sample_block', 'header'], 'right')
    # Where 'label' is null, fall back to the first entry of 'labels'.
    labelled = joined.withColumn(
        'label', f.coalesce(f.col('label'), f.col('labels').getItem(0)))
    predicted = labelled.groupBy(transform_key_pattern).apply(transform_udf)
    # Inner join with the cross-validation results on label and alpha.
    blocked_prediction_df = predicted.join(cvdf, ['label', 'alpha'], 'inner')

    pivoted_df = flatten_prediction_df(
        blocked_prediction_df, sample_blocks, labeldf)

    record_hls_event('wgrRidgeRegressionTransform')

    return pivoted_df