    def test_vectorized_udf_timestamps_respect_session_timezone(self):
        schema = StructType([
            StructField("idx", LongType(), True),
            StructField("timestamp", TimestampType(), True)])
        data = [(1, datetime(1969, 1, 1, 1, 1, 1)),
                (2, datetime(2012, 2, 2, 2, 2, 2)),
                (3, None),
                (4, datetime(2100, 3, 3, 3, 3, 3))]
        df = self.spark.createDataFrame(data, schema=schema)

        f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())
        internal_value = pandas_udf(
            lambda ts: ts.apply(lambda ts: ts.value if ts is not pd.NaT else None), LongType())

        timezone = "America/New_York"
        with self.sql_conf({
                "spark.sql.execution.pandas.respectSessionTimeZone": False,
                "spark.sql.session.timeZone": timezone}):
            df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \
                .withColumn("internal_value", internal_value(col("timestamp")))
            result_la = df_la.select(col("idx"), col("internal_value")).collect()
            # Correct result_la by adjusting for the 3-hour difference between Los Angeles and New York
            diff = 3 * 60 * 60 * 1000 * 1000 * 1000
            result_la_corrected = \
                df_la.select(col("idx"), col("tscopy"), col("internal_value") + diff).collect()

        with self.sql_conf({
                "spark.sql.execution.pandas.respectSessionTimeZone": True,
                "spark.sql.session.timeZone": timezone}):
            df_ny = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \
                .withColumn("internal_value", internal_value(col("timestamp")))
            result_ny = df_ny.select(col("idx"), col("tscopy"), col("internal_value")).collect()

            self.assertNotEqual(result_ny, result_la)
            self.assertEqual(result_ny, result_la_corrected)
 def test_vectorized_udf_chained(self):
     from pyspark.sql.functions import pandas_udf, col
     df = self.spark.range(10)
     f = pandas_udf(lambda x: x + 1, LongType())
     g = pandas_udf(lambda x: x - 1, LongType())
     res = df.select(g(f(col('id'))))
     self.assertEquals(df.collect(), res.collect())
Example #3
    def test_udf_wrong_arg(self):
        with QuietTest(self.sc):
            with self.assertRaises(ParseException):
                @pandas_udf('blah')
                def foo(x):
                    return x
            with self.assertRaisesRegexp(ValueError, 'Invalid returnType.*None'):
                @pandas_udf(functionType=PandasUDFType.SCALAR)
                def foo(x):
                    return x
            with self.assertRaisesRegexp(ValueError, 'Invalid functionType'):
                @pandas_udf('double', 100)
                def foo(x):
                    return x

            with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'):
                pandas_udf(lambda: 1, LongType(), PandasUDFType.SCALAR)
            with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'):
                @pandas_udf(LongType(), PandasUDFType.SCALAR)
                def zero_with_type():
                    return 1

            with self.assertRaisesRegexp(TypeError, 'Invalid returnType'):
                @pandas_udf(returnType=PandasUDFType.GROUPED_MAP)
                def foo(df):
                    return df
            with self.assertRaisesRegexp(TypeError, 'Invalid returnType'):
                @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP)
                def foo(df):
                    return df
            with self.assertRaisesRegexp(ValueError, 'Invalid function'):
                @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP)
                def foo(k, v, w):
                    return k
 def test_vectorized_udf_unsupported_types(self):
     from pyspark.sql.functions import pandas_udf
     with QuietTest(self.sc):
         with self.assertRaisesRegexp(
                 NotImplementedError,
                 'Invalid returnType.*scalar Pandas UDF.*MapType'):
             pandas_udf(lambda x: x, MapType(StringType(), IntegerType()))
 def test_vectorized_udf_wrong_return_type(self):
     from pyspark.sql.functions import pandas_udf
     with QuietTest(self.sc):
         with self.assertRaisesRegexp(
                 NotImplementedError,
                 'Invalid returnType.*scalar Pandas UDF.*MapType'):
             pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType()))
    def test_vectorized_udf_struct_type(self):
        df = self.spark.range(10)
        return_type = StructType([
            StructField('id', LongType()),
            StructField('str', StringType())])

        def func(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        f = pandas_udf(func, returnType=return_type)

        expected = df.select(struct(col('id'), col('id').cast('string').alias('str'))
                             .alias('struct')).collect()

        actual = df.select(f(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        g = pandas_udf(func, 'id: long, str: string')
        actual = df.select(g(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        struct_f = pandas_udf(lambda x: x, return_type)
        actual = df.select(struct_f(struct(col('id'), col('id').cast('string').alias('str'))))
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            with QuietTest(self.sc):
                from py4j.protocol import Py4JJavaError
                with self.assertRaisesRegexp(
                        Py4JJavaError,
                        'Unsupported type in conversion from Arrow'):
                    self.assertEqual(expected, actual.collect())
        else:
            self.assertEqual(expected, actual.collect())
 def test_wrong_return_type(self):
     with QuietTest(self.sc):
         with self.assertRaisesRegexp(
                 NotImplementedError,
                 'Invalid returnType.*grouped map Pandas UDF.*MapType'):
             pandas_udf(
                 lambda pdf: pdf,
                 'id long, v map<int, int>',
                 PandasUDFType.GROUPED_MAP)
 def test_vectorized_udf_unsupported_types(self):
     with QuietTest(self.sc):
         with self.assertRaisesRegexp(
                 NotImplementedError,
                 'Invalid returnType.*scalar Pandas UDF.*MapType'):
             pandas_udf(lambda x: x, MapType(StringType(), IntegerType()))
         with self.assertRaisesRegexp(
                 NotImplementedError,
                 'Invalid returnType.*scalar Pandas UDF.*ArrayType.StructType'):
             pandas_udf(lambda x: x, ArrayType(StructType([StructField('a', IntegerType())])))
 def test_vectorized_udf_complex(self):
     df = self.spark.range(10).select(
         col('id').cast('int').alias('a'),
         col('id').cast('int').alias('b'),
         col('id').cast('double').alias('c'))
     add = pandas_udf(lambda x, y: x + y, IntegerType())
     power2 = pandas_udf(lambda x: 2 ** x, IntegerType())
     mul = pandas_udf(lambda x, y: x * y, DoubleType())
     res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c')))
     expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c'))
     self.assertEquals(expected.collect(), res.collect())
Example #10
    def test_stopiteration_in_udf(self):
        from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
        from py4j.protocol import Py4JJavaError

        def foo(x):
            raise StopIteration()

        def foofoo(x, y):
            raise StopIteration()

        exc_message = "Caught StopIteration thrown from user's code; failing the task"
        df = self.spark.range(0, 100)

        # plain udf (test for SPARK-23754)
        self.assertRaisesRegexp(
            Py4JJavaError,
            exc_message,
            df.withColumn('v', udf(foo)('id')).collect
        )

        # pandas scalar udf
        self.assertRaisesRegexp(
            Py4JJavaError,
            exc_message,
            df.withColumn(
                'v', pandas_udf(foo, 'double', PandasUDFType.SCALAR)('id')
            ).collect
        )

        # pandas grouped map
        self.assertRaisesRegexp(
            Py4JJavaError,
            exc_message,
            df.groupBy('id').apply(
                pandas_udf(foo, df.schema, PandasUDFType.GROUPED_MAP)
            ).collect
        )

        self.assertRaisesRegexp(
            Py4JJavaError,
            exc_message,
            df.groupBy('id').apply(
                pandas_udf(foofoo, df.schema, PandasUDFType.GROUPED_MAP)
            ).collect
        )

        # pandas grouped agg
        self.assertRaisesRegexp(
            Py4JJavaError,
            exc_message,
            df.groupBy('id').agg(
                pandas_udf(foo, 'double', PandasUDFType.GROUPED_AGG)('id')
            ).collect
        )
    def test_mixed_scalar_udfs_followed_by_grouby_apply(self):
        df = self.spark.range(0, 10).toDF('v1')
        df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \
            .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1']))

        result = df.groupby() \
            .apply(pandas_udf(lambda x: pd.DataFrame([x.sum().sum()]),
                              'sum int',
                              PandasUDFType.GROUPED_MAP))

        self.assertEquals(result.collect()[0]['sum'], 165)
    def test_unsupported_types(self):
        common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*'
        unsupported_types = [
            StructField('map', MapType(StringType(), IntegerType())),
            StructField('arr_ts', ArrayType(TimestampType())),
            StructField('null', NullType()),
            StructField('struct', StructType([StructField('l', LongType())])),
        ]

        for unsupported_type in unsupported_types:
            schema = StructType([StructField('id', LongType(), True), unsupported_type])
            with QuietTest(self.sc):
                with self.assertRaisesRegexp(NotImplementedError, common_err_msg):
                    pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
 def test_vectorized_udf_basic(self):
     from pyspark.sql.functions import pandas_udf, col, array
     df = self.spark.range(10).select(
         col('id').cast('string').alias('str'),
         col('id').cast('int').alias('int'),
         col('id').alias('long'),
         col('id').cast('float').alias('float'),
         col('id').cast('double').alias('double'),
         col('id').cast('decimal').alias('decimal'),
         col('id').cast('boolean').alias('bool'),
         array(col('id')).alias('array_long'))
     f = lambda x: x
     str_f = pandas_udf(f, StringType())
     int_f = pandas_udf(f, IntegerType())
     long_f = pandas_udf(f, LongType())
     float_f = pandas_udf(f, FloatType())
     double_f = pandas_udf(f, DoubleType())
     decimal_f = pandas_udf(f, DecimalType())
     bool_f = pandas_udf(f, BooleanType())
     array_long_f = pandas_udf(f, ArrayType(LongType()))
     res = df.select(str_f(col('str')), int_f(col('int')),
                     long_f(col('long')), float_f(col('float')),
                     double_f(col('double')), decimal_f('decimal'),
                     bool_f(col('bool')), array_long_f('array_long'))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_binary(self):
     if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
         with QuietTest(self.sc):
             with self.assertRaisesRegexp(
                     NotImplementedError,
                     'Invalid returnType.*scalar Pandas UDF.*BinaryType'):
                 pandas_udf(lambda x: x, BinaryType())
     else:
         data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)]
         schema = StructType().add("binary", BinaryType())
         df = self.spark.createDataFrame(data, schema)
         str_f = pandas_udf(lambda x: x, BinaryType())
         res = df.select(str_f(col('binary')))
         self.assertEquals(df.collect(), res.collect())
    def test_vectorized_udf_nested_struct(self):
        nested_type = StructType([
            StructField('id', IntegerType()),
            StructField('nested', StructType([
                StructField('foo', StringType()),
                StructField('bar', FloatType())
            ]))
        ])

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(
                    Exception,
                    'Invalid returnType with scalar Pandas UDFs'):
                pandas_udf(lambda x: x, returnType=nested_type)
 def test_pandas_udf_nested_arrays(self):
     tokenize = pandas_udf(lambda s: s.apply(lambda str: [str.split(' ')]),
                           ArrayType(ArrayType(StringType())))
     self.assertEqual(tokenize.returnType, ArrayType(ArrayType(StringType())))
     df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"])
     result = df.select(tokenize("vals").alias("hi"))
     self.assertEqual([Row(hi=[[u'hi', u'boo']]), Row(hi=[[u'bye', u'boo']])], result.collect())
Example #17
def scalar_pandas_udf_example(spark):
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
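
The example above assumes an existing SparkSession passed in as 'spark'. A minimal driver sketch for running it standalone might look like this (the app name and builder settings are illustrative assumptions, not part of the original example):

# Hedged driver sketch: assumes local execution; builder options are illustrative.
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder \
        .appName("scalar_pandas_udf_example") \
        .getOrCreate()
    scalar_pandas_udf_example(spark)
    spark.stop()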
    def test_vectorized_udf_dates(self):
        schema = StructType().add("idx", LongType()).add("date", DateType())
        data = [(0, date(1969, 1, 1),),
                (1, date(2012, 2, 2),),
                (2, None,),
                (3, date(2100, 4, 4),),
                (4, date(2262, 4, 12),)]
        df = self.spark.createDataFrame(data, schema=schema)

        date_copy = pandas_udf(lambda t: t, returnType=DateType())
        df = df.withColumn("date_copy", date_copy(col("date")))

        @pandas_udf(returnType=StringType())
        def check_data(idx, date, date_copy):
            msgs = []
            is_equal = date.isnull()
            for i in range(len(idx)):
                if (is_equal[i] and data[idx[i]][1] is None) or \
                        date[i] == data[idx[i]][1]:
                    msgs.append(None)
                else:
                    msgs.append(
                        "date values are not equal (date='%s': data[%d][1]='%s')"
                        % (date[i], idx[i], data[idx[i]][1]))
            return pd.Series(msgs)

        result = df.withColumn("check_data",
                               check_data(col("idx"), col("date"), col("date_copy"))).collect()

        self.assertEquals(len(data), len(result))
        for i in range(len(result)):
            self.assertEquals(data[i][1], result[i][1])  # "date" col
            self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
            self.assertIsNone(result[i][3])  # "check_data" col
 def test_vectorized_udf_null_float(self):
     data = [(3.0,), (5.0,), (-1.0,), (None,)]
     schema = StructType().add("float", FloatType())
     df = self.spark.createDataFrame(data, schema)
     float_f = pandas_udf(lambda x: x, FloatType())
     res = df.select(float_f(col('float')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_int(self):
     data = [(None,), (2,), (3,), (4,)]
     schema = StructType().add("int", IntegerType())
     df = self.spark.createDataFrame(data, schema)
     int_f = pandas_udf(lambda x: x, IntegerType())
     res = df.select(int_f(col('int')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_decimal(self):
     data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)]
     schema = StructType().add("decimal", DecimalType(38, 18))
     df = self.spark.createDataFrame(data, schema)
     decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18))
     res = df.select(decimal_f(col('decimal')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_boolean(self):
     data = [(True,), (True,), (None,), (False,)]
     schema = StructType().add("bool", BooleanType())
     df = self.spark.createDataFrame(data, schema)
     bool_f = pandas_udf(lambda x: x, BooleanType())
     res = df.select(bool_f(col('bool')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_byte(self):
     data = [(None,), (2,), (3,), (4,)]
     schema = StructType().add("byte", ByteType())
     df = self.spark.createDataFrame(data, schema)
     byte_f = pandas_udf(lambda x: x, ByteType())
     res = df.select(byte_f(col('byte')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_array(self):
     data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)]
     array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
     df = self.spark.createDataFrame(data, schema=array_schema)
     array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()))
     result = df.select(array_f(col('array')))
     self.assertEquals(df.collect(), result.collect())
 def test_vectorized_udf_null_string(self):
     data = [("foo",), (None,), ("bar",), ("bar",)]
     schema = StructType().add("str", StringType())
     df = self.spark.createDataFrame(data, schema)
     str_f = pandas_udf(lambda x: x, StringType())
     res = df.select(str_f(col('str')))
     self.assertEquals(df.collect(), res.collect())
Example #26
 def test_vectorized_udf_string_in_udf(self):
     import pandas as pd
     df = self.spark.range(10)
     str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
     actual = df.select(str_f(col('id')))
     expected = df.select(col('id').cast('string'))
     self.assertEquals(expected.collect(), actual.collect())
    def test_manual(self):
        df = self.data
        sum_udf = self.pandas_agg_sum_udf
        mean_udf = self.pandas_agg_mean_udf
        mean_arr_udf = pandas_udf(
            self.pandas_agg_mean_udf.func,
            ArrayType(self.pandas_agg_mean_udf.returnType),
            self.pandas_agg_mean_udf.evalType)

        result1 = df.groupby('id').agg(
            sum_udf(df.v),
            mean_udf(df.v),
            mean_arr_udf(array(df.v))).sort('id')
        expected1 = self.spark.createDataFrame(
            [[0, 245.0, 24.5, [24.5]],
             [1, 255.0, 25.5, [25.5]],
             [2, 265.0, 26.5, [26.5]],
             [3, 275.0, 27.5, [27.5]],
             [4, 285.0, 28.5, [28.5]],
             [5, 295.0, 29.5, [29.5]],
             [6, 305.0, 30.5, [30.5]],
             [7, 315.0, 31.5, [31.5]],
             [8, 325.0, 32.5, [32.5]],
             [9, 335.0, 33.5, [33.5]]],
            ['id', 'sum(v)', 'avg(v)', 'avg(array(v))'])

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
 def test_register_grouped_map_udf(self):
     foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP)
     with QuietTest(self.sc):
         with self.assertRaisesRegexp(
                 ValueError,
                 'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*'):
             self.spark.catalog.registerFunction("foo_udf", foo_udf)
 def test_vectorized_udf_null_double(self):
     data = [(3.0,), (5.0,), (-1.0,), (None,)]
     schema = StructType().add("double", DoubleType())
     df = self.spark.createDataFrame(data, schema)
     double_f = pandas_udf(lambda x: x, DoubleType())
     res = df.select(double_f(col('double')))
     self.assertEquals(df.collect(), res.collect())
 def test_vectorized_udf_null_long(self):
     data = [(None,), (2,), (3,), (4,)]
     schema = StructType().add("long", LongType())
     df = self.spark.createDataFrame(data, schema)
     long_f = pandas_udf(lambda x: x, LongType())
     res = df.select(long_f(col('long')))
     self.assertEquals(df.collect(), res.collect())
 def test_register_nondeterministic_vectorized_udf_basic(self):
     random_pandas_udf = pandas_udf(lambda x: random.randint(6, 6) + x,
                                    IntegerType()).asNondeterministic()
     self.assertEqual(random_pandas_udf.deterministic, False)
     self.assertEqual(random_pandas_udf.evalType,
                      PythonEvalType.SQL_SCALAR_PANDAS_UDF)
     nondeterministic_pandas_udf = self.spark.catalog.registerFunction(
         "randomPandasUDF", random_pandas_udf)
     self.assertEqual(nondeterministic_pandas_udf.deterministic, False)
     self.assertEqual(nondeterministic_pandas_udf.evalType,
                      PythonEvalType.SQL_SCALAR_PANDAS_UDF)
     [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect()
     self.assertEqual(row[0], 7)
    def test_vectorized_udf_struct_type(self):
        import pandas as pd
        import pyarrow as pa

        df = self.spark.range(10)
        return_type = StructType(
            [StructField('id', LongType()),
             StructField('str', StringType())])

        def func(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        f = pandas_udf(func, returnType=return_type)

        expected = df.select(
            struct(col('id'),
                   col('id').cast('string').alias('str')).alias(
                       'struct')).collect()

        actual = df.select(f(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        g = pandas_udf(func, 'id: long, str: string')
        actual = df.select(g(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        struct_f = pandas_udf(lambda x: x, return_type)
        actual = df.select(
            struct_f(struct(col('id'),
                            col('id').cast('string').alias('str'))))
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            with QuietTest(self.sc):
                from py4j.protocol import Py4JJavaError
                with self.assertRaisesRegexp(
                        Py4JJavaError,
                        'Unsupported type in conversion from Arrow'):
                    self.assertEqual(expected, actual.collect())
        else:
            self.assertEqual(expected, actual.collect())
 def test_timestamp_dst(self):
     from pyspark.sql.functions import pandas_udf, PandasUDFType
     # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
     dt = [
         datetime.datetime(2015, 11, 1, 0, 30),
         datetime.datetime(2015, 11, 1, 1, 30),
         datetime.datetime(2015, 11, 1, 2, 30)
     ]
     df = self.spark.createDataFrame(dt, 'timestamp').toDF('time')
     foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp',
                          PandasUDFType.GROUPED_MAP)
     result = df.groupby('time').apply(foo_udf).sort('time')
     self.assertPandasEqual(df.toPandas(), result.toPandas())
Example #34
def map_from_array(theArray, theDelim):
    def pull_key_val(x, d, kind):
        retval = []
        index = -1
        if (kind == "key"):
            index = 0
        if (kind == "val"):
            index = 1
        if index == -1:
            raise "Bad input"
        for i in x:
            retval.append(i.split(d)[index])
        return (retval)

    pull_key_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "key")),
        ArrayType(StringType()))
    pull_val_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "val")),
        ArrayType(StringType()))

    return (f.map_from_arrays(pull_key_udf(theArray), pull_val_udf(theArray)))
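
A hedged usage sketch for map_from_array follows (the sample DataFrame, the "kv" column name, the ":" delimiter, and the live SparkSession 'spark' are assumptions for illustration; 'f' is pyspark.sql.functions, as in the snippet above). Note that f.map_from_arrays requires Spark 2.4+.

# Hedged usage sketch: build a map column from "key:val" string pairs (sample data assumed).
from pyspark.sql import functions as f

df = spark.createDataFrame([(["a:1", "b:2"],), (["x:9"],)], ["kv"])
result = df.withColumn("kv_map", map_from_array(f.col("kv"), ":"))
result.show(truncate=False)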
Example #35
    def test_coerce(self):
        df = self.data

        foo = pandas_udf(lambda pdf: pdf, 'id long, v double',
                         PandasUDFType.GROUPED_MAP)

        result = df.groupby('id').apply(foo).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(
            foo.func).reset_index(drop=True)
        expected = expected.assign(v=expected.v.astype('float64'))
        assert_frame_equal(expected,
                           result,
                           check_column_type=_check_column_type)
    def test_datatype_string(self):
        from pyspark.sql.functions import pandas_udf, PandasUDFType
        df = self.data

        foo_udf = pandas_udf(
            lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id),
            'id long, v int, v1 double, v2 long',
            PandasUDFType.GROUPED_MAP
        )

        result = df.groupby('id').apply(foo_udf).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True)
        self.assertPandasEqual(expected, result)
Example #37
    def test_datatype_string(self):
        df = self.data

        foo_udf = pandas_udf(
            lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id),
            'id long, v int, v1 double, v2 long', PandasUDFType.GROUPED_MAP)

        result = df.groupby('id').apply(foo_udf).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(
            foo_udf.func).reset_index(drop=True)
        assert_frame_equal(expected,
                           result,
                           check_column_type=_check_column_type)
Example #38
    def pandas_udf_func(func=None, args=None):
        # TODO: Get the column type, so it is not necessary to pass the return type as a param.

        # Apply the function over the whole series
        def apply_to_series(value, args):
            if args is None or args == (None, ):
                return value.apply(func)
            else:
                return value.apply(func, args=args)

        def to_serie(value):
            return apply_to_series(value, args)

        return F.pandas_udf(to_serie, func_return_type)
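
A hedged usage sketch for pandas_udf_func (note that func_return_type is a free variable in the snippet above; here it is assumed to be bound in the enclosing scope, and the sample DataFrame and SparkSession 'spark' are illustrative):

# Hedged usage sketch: func_return_type is assumed to be resolvable from the enclosing scope.
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

func_return_type = StringType()  # assumption for illustration
upper_udf = pandas_udf_func(func=lambda s: s.upper(), args=None)

df = spark.createDataFrame([("hello",), ("world",)], ["word"])
df.select(upper_udf(F.col("word")).alias("upper")).show()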
Example #39
def testArrowPandas(spark):
    # Enable Arrow-based columnar data transfers
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # Generate a Pandas DataFrame
    pdf = pd.DataFrame(np.random.rand(100, 3))

    # Create a Spark DataFrame from a Pandas DataFrame using Arrow
    df = spark.createDataFrame(pdf)
    df.show()

    # Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
    result_pdf = df.select("*").toPandas()
    log.info(f'pandas-- {result_pdf}')

    # Scalar test
    # http://spark.apache.org/docs/latest/sql-programming-guide.html#scalar
    def multiply_func(x, y):
        return x * y

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    log.info(f'multiply_func(x, x) \n x = {x} \n {multiply_func(x, x)}')
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    log.info('df before:')
    df.show()
    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()

    # groupmap
    # http://spark.apache.org/docs/latest/sql-programming-guide.html#grouped-map
    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0),
                                (2, 10.0)], ("id", "v"))

    @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
    def substract_mean(pdf):
        # pdf is a pandas.DataFrame not spark dataframe, can not use show
        v = pdf.v
        log.info(f'grouped {pdf}')
        return pdf.assign(v=v - v.mean())

    df.groupby("id").apply(substract_mean).show()
    def test_unsupported_types(self):
        from distutils.version import LooseVersion
        import pyarrow as pa
        from pyspark.sql.functions import pandas_udf, PandasUDFType

        common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*'
        unsupported_types = [
            StructField('map', MapType(StringType(), IntegerType())),
            StructField('arr_ts', ArrayType(TimestampType())),
            StructField('null', NullType()),
        ]

        # TODO: Remove this if-statement once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            unsupported_types.append(StructField('bin', BinaryType()))

        for unsupported_type in unsupported_types:
            schema = StructType(
                [StructField('id', LongType(), True), unsupported_type])
            with QuietTest(self.sc):
                with self.assertRaisesRegexp(NotImplementedError,
                                             common_err_msg):
                    pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
Example #41
    def test_register_vectorized_udf_basic(self):
        sum_pandas_udf = pandas_udf(lambda v: v.sum(), "integer",
                                    PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)

        self.assertEqual(sum_pandas_udf.evalType,
                         PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)
        group_agg_pandas_udf = self.spark.udf.register("sum_pandas_udf",
                                                       sum_pandas_udf)
        self.assertEqual(group_agg_pandas_udf.evalType,
                         PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF)
        q = "SELECT sum_pandas_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2"
        actual = sorted(map(lambda r: r[0], self.spark.sql(q).collect()))
        expected = [1, 5]
        self.assertEqual(actual, expected)
Example #42
    def test_coerce(self):
        from pyspark.sql.functions import pandas_udf, PandasUDFType
        df = self.data

        foo = pandas_udf(
            lambda pdf: pdf,
            'id long, v double',
            PandasUDFType.GROUPED_MAP
        )

        result = df.groupby('id').apply(foo).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True)
        expected = expected.assign(v=expected.v.astype('float64'))
        self.assertPandasEqual(expected, result)
 def test_register_vectorized_udf_basic(self):
     df = self.spark.range(10).select(
         col('id').cast('int').alias('a'),
         col('id').cast('int').alias('b'))
     original_add = pandas_udf(lambda x, y: x + y, IntegerType())
     self.assertEqual(original_add.deterministic, True)
     self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
     new_add = self.spark.catalog.registerFunction("add1", original_add)
     res1 = df.select(new_add(col('a'), col('b')))
     res2 = self.spark.sql(
         "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
     expected = df.select(expr('a + b'))
     self.assertEquals(expected.collect(), res1.collect())
     self.assertEquals(expected.collect(), res2.collect())
 def test_timestamp_dst(self):
     # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am
     dt = [
         datetime.datetime(2015, 11, 1, 0, 30),
         datetime.datetime(2015, 11, 1, 1, 30),
         datetime.datetime(2015, 11, 1, 2, 30)
     ]
     df = self.spark.createDataFrame(dt, 'timestamp').toDF('time')
     foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp',
                          PandasUDFType.GROUPED_MAP)
     result = df.groupby('time').apply(foo_udf).sort('time')
     assert_frame_equal(df.toPandas(),
                        result.toPandas(),
                        check_column_type=_check_column_type)
    def test_vectorized_udf_timestamps_respect_session_timezone(self):
        import pandas as pd

        schema = StructType([
            StructField("idx", LongType(), True),
            StructField("timestamp", TimestampType(), True)])
        data = [(1, datetime(1969, 1, 1, 1, 1, 1)),
                (2, datetime(2012, 2, 2, 2, 2, 2)),
                (3, None),
                (4, datetime(2100, 3, 3, 3, 3, 3))]
        df = self.spark.createDataFrame(data, schema=schema)

        f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())
        internal_value = pandas_udf(
            lambda ts: ts.apply(lambda ts: ts.value if ts is not pd.NaT else None), LongType())

        timezone = "America/New_York"
        with self.sql_conf({
                "spark.sql.execution.pandas.respectSessionTimeZone": False,
                "spark.sql.session.timeZone": timezone}):
            df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \
                .withColumn("internal_value", internal_value(col("timestamp")))
            result_la = df_la.select(col("idx"), col("internal_value")).collect()
            # Correct result_la by adjusting for the 3-hour difference between Los Angeles and New York
            diff = 3 * 60 * 60 * 1000 * 1000 * 1000
            result_la_corrected = \
                df_la.select(col("idx"), col("tscopy"), col("internal_value") + diff).collect()

        with self.sql_conf({
                "spark.sql.execution.pandas.respectSessionTimeZone": True,
                "spark.sql.session.timeZone": timezone}):
            df_ny = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \
                .withColumn("internal_value", internal_value(col("timestamp")))
            result_ny = df_ny.select(col("idx"), col("tscopy"), col("internal_value")).collect()

            self.assertNotEqual(result_ny, result_la)
            self.assertEqual(result_ny, result_la_corrected)
    def reduce_block_matrix(self, response: str) -> DataFrame:
        """
        Transforms a starting reduced block matrix by applying a linear model.  The form of the output
        can either be a direct linear transformation (response = "linear") or a linear transformation followed by a
        sigmoid transformation (response = "sigmoid").

        Args:
            response : String specifying what transformation to apply ("linear" or "sigmoid")

        Returns:
            Spark DataFrame containing the result of the transformation.
        """

        transform_key_pattern = ['sample_block', 'label']

        if response == 'linear':
            warnings.warn('Ignoring any covariates for linear response')
            transform_udf = pandas_udf(
                lambda key, pdf: apply_model(
                    key, transform_key_pattern, pdf, self._label_df, self.
                    sample_blocks, self._alphas, pd.DataFrame({})),
                reduced_matrix_struct, PandasUDFType.GROUPED_MAP)
            join_type = 'inner'
        elif response == 'sigmoid':
            transform_udf = pandas_udf(
                lambda key, pdf: apply_logistic_model(
                    key, transform_key_pattern, pdf, self._label_df, self.
                    sample_blocks, self._alphas, self._std_cov_df),
                logistic_reduced_matrix_struct, PandasUDFType.GROUPED_MAP)
            join_type = 'right'
        else:
            raise ValueError(
                f'response must be either "linear" or "sigmoid", received "{response}"'
            )

        return apply_model_df(self.reduced_block_df, self.model_df, self.cv_df,
                              transform_udf, transform_key_pattern, join_type)
Example #47
def generate_udf(spec: "rikai.spark.sql.codegen.base.ModelSpec"):
    """Construct a UDF to run pytorch model.

    Parameters
    ----------
    spec : ModelSpec
        the model specifications object

    Returns
    -------
    A Spark Pandas UDF.
    """
    use_gpu = spec.options.get("device", "cpu") == "gpu"
    num_workers = int(
        spec.options.get("num_workers", min(os.cpu_count(),
                                            DEFAULT_NUM_WORKERS)))
    batch_size = int(spec.options.get("batch_size", DEFAULT_BATCH_SIZE))

    schema = spec.schema
    should_return_df = isinstance(schema, StructType)
    return_type = (Iterator[pd.DataFrame]
                   if should_return_df else Iterator[pd.Series])

    def torch_inference_udf(iter: Iterator[pd.DataFrame], ) -> return_type:
        device = torch.device("cuda" if use_gpu else "cpu")
        model = spec.load_model()
        model.to(device)
        model.eval()

        with torch.no_grad():
            for series in iter:
                dataset = PandasDataset(series, transform=spec.pre_processing)
                results = []
                for batch in DataLoader(
                        dataset,
                        batch_size=batch_size,
                        num_workers=num_workers,
                ):
                    batch = batch.to(device)
                    predictions = model(batch)
                    if spec.post_processing:
                        predictions = spec.post_processing(predictions)
                    results.extend(predictions)
                if should_return_df:
                    yield pd.DataFrame(results)
                else:
                    yield pd.Series(results)

    return pandas_udf(torch_inference_udf, returnType=schema)
Example #48
    def test_unsupported_types(self):
        from pyspark.sql.types import DoubleType, MapType
        from pyspark.sql.functions import pandas_udf, PandasUDFType

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(NotImplementedError, 'not supported'):
                pandas_udf(lambda x: x, ArrayType(ArrayType(TimestampType())),
                           PandasUDFType.GROUPED_AGG)

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(NotImplementedError, 'not supported'):

                @pandas_udf('mean double, std double',
                            PandasUDFType.GROUPED_AGG)
                def mean_and_std_udf(v):
                    return v.mean(), v.std()

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(NotImplementedError, 'not supported'):

                @pandas_udf(MapType(DoubleType(), DoubleType()),
                            PandasUDFType.GROUPED_AGG)
                def mean_and_std_udf(v):
                    return {v.mean(): v.std()}
    def test_wrong_args(self):
        df = self.data

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(lambda x: x)
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(udf(lambda x: x, DoubleType()))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(sum(df.v))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(df.v + 1)
            with self.assertRaisesRegexp(ValueError, 'Invalid function'):
                df.groupby('id').apply(
                    pandas_udf(lambda: 1,
                               StructType([StructField("d", DoubleType())])))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(pandas_udf(lambda x, y: x,
                                                  DoubleType()))
            with self.assertRaisesRegexp(ValueError,
                                         'Invalid udf.*GROUPED_MAP'):
                df.groupby('id').apply(
                    pandas_udf(lambda x, y: x, DoubleType(),
                               PandasUDFType.SCALAR))
    def pandas_udf(self):
        function = self.getFunction()
        returnType = self.getReturnType()
        functionType = self.functionType

        # f = pickle.loads(codecs.decode(function.encode(), "base64"))
        f = self.decode_function(function)

        if not callable(f):
            raise ValueError("Decoded function parameter is not callable.")

        return pandas_udf(f=f,
                          returnType=returnType,
                          functionType=functionType,
                          )
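
decode_function is not shown in this snippet; the commented-out line above suggests it simply reverses a base64-encoded pickle. A plausible sketch under that assumption (not the original implementation):

# Plausible sketch of decode_function, inferred from the commented-out line above.
import codecs
import pickle

def decode_function(self, function):
    # Undo the base64 encoding, then unpickle back into a Python callable.
    return pickle.loads(codecs.decode(function.encode(), "base64"))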
Example #51
    def fit(self) -> DataFrame:
        """
        Fits a ridge reducer model, represented by a Spark DataFrame containing coefficients for each of the ridge
        alpha parameters, for each block in the starting matrix, for each label in the target labels.

        Returns:
            Spark DataFrame containing the model resulting from the fitting routine.
        """

        map_key_pattern = ['header_block', 'sample_block']
        reduce_key_pattern = ['header_block', 'header']

        if 'label' in self.block_df.columns:
            map_key_pattern.append('label')
            reduce_key_pattern.append('label')

        map_udf = pandas_udf(
            lambda key, pdf:
            map_normal_eqn(key, map_key_pattern, pdf, self._std_label_df, self.
                           sample_blocks, self._std_cov_df), normal_eqn_struct,
            PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(
                key, map_key_pattern, pdf, self._std_label_df, self._alphas,
                self._std_cov_df), model_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceFit')

        self.model_df = self.block_df.groupBy(map_key_pattern).apply(
            map_udf).groupBy(reduce_key_pattern).apply(reduce_udf).groupBy(
                map_key_pattern).apply(model_udf)

        return self.model_df
Example #52
    def test_vectorized_udf_struct_type(self):
        import pandas as pd

        df = self.spark.range(10)
        return_type = StructType(
            [StructField('id', LongType()),
             StructField('str', StringType())])

        def func(id):
            return pd.DataFrame({'id': id, 'str': id.apply(unicode)})

        f = pandas_udf(func, returnType=return_type)

        expected = df.select(
            struct(col('id'),
                   col('id').cast('string').alias('str')).alias(
                       'struct')).collect()

        actual = df.select(f(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)

        g = pandas_udf(func, 'id: long, str: string')
        actual = df.select(g(col('id')).alias('struct')).collect()
        self.assertEqual(expected, actual)
Example #53
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    shares = (
        spark_session().range(n_rows * n_data).select(
            (F.col("id") % n_rows).alias("row_id"),
            F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
        ).groupBy("row_id").agg(
            F.collect_list("payload").alias("payload")).select(
                F.pandas_udf(
                    partial(
                        udf.encode,
                        batch_id,
                        n_data,
                        public_key_hex_internal,
                        public_key_hex_external,
                    ),
                    returnType="a: binary, b: binary",
                )("payload").alias("shares"))
        # repeat this data `scale` times
        .withColumn("_repeat", F.explode(F.array_repeat(
            F.lit(0), scale))).drop("_repeat").withColumn(
                "id",
                F.udf(lambda: str(uuid4()), returnType="string")()))
    # we can make an estimate with just a single row, since the configuration
    # is the same here.
    row = shares.first()
    dataset_estimate_mb = ((len(b64encode(row.shares.a)) + len(str(uuid4()))) *
                           n_rows * scale * 1.0 / 10**6)
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # try to be efficient without caching by repartitioning
    repartitioned = (shares.withColumn(
        "shares",
        F.map_from_arrays(F.array(F.lit("a"), F.lit("b")),
                          F.array("shares.a", "shares.b")),
    ).repartitionByRange(num_partitions, "id").select(
        "id",
        F.explode("shares").alias("server_id", "payload")))
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
Example #54
    def pandas_udf_func(attr=None, func=None):
        # TODO: Get the column type, so it is not necessary to pass the return type as a param.

        # Apply the function over the whole series
        def apply_to_series(val, attr):
            if attr is None:
                attr = (None,)
            else:
                attr = (attr,)

            return val.apply(func, args=attr)

        def to_serie(value):
            return apply_to_series(value, attr)

        return F.pandas_udf(to_serie, func_return_type)
Example #55
 def test_type_annotation(self):
     # Regression test to check if type hints can be used. See SPARK-23569.
     # Note that it throws an error during compilation in lower Python versions if 'exec'
     # is not used. Also, note that we explicitly use another dictionary to avoid modifications
     # in the current 'locals()'.
     #
     # Hyukjin: I think it's an ugly way to test issues about syntax specific in
     # higher versions of Python, which we shouldn't encourage. This was the last resort
     # I could come up with at that time.
     _locals = {}
     exec(
         "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col",
         _locals)
     df = self.spark.range(1).select(
         pandas_udf(f=_locals['noop'], returnType='bigint')('id'))
     self.assertEqual(df.first()[0], 0)
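
For readability, the exec-ed string above is equivalent to the following plain definition (the exec indirection exists only so the module still parses on older Python versions that reject the annotation syntax):

import pandas as pd

def noop(col: pd.Series) -> pd.Series:
    return col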
    def test_array_type_correct(self):
        df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id")

        output_schema = StructType([
            StructField('id', LongType()),
            StructField('v', IntegerType()),
            StructField('arr', ArrayType(LongType()))
        ])

        udf = pandas_udf(lambda pdf: pdf, output_schema,
                         PandasUDFType.GROUPED_MAP)

        result = df.groupby('id').apply(udf).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(
            udf.func).reset_index(drop=True)
        self.assertPandasEqual(expected, result)
Example #57
    def test_vectorized_udf_dates(self):
        schema = StructType().add("idx", LongType()).add("date", DateType())
        data = [(
            0,
            date(1969, 1, 1),
        ), (
            1,
            date(2012, 2, 2),
        ), (
            2,
            None,
        ), (
            3,
            date(2100, 4, 4),
        ), (
            4,
            date(2262, 4, 12),
        )]
        df = self.spark.createDataFrame(data, schema=schema)

        date_copy = pandas_udf(lambda t: t, returnType=DateType())
        df = df.withColumn("date_copy", date_copy(col("date")))

        @pandas_udf(returnType=StringType())
        def check_data(idx, date, date_copy):
            import pandas as pd
            msgs = []
            is_equal = date.isnull()
            for i in range(len(idx)):
                if (is_equal[i] and data[idx[i]][1] is None) or \
                        date[i] == data[idx[i]][1]:
                    msgs.append(None)
                else:
                    msgs.append(
                        "date values are not equal (date='%s': data[%d][1]='%s')"
                        % (date[i], idx[i], data[idx[i]][1]))
            return pd.Series(msgs)

        result = df.withColumn(
            "check_data", check_data(col("idx"), col("date"),
                                     col("date_copy"))).collect()

        self.assertEquals(len(data), len(result))
        for i in range(len(result)):
            self.assertEquals(data[i][1], result[i][1])  # "date" col
            self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
            self.assertIsNone(result[i][3])  # "check_data" col
Example #58
def verify2(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Verify a batch of SNIPs"""
    click.echo("Running verify2")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)
    df = (shares.select("id",
                        F.unbase64("payload").alias("shares")).join(
                            internal.select(
                                "id",
                                F.unbase64("payload").alias("internal")),
                            on="id").join(external.select(
                                "id",
                                F.unbase64("payload").alias("external")),
                                          on="id").select(
                                              "id",
                                              F.pandas_udf(
                                                  partial(
                                                      udf.verify2,
                                                      batch_id,
                                                      n_data,
                                                      server_id,
                                                      private_key_hex,
                                                      b64decode(shared_secret),
                                                      public_key_hex_internal,
                                                      public_key_hex_external,
                                                  ),
                                                  returnType="binary",
                                              )("shares", "internal",
                                                "external").alias("payload"),
                                          ))
    valid = df.where("payload is not null")
    valid.write.json(output, mode="overwrite")
Example #59
    def transform(
        self,
        blockdf: DataFrame,
        labeldf: pd.DataFrame,
        sample_blocks: Dict[str, List[str]],
        modeldf: DataFrame,
        covdf: pd.DataFrame = pd.DataFrame({})) -> DataFrame:
        """
        Transforms a starting block matrix to the reduced block matrix, using a reducer model produced by the
        RidgeReducer fit method.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks: Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            modeldf : Spark DataFrame produced by the RidgeReducer fit method, representing the reducer model
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
             Spark DataFrame representing the reduced block matrix
        """

        validate_inputs(labeldf, covdf)
        transform_key_pattern = ['header_block', 'sample_block']

        if 'label' in blockdf.columns:
            transform_key_pattern.append('label')
            joined = blockdf.drop('sort_key') \
                .join(modeldf, ['header_block', 'sample_block', 'header'], 'right') \
                .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0)))
        else:
            joined = blockdf.drop('sort_key') \
                .join(modeldf, ['header_block', 'sample_block', 'header'], 'right')

        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(key, transform_key_pattern, pdf,
                                         labeldf, sample_blocks, self.alphas,
                                         covdf), reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceTransform')

        return joined \
            .groupBy(transform_key_pattern) \
            .apply(transform_udf)
Example #60
    def transform(self,
                  blockdf: DataFrame,
                  labeldf: pd.DataFrame,
                  sample_blocks: Dict[str, List[str]],
                  modeldf: DataFrame,
                  cvdf: DataFrame,
                  covdf: pd.DataFrame = pd.DataFrame({})) -> pd.DataFrame:
        """
        Generates predictions for the target labels in the provided label DataFrame by applying the model resulting from
        the RidgeRegression fit method to the starting block matrix.

        Args:
            blockdf : Spark DataFrame representing the beginning block matrix X
            labeldf : Pandas DataFrame containing the target labels used in fitting the ridge models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of corresponding sample IDs
            modeldf : Spark DataFrame produced by the RidgeRegression fit method, representing the reducer model
            cvdf : Spark DataFrame produced by the RidgeRegression fit method, containing the results of the cross
            validation routine.
            covdf : Pandas DataFrame containing covariates to be included in every model in the stacking
                ensemble (optional).

        Returns:
            Pandas DataFrame containing prediction y_hat values. The shape and order match labeldf such that the
            rows are indexed by sample ID and the columns by label. The column types are float64.
        """

        validate_inputs(labeldf, covdf)
        transform_key_pattern = ['sample_block', 'label']

        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(key, transform_key_pattern, pdf, labeldf, sample_blocks,
                                         self.alphas, covdf), reduced_matrix_struct,
            PandasUDFType.GROUPED_MAP)

        blocked_prediction_df = blockdf.drop('header_block', 'sort_key') \
            .join(modeldf.drop('header_block'), ['sample_block', 'header'], 'right') \
            .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0))) \
            .groupBy(transform_key_pattern) \
            .apply(transform_udf) \
            .join(cvdf, ['label', 'alpha'], 'inner')

        pivoted_df = flatten_prediction_df(blocked_prediction_df, sample_blocks, labeldf)

        record_hls_event('wgrRidgeRegressionTransform')

        return pivoted_df