Example #1
    def test_addColumnsForCycle_udf(self):
        from ts.flint import udf
        from pyspark.sql.types import DoubleType
        from collections import OrderedDict

        price2 = self.price2()

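        # Dict form: one new column per key. The UDF receives each cycle's
        # 'price' values (rows sharing a timestamp) as a pandas Series and
        # returns a Series of ranks within that cycle.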
        result1 = price2.addColumnsForCycle({
            'rank': udf(lambda v: v.rank(), DoubleType())(price2['price'])
        }).toPandas()

        expected1 = make_pdf([
            (0, 1, 1.0, 1.0),
            (0, 2, 2.0, 2.0),
            (1, 1, 3.0, 1.0),
            (1, 2, 4.0, 2.0),
            (1, 3, 5.0, 3.0),
        ], ['time', 'id', 'price', 'rank'])
        assert_same(result1, expected1)

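        # OrderedDict fixes the order of the added columns: 'rank' first, then 'pct_rank'.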
        result2 = price2.addColumnsForCycle(OrderedDict([
            ('rank', udf(lambda v: v.rank(), DoubleType())(price2['price'])),
            ('pct_rank', udf(lambda v: v.rank(pct=True), DoubleType())(price2['price']))
        ])).toPandas()

        expected2 = make_pdf([
            (0, 1, 1.0, 1.0, 0.5),
            (0, 2, 2.0, 2.0, 1.0),
            (1, 1, 3.0, 1.0, 0.333333),
            (1, 2, 4.0, 2.0, 0.666667),
            (1, 3, 5.0, 3.0, 1.0),
        ], ['time', 'id', 'price', 'rank', 'pct_rank'])

        pdt.assert_frame_equal(result2, expected2)

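        # A tuple of return types declares a UDF that produces two columns at
        # once; the dict key below is the matching tuple of column names.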
        @udf((DoubleType(), DoubleType()))
        def rank(v):
            return v.rank(), v.rank(pct=True)

        result3 = price2.addColumnsForCycle({
            ('rank', 'pct_rank'): rank(price2['price']),
        }).toPandas()
        expected3 = expected2
        pdt.assert_frame_equal(result3, expected3)
Example #2
    def test_udf(self):
        from ts.flint import udf
        import pyspark.sql.functions as F
        from pyspark.sql.types import LongType

        vol = self.vol()

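        # Row-wise UDF with a declared LongType result; the second argument
        # (the literal 42 below) is accepted but deliberately unused.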
        @udf(LongType())
        def foo(v, w):
            return v * 2

        result1 = vol.withColumn("volume", foo(vol['volume'], F.lit(42))).toPandas()
        result2 = vol.withColumn(
            "volume",
            udf(lambda v, w: v * 2, LongType())(vol['volume'], F.lit(42))).toPandas()

        expected_pdf1 = make_pdf([
            (1000, 7, 200),
            (1000, 3, 400),
            (1050, 3, 600),
            (1050, 7, 800),
            (1100, 3, 1000),
            (1100, 7, 1200),
            (1150, 3, 1400),
            (1150, 7, 1600),
            (1200, 3, 1800),
            (1200, 7, 2000),
            (1250, 3, 2200),
            (1250, 7, 2400),
        ], ['time', 'id', 'volume'])

        assert_same(result1, expected_pdf1)
        assert_same(result2, expected_pdf1)
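These snippets rely on helpers defined elsewhere in the test suite: make_pdf, assert_same, pdt, and fixtures such as self.price2() and self.vol(). A minimal sketch of plausible definitions, inferred from how the tests use them; every name and body here is an assumption, not the suite's actual code:

import pandas as pd
import pandas.testing as pdt  # assumed to be the 'pdt' used by assert_frame_equal above


def make_pdf(rows, columns):
    # Assumed helper: build a pandas DataFrame from row tuples and column names.
    return pd.DataFrame.from_records(rows, columns=columns)


def assert_same(result, expected):
    # Assumed helper: strict equality check between two pandas DataFrames.
    pdt.assert_frame_equal(result, expected)


# Assumed fixtures, reconstructed from the expected frames in these tests:
# self.price2() holds columns ['time', 'id', 'price'] (see expected1 in
# Example #1); self.vol() holds ['time', 'id', 'volume'] with volumes
# 100..1200 at times 1000..1250 (see expected7 in Example #3).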
Example #3
    def test_summarizeWindows_udf(self):
        from ts.flint import udf
        from ts.flint import windows
        from collections import OrderedDict
        from pyspark.sql.types import DoubleType, LongType

        vol = self.vol()
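        # Trailing window: for each row, all rows of the past 99 seconds
        # (including the current timestamp).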
        w = windows.past_absolute_time('99s')

        df = self.flintContext.read.pandas(
            make_pdf([
                (1000, 3, 10.0),
                (1000, 7, 20.0),
                (1050, 3, 30.0),
                (1050, 7, 40.0),
                (1100, 3, 50.0),
                (1150, 3, 60.0),
                (1150, 7, 70.0),
                (1200, 3, 80.0),
                (1200, 7, 90.0),
                (1250, 7, 100.0),
            ], ['time', 'id', 'v']))

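        # The UDF takes (time, window): 'time' is the left row's timestamp and
        # 'window' holds the matching rows of 'vol' (same id) inside the window.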
        result1 = df.summarizeWindows(
            w,
            OrderedDict([
                ('mean', udf(lambda time, window: window.mean(),
                             DoubleType())(df['time'], vol['volume']))
            ]),
            key='id',
            other=vol).toPandas()
        expected1 = make_pdf([
            (1000, 3, 10.0, 200.0),
            (1000, 7, 20.0, 100.0),
            (1050, 3, 30.0, 250.0),
            (1050, 7, 40.0, 250.0),
            (1100, 3, 50.0, 400.0),
            (1150, 3, 60.0, 600.0),
            (1150, 7, 70.0, 700.0),
            (1200, 3, 80.0, 800.0),
            (1200, 7, 90.0, 900.0),
            (1250, 7, 100.0, 1100.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result1, expected1)

        result2 = df.summarizeWindows(
            w,
            OrderedDict([
                ('mean', udf(lambda window: window.mean(),
                             DoubleType())(vol['volume']))
            ]),
            key='id',
            other=vol).toPandas()
        expected2 = expected1
        assert_same(result2, expected2)

        result3 = df.summarizeWindows(
            w,
            OrderedDict([
                ('mean', udf(lambda window: window.mean(),
                             DoubleType())(vol['volume'])),
                ('count', udf(lambda time, window: len(window),
                              LongType())(df['time'], vol['volume']))
            ]),
            key='id',
            other=vol).toPandas()
        expected3 = make_pdf([
            (1000, 3, 10.0, 200.0, 1),
            (1000, 7, 20.0, 100.0, 1),
            (1050, 3, 30.0, 250.0, 2),
            (1050, 7, 40.0, 250.0, 2),
            (1100, 3, 50.0, 400.0, 2),
            (1150, 3, 60.0, 600.0, 2),
            (1150, 7, 70.0, 700.0, 2),
            (1200, 3, 80.0, 800.0, 2),
            (1200, 7, 90.0, 900.0, 2),
            (1250, 7, 100.0, 1100.0, 2),
        ], ['time', 'id', 'v', 'mean', 'count'])
        assert_same(result3, expected3)

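        # The return type may also be spelled as a type string ('double').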
        @udf('double')
        def window_udf(time, window):
            return (time - window.time).mean().seconds + window.volume.mean()

        result4 = df.summarizeWindows(
            w,
            OrderedDict([
                ('mean', window_udf(df['time'], vol[['time', 'volume']])),
            ]),
            key='id',
            other=vol).toPandas()

        expected4 = make_pdf([
            (1000, 3, 10.0, 200.0),
            (1000, 7, 20.0, 100.0),
            (1050, 3, 30.0, 275.0),
            (1050, 7, 40.0, 275.0),
            (1100, 3, 50.0, 425.0),
            (1150, 3, 60.0, 625.0),
            (1150, 7, 70.0, 725.0),
            (1200, 3, 80.0, 825.0),
            (1200, 7, 90.0, 925.0),
            (1250, 7, 100.0, 1125.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result4, expected4)

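        # Selecting two columns makes the first UDF argument a whole row;
        # row[0] is its timestamp.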
        @udf(DoubleType())
        def foo5(row, window):
            return (row[0] - window.time).mean().seconds + window.volume.mean()

        result5 = df.summarizeWindows(
            w,
            OrderedDict([
                ('mean', foo5(df[['time', 'v']], vol[['time', 'volume']])),
            ]),
            key='id',
            other=vol).toPandas()
        expected5 = expected4
        assert_same(result5, expected5)

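        # A (DoubleType, LongType) return yields the 'mean' and 'count'
        # columns in a single pass.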
        @udf((DoubleType(), LongType()))
        def mean_and_count(v):
            return v.mean(), len(v)

        result6 = df.summarizeWindows(
            w,
            OrderedDict([
                (('mean', 'count'), mean_and_count(vol['volume']))
            ]),
            key='id',
            other=vol).toPandas()
        expected6 = expected3
        assert_same(result6, expected6)

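        # Windowing vol over itself: with key='id' each id gets its own
        # window (result7); without a key all rows in the window are pooled (result8).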
        @udf(DoubleType())
        def mean(v):
            return v.mean()

        result7 = vol.summarizeWindows(w, {
            'mean': mean(vol['volume'])
        }, key='id').toPandas()
        expected7 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result7, expected7)

        result8 = vol.summarizeWindows(w, {
            'mean': mean(vol['volume'])
        }).toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result8, expected8)
Example #4
    def test_summarizeIntervals_udf(self):
        from ts.flint.functions import udf
        from pyspark.sql.types import LongType, DoubleType
        import numpy as np  # needed by the weighted-mean UDF below

        vol = self.vol()

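        # The clock rows mark interval boundaries; by default each interval is
        # [begin, end) and is stamped with its end time.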
        clock = self.flintContext.read.pandas(
            make_pdf([
                (1000, ),
                (1100, ),
                (1200, ),
                (1300, ),
            ], ["time"]))

        result1 = vol.summarizeIntervals(
            clock, {
                'sum': udf(lambda v: v.sum(), LongType())(vol.volume)
            }).toPandas()
        expected1 = make_pdf([
            (1100, 1000),
            (1200, 2600),
            (1300, 4200),
        ], ["time", "sum"])
        assert_same(result1, expected1)

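        # key='id' aggregates each interval separately for every id.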
        result2 = vol.summarizeIntervals(
            clock, {
                'sum': udf(lambda v: v.sum(), LongType())(vol.volume)
            },
            key="id").toPandas()
        expected2 = make_pdf([
            (1100, 7, 500),
            (1100, 3, 500),
            (1200, 3, 1200),
            (1200, 7, 1400),
            (1300, 3, 2000),
            (1300, 7, 2200),
        ], ["time", "id", "sum"])
        assert_same(result2, expected2)

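        # rounding='begin' stamps each interval with its start time instead.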
        result3 = vol.summarizeIntervals(
            clock, {
                'sum': udf(lambda v: v.sum(), LongType())(vol.volume)
            },
            rounding="begin").toPandas()
        expected3 = make_pdf([
            (1000, 1000),
            (1100, 2600),
            (1200, 4200),
        ], ["time", "sum"])
        assert_same(result3, expected3)

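        # inclusion='end' flips the intervals to (begin, end]; the rows at
        # time 1100 now land in the first interval.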
        result4 = vol.summarizeIntervals(
            clock, {
                'sum': udf(lambda v: v.sum(), LongType())(vol.volume)
            },
            inclusion="end").toPandas()
        expected4 = make_pdf([
            (1100, 1800),
            (1200, 3400),
            (1300, 2300),
        ], ["time", "sum"])
        assert_same(result4, expected4)

        result5 = vol.summarizeIntervals(
            clock, {
                'mean': udf(lambda v: v.mean(), DoubleType())(vol.volume),
                'sum': udf(lambda v: v.sum(), LongType())(vol.volume)
            }).toPandas()
        expected5 = make_pdf([
            (1100, 250.0, 1000),
            (1200, 650.0, 2600),
            (1300, 1050.0, 4200),
        ], ["time", "mean", "sum"])
        assert_same(result5, expected5)

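        # One UDF returning (mean, sum) fills both columns per interval.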
        @udf((DoubleType(), LongType()))
        def mean_and_sum(v):
            return v.mean(), v.sum()

        result6 = vol.summarizeIntervals(
            clock, {
                ('mean', 'sum'): mean_and_sum(vol.volume)
            }).toPandas()
        expected6 = make_pdf([
            (1100, 250.0, 1000),
            (1200, 650.0, 2600),
            (1300, 1050.0, 4200),
        ], ["time", "mean", "sum"])
        assert_same(result6, expected6)

        from pyspark.sql.functions import lit

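        # Add a constant weight column, then aggregate with a UDF that takes a
        # two-column DataFrame; with w == 1.0 the weighted mean equals the plain mean.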
        vol_with_weights = vol.withColumn('w', lit(1.0))
        result7 = vol_with_weights.summarizeIntervals(
            clock,
            {'weighted_mean': udf(lambda df: np.average(df.volume, weights=df.w),
                                  DoubleType())(vol_with_weights[['volume', 'w']])}
        ).toPandas()
        expected7 = make_pdf([
            (1100, 250.0),
            (1200, 650.0),
            (1300, 1050.0),
        ], ["time", "weighted_mean"])
        assert_same(result7, expected7)
Example #5
    def test_summarizeCycles_udf(self):
        from ts.flint import udf
        from pyspark.sql.types import DoubleType, LongType
        from collections import OrderedDict
        import pyspark.sql.functions as F
        import numpy as np  # needed by the weighted UDFs below

        vol = self.vol()
        weighted_vol = vol.withColumn('weight', F.lit(1))

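        # summarizeCycles aggregates all rows sharing a timestamp (one 'cycle')
        # into a single row; results 1-8 compute the same mean/sum in different styles.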
        result1 = vol.summarizeCycles(
            OrderedDict([
                ('mean', udf(lambda v: v.mean(), DoubleType())(weighted_vol.volume)),
                ('sum', udf(lambda v: v.sum(), LongType())(weighted_vol.volume)),
            ])).toPandas()

        result2 = vol.summarizeCycles(
            OrderedDict([
                ('mean', udf(lambda df: df.volume.mean(), DoubleType())(weighted_vol[['volume']])),
                ('sum', udf(lambda df: df.volume.sum(), LongType())(weighted_vol[['volume']])),
            ])).toPandas()

        result3 = weighted_vol.summarizeCycles(
            OrderedDict([
                ('mean', udf(lambda v, w: np.average(v, weights=w),
                             DoubleType())(weighted_vol['volume'], weighted_vol['weight'])),
                ('sum', udf(lambda v, w: (v * w).sum(),
                            LongType())(weighted_vol['volume'], weighted_vol['weight'])),
            ])).toPandas()

        result4 = weighted_vol.summarizeCycles(
            OrderedDict([
                ('mean', udf(lambda df: np.average(df.volume, weights=df.weight),
                             DoubleType())(weighted_vol[['volume', 'weight']])),
                ('sum', udf(lambda df: (df.volume * df.weight).sum(),
                            LongType())(weighted_vol[['volume', 'weight']])),
            ])).toPandas()

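        # The same aggregations as decorated UDFs over two Series arguments.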
        @udf(DoubleType())
        def foo(v, w):
            return np.average(v, weights=w)

        @udf(LongType())
        def bar(v, w):
            return (v * w).sum()

        result5 = weighted_vol.summarizeCycles(
            OrderedDict([
                ('mean', foo(weighted_vol['volume'], weighted_vol['weight'])),
                ('sum', bar(weighted_vol['volume'], weighted_vol['weight']))
            ])).toPandas()

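        # And once more with a single two-column DataFrame argument.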
        @udf(DoubleType())
        def foo1(df):
            return np.average(df.volume, weights=df.weight)

        @udf(LongType())
        def bar1(df):
            return (df.volume * df.weight).sum()

        result6 = weighted_vol.summarizeCycles(
            OrderedDict([
                ('mean', foo1(weighted_vol[['volume', 'weight']])),
                ('sum', bar1(weighted_vol[['volume', 'weight']]))
            ])).toPandas()

        expected_pdf1 = make_pdf([
            (1000, 150.0, 300),
            (1050, 350.0, 700),
            (1100, 550.0, 1100),
            (1150, 750.0, 1500),
            (1200, 950.0, 1900),
            (1250, 1150.0, 2300),
        ], ["time", "mean", "sum"])

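        # A tuple-typed UDF computes both aggregates in one pass over each cycle.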
        @udf((DoubleType(), LongType()))
        def foobar(v, w):
            foo = np.average(v, weights=w)
            bar = (v * w).sum()
            return (foo, bar)

        result7 = weighted_vol.summarizeCycles({
            ('mean', 'sum'): foobar(weighted_vol['volume'], weighted_vol['weight'])
        }).toPandas()

        result8 = weighted_vol.summarizeCycles({
            "mean": udf(lambda v: v.mean(), DoubleType())(weighted_vol['volume']),
            "sum": udf(lambda v: v.sum(), LongType())(weighted_vol['volume'])
        }).toPandas()

        assert_same(result1, expected_pdf1)
        assert_same(result2, expected_pdf1)
        assert_same(result3, expected_pdf1)
        assert_same(result4, expected_pdf1)
        assert_same(result5, expected_pdf1)
        assert_same(result6, expected_pdf1)
        assert_same(result7, expected_pdf1)
        assert_same(result8, expected_pdf1)