def test_addColumnsForCycle_udf(self):
    from ts.flint import udf
    from pyspark.sql.types import DoubleType
    from collections import OrderedDict

    price2 = self.price2()

    # Single UDF column passed as a plain dict.
    result1 = price2.addColumnsForCycle({
        'rank': udf(lambda v: v.rank(), DoubleType())(price2['price'])
    }).toPandas()
    expected1 = make_pdf([
        (0, 1, 1.0, 1.0),
        (0, 2, 2.0, 2.0),
        (1, 1, 3.0, 1.0),
        (1, 2, 4.0, 2.0),
        (1, 3, 5.0, 3.0),
    ], ['time', 'id', 'price', 'rank'])
    assert_same(result1, expected1)

    # Multiple UDF columns passed as an OrderedDict to fix the column order.
    result2 = price2.addColumnsForCycle(OrderedDict([
        ('rank', udf(lambda v: v.rank(), DoubleType())(price2['price'])),
        ('pct_rank', udf(lambda v: v.rank(pct=True), DoubleType())(price2['price']))
    ])).toPandas()
    expected2 = make_pdf([
        (0, 1, 1.0, 1.0, 0.5),
        (0, 2, 2.0, 2.0, 1.0),
        (1, 1, 3.0, 1.0, 0.333333),
        (1, 2, 4.0, 2.0, 0.666667),
        (1, 3, 5.0, 3.0, 1.0),
    ], ['time', 'id', 'price', 'rank', 'pct_rank'])
    pdt.assert_frame_equal(result2, expected2)

    # One UDF returning two values, keyed by a tuple of column names.
    @udf((DoubleType(), DoubleType()))
    def rank(v):
        return v.rank(), v.rank(pct=True)

    result3 = price2.addColumnsForCycle({
        ('rank', 'pct_rank'): rank(price2['price']),
    }).toPandas()
    expected3 = expected2
    pdt.assert_frame_equal(result3, expected3)
def test_udf(self):
    from ts.flint import udf
    import pyspark.sql.functions as F
    from pyspark.sql.types import LongType

    vol = self.vol()

    # Decorator form of a flint UDF.
    @udf(LongType())
    def foo(v, w):
        return v * 2

    result1 = vol.withColumn("volume", foo(vol['volume'], F.lit(42))).toPandas()
    # Inline (lambda) form; should give the same result as the decorator above.
    result2 = vol.withColumn(
        "volume",
        udf(lambda v, w: v * 2, LongType())(vol['volume'], F.lit(42))).toPandas()

    expected_pdf1 = make_pdf([
        (1000, 7, 200,),
        (1000, 3, 400,),
        (1050, 3, 600,),
        (1050, 7, 800,),
        (1100, 3, 1000,),
        (1100, 7, 1200,),
        (1150, 3, 1400,),
        (1150, 7, 1600,),
        (1200, 3, 1800,),
        (1200, 7, 2000,),
        (1250, 3, 2200,),
        (1250, 7, 2400,)
    ], ['time', 'id', 'volume'])

    assert_same(result1, expected_pdf1)
    assert_same(result2, expected_pdf1)
def test_summarizeWindows_udf(self):
    from ts.flint import udf
    from ts.flint import windows
    from collections import OrderedDict
    from pyspark.sql.types import DoubleType, LongType

    vol = self.vol()
    w = windows.past_absolute_time('99s')

    df = self.flintContext.read.pandas(make_pdf([
        (1000, 3, 10.0),
        (1000, 7, 20.0),
        (1050, 3, 30.0),
        (1050, 7, 40.0),
        (1100, 3, 50.0),
        (1150, 3, 60.0),
        (1150, 7, 70.0),
        (1200, 3, 80.0),
        (1200, 7, 90.0),
        (1250, 7, 100.0),
    ], ['time', 'id', 'v']))

    # UDF taking the left row's time and the windowed column of `other`.
    result1 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda time, window: window.mean(), DoubleType())(df['time'], vol['volume']))
        ]),
        key="id",
        other=vol).toPandas()
    expected1 = make_pdf([
        (1000, 3, 10.0, 200.0),
        (1000, 7, 20.0, 100.0),
        (1050, 3, 30.0, 250.0),
        (1050, 7, 40.0, 250.0),
        (1100, 3, 50.0, 400.0),
        (1150, 3, 60.0, 600.0),
        (1150, 7, 70.0, 700.0),
        (1200, 3, 80.0, 800.0),
        (1200, 7, 90.0, 900.0),
        (1250, 7, 100.0, 1100.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result1, expected1)

    # UDF over the windowed column only.
    result2 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda window: window.mean(), DoubleType())(vol['volume']))
        ]),
        key='id',
        other=vol).toPandas()
    expected2 = expected1
    assert_same(result2, expected2)

    # Two UDF columns with different argument shapes.
    result3 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda window: window.mean(), DoubleType())(vol['volume'])),
            ('count', udf(lambda time, window: len(window), LongType())(df['time'], vol['volume']))
        ]),
        key='id',
        other=vol).toPandas()
    expected3 = make_pdf([
        (1000, 3, 10.0, 200.0, 1),
        (1000, 7, 20.0, 100.0, 1),
        (1050, 3, 30.0, 250.0, 2),
        (1050, 7, 40.0, 250.0, 2),
        (1100, 3, 50.0, 400.0, 2),
        (1150, 3, 60.0, 600.0, 2),
        (1150, 7, 70.0, 700.0, 2),
        (1200, 3, 80.0, 800.0, 2),
        (1200, 7, 90.0, 900.0, 2),
        (1250, 7, 100.0, 1100.0, 2),
    ], ['time', 'id', 'v', 'mean', 'count'])
    assert_same(result3, expected3)

    # UDF taking the left time and a multi-column window DataFrame.
    @udf('double')
    def window_udf(time, window):
        return (time - window.time).mean().seconds + window.volume.mean()

    result4 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', window_udf(df['time'], vol[['time', 'volume']])),
        ]),
        key='id',
        other=vol).toPandas()
    expected4 = make_pdf([
        (1000, 3, 10.0, 200.0),
        (1000, 7, 20.0, 100.0),
        (1050, 3, 30.0, 275.0),
        (1050, 7, 40.0, 275.0),
        (1100, 3, 50.0, 425.0),
        (1150, 3, 60.0, 625.0),
        (1150, 7, 70.0, 725.0),
        (1200, 3, 80.0, 825.0),
        (1200, 7, 90.0, 925.0),
        (1250, 7, 100.0, 1125.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result4, expected4)

    # UDF taking a multi-column left row and a multi-column window.
    @udf(DoubleType())
    def foo5(row, window):
        return (row[0] - window.time).mean().seconds + window.volume.mean()

    result5 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', foo5(df[['time', 'v']], vol[['time', 'volume']])),
        ]),
        key='id',
        other=vol).toPandas()
    expected5 = expected4
    assert_same(result5, expected5)

    # One UDF returning two columns, keyed by a tuple of names.
    @udf((DoubleType(), LongType()))
    def mean_and_count(v):
        return v.mean(), len(v)

    result6 = df.summarizeWindows(
        w,
        OrderedDict([
            [('mean', 'count'), mean_and_count(vol['volume'])]
        ]),
        key='id',
        other=vol).toPandas()
    expected6 = expected3
    assert_same(result6, expected6)

    # summarizeWindows on a single table (no `other`), with and without a key.
    @udf(DoubleType())
    def mean(v):
        return v.mean()

    result7 = vol.summarizeWindows(
        w,
        {'mean': mean(vol['volume'])},
        key='id').toPandas()
    expected7 = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 400.0),
        (1100, 7, 600, 500.0),
        (1150, 3, 700, 600.0),
        (1150, 7, 800, 700.0),
        (1200, 3, 900, 800.0),
        (1200, 7, 1000, 900.0),
        (1250, 3, 1100, 1000.0),
        (1250, 7, 1200, 1100.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result7, expected7)

    result8 = vol.summarizeWindows(
        w,
        {'mean': mean(vol['volume'])}).toPandas()
    expected8 = make_pdf([
        (1000, 7, 100, 150.0),
        (1000, 3, 200, 150.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 450.0),
        (1100, 7, 600, 450.0),
        (1150, 3, 700, 650.0),
        (1150, 7, 800, 650.0),
        (1200, 3, 900, 850.0),
        (1200, 7, 1000, 850.0),
        (1250, 3, 1100, 1050.0),
        (1250, 7, 1200, 1050.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result8, expected8)
def test_summarizeIntervals_udf(self):
    from ts.flint.functions import udf
    from pyspark.sql.types import LongType, DoubleType

    vol = self.vol()
    clock = self.flintContext.read.pandas(make_pdf([
        (1000,),
        (1100,),
        (1200,),
        (1300,),
    ], ["time"]))

    # Single UDF column with default rounding and inclusion.
    result1 = vol.summarizeIntervals(
        clock,
        {'sum': udf(lambda v: v.sum(), LongType())(vol.volume)}
    ).toPandas()
    expected1 = make_pdf([
        (1100, 1000),
        (1200, 2600),
        (1300, 4200),
    ], ["time", "sum"])
    assert_same(result1, expected1)

    # Grouped by key.
    result2 = vol.summarizeIntervals(
        clock,
        {'sum': udf(lambda v: v.sum(), LongType())(vol.volume)},
        key="id"
    ).toPandas()
    expected2 = make_pdf([
        (1100, 7, 500),
        (1100, 3, 500),
        (1200, 3, 1200),
        (1200, 7, 1400),
        (1300, 3, 2000),
        (1300, 7, 2200),
    ], ["time", "id", "sum"])
    assert_same(result2, expected2)

    # Label each interval with its begin time.
    result3 = vol.summarizeIntervals(
        clock,
        {'sum': udf(lambda v: v.sum(), LongType())(vol.volume)},
        rounding="begin"
    ).toPandas()
    expected3 = make_pdf([
        (1000, 1000),
        (1100, 2600),
        (1200, 4200),
    ], ["time", "sum"])
    assert_same(result3, expected3)

    # Intervals include their end boundary instead of their begin.
    result4 = vol.summarizeIntervals(
        clock,
        {'sum': udf(lambda v: v.sum(), LongType())(vol.volume)},
        inclusion="end"
    ).toPandas()
    expected4 = make_pdf([
        (1100, 1800),
        (1200, 3400),
        (1300, 2300),
    ], ["time", "sum"])
    assert_same(result4, expected4)

    # Multiple UDF columns.
    result5 = vol.summarizeIntervals(
        clock,
        {'mean': udf(lambda v: v.mean(), DoubleType())(vol.volume),
         'sum': udf(lambda v: v.sum(), LongType())(vol.volume)}
    ).toPandas()
    expected5 = make_pdf([
        (1100, 250.0, 1000),
        (1200, 650.0, 2600),
        (1300, 1050.0, 4200),
    ], ["time", "mean", "sum"])
    assert_same(result5, expected5)

    # One UDF returning two columns, keyed by a tuple of names.
    @udf((DoubleType(), LongType()))
    def mean_and_sum(v):
        return v.mean(), v.sum()

    result6 = vol.summarizeIntervals(
        clock,
        {('mean', 'sum'): mean_and_sum(vol.volume)}
    ).toPandas()
    expected6 = make_pdf([
        (1100, 250.0, 1000),
        (1200, 650.0, 2600),
        (1300, 1050.0, 4200),
    ], ["time", "mean", "sum"])
    assert_same(result6, expected6)

    # UDF over a multi-column selection.
    from pyspark.sql.functions import lit
    vol_with_weights = vol.withColumn('w', lit(1.0))
    result7 = vol_with_weights.summarizeIntervals(
        clock,
        {'weighted_mean': udf(lambda df: np.average(df.volume, weights=df.w), DoubleType())(
            vol_with_weights[['volume', 'w']])}
    ).toPandas()
    expected7 = make_pdf([
        (1100, 250.0),
        (1200, 650.0),
        (1300, 1050.0),
    ], ["time", "weighted_mean"])
    assert_same(result7, expected7)
def test_summarizeCycles_udf(self):
    from ts.flint import udf
    from pyspark.sql.types import DoubleType, LongType
    from collections import OrderedDict
    import pyspark.sql.functions as F

    vol = self.vol()
    weighted_vol = vol.withColumn('weight', F.lit(1))

    # Series UDFs over a single column.
    result1 = vol.summarizeCycles(
        OrderedDict([
            ('mean', udf(lambda v: v.mean(), DoubleType())(weighted_vol.volume)),
            ('sum', udf(lambda v: v.sum(), LongType())(weighted_vol.volume)),
        ])).toPandas()

    # DataFrame UDFs over a one-column selection.
    result2 = vol.summarizeCycles(
        OrderedDict([
            ('mean', udf(lambda df: df.volume.mean(), DoubleType())(weighted_vol[['volume']])),
            ('sum', udf(lambda df: df.volume.sum(), LongType())(weighted_vol[['volume']])),
        ])).toPandas()

    # UDFs taking two separate columns.
    result3 = weighted_vol.summarizeCycles(
        OrderedDict([
            ('mean', udf(lambda v, w: np.average(v, weights=w), DoubleType())(
                weighted_vol['volume'], weighted_vol['weight'])),
            ('sum', udf(lambda v, w: (v * w).sum(), LongType())(
                weighted_vol['volume'], weighted_vol['weight'])),
        ])).toPandas()

    # DataFrame UDFs over a two-column selection.
    result4 = weighted_vol.summarizeCycles(
        OrderedDict([
            ('mean', udf(lambda df: np.average(df.volume, weights=df.weight), DoubleType())(
                weighted_vol[['volume', 'weight']])),
            ('sum', udf(lambda df: (df.volume * df.weight).sum(), LongType())(
                weighted_vol[['volume', 'weight']])),
        ])).toPandas()

    # Decorator form with two column arguments.
    @udf(DoubleType())
    def foo(v, w):
        return np.average(v, weights=w)

    @udf(LongType())
    def bar(v, w):
        return (v * w).sum()

    result5 = weighted_vol.summarizeCycles(
        OrderedDict([
            ('mean', foo(weighted_vol['volume'], weighted_vol['weight'])),
            ('sum', bar(weighted_vol['volume'], weighted_vol['weight']))
        ])).toPandas()

    # Decorator form with a DataFrame argument.
    @udf(DoubleType())
    def foo1(df):
        return np.average(df.volume, weights=df.weight)

    @udf(LongType())
    def bar1(df):
        return (df.volume * df.weight).sum()

    result6 = weighted_vol.summarizeCycles(
        OrderedDict([
            ('mean', foo1(weighted_vol[['volume', 'weight']])),
            ('sum', bar1(weighted_vol[['volume', 'weight']]))
        ])).toPandas()

    expected_pdf1 = make_pdf([
        (1000, 150.0, 300,),
        (1050, 350.0, 700,),
        (1100, 550.0, 1100,),
        (1150, 750.0, 1500,),
        (1200, 950.0, 1900,),
        (1250, 1150.0, 2300,),
    ], ["time", "mean", "sum"])

    # One UDF returning two columns, keyed by a tuple of names.
    @udf((DoubleType(), LongType()))
    def foobar(v, w):
        foo = np.average(v, weights=w)
        bar = (v * w).sum()
        return (foo, bar)

    result7 = weighted_vol.summarizeCycles({
        ('mean', 'sum'): foobar(weighted_vol['volume'], weighted_vol['weight'])
    }).toPandas()

    # Plain dict spec.
    result8 = weighted_vol.summarizeCycles({
        "mean": udf(lambda v: v.mean(), DoubleType())(weighted_vol['volume']),
        "sum": udf(lambda v: v.sum(), LongType())(weighted_vol['volume'])
    }).toPandas()

    assert_same(result1, expected_pdf1)
    assert_same(result2, expected_pdf1)
    assert_same(result3, expected_pdf1)
    assert_same(result4, expected_pdf1)
    assert_same(result5, expected_pdf1)
    assert_same(result6, expected_pdf1)
    assert_same(result7, expected_pdf1)
    assert_same(result8, expected_pdf1)