Example #1
0
    def test_addWindows(self):
        """Check that ``addWindows`` appends each row's past-50s window.

        The expected ``window_past_50s`` column holds, per row, a list of
        ``Row(time, id, volume)`` values built from ``vol.collect()``.
        """
        from ts.flint import windows
        from pyspark.sql import Row

        vol = self.vol()
        VolRow = Row('time', 'id', 'volume')

        # Named ``rows`` (not ``id``) so the builtin is not shadowed.
        # ``datetime.timestamp()`` replaces ``strftime('%s')``: ``%s`` is a
        # platform-specific extension that is not among Python's documented
        # format codes, while ``timestamp()`` yields the same local-time
        # epoch seconds portably.
        rows = [VolRow(int(r['time'].timestamp()), r['id'], r['volume'])
                for r in vol.collect()]

        expected_pdf = make_pdf([
            (1000, 7, 100, [rows[0], rows[1]]),
            (1000, 3, 200, [rows[0], rows[1]]),
            (1050, 3, 300, [rows[0], rows[1], rows[2], rows[3]]),
            (1050, 7, 400, [rows[0], rows[1], rows[2], rows[3]]),
            (1100, 3, 500, [rows[2], rows[3], rows[4], rows[5]]),
            (1100, 7, 600, [rows[2], rows[3], rows[4], rows[5]]),
            (1150, 3, 700, [rows[4], rows[5], rows[6], rows[7]]),
            (1150, 7, 800, [rows[4], rows[5], rows[6], rows[7]]),
            (1200, 3, 900, [rows[6], rows[7], rows[8], rows[9]]),
            (1200, 7, 1000, [rows[6], rows[7], rows[8], rows[9]]),
            (1250, 3, 1100, [rows[8], rows[9], rows[10], rows[11]]),
            (1250, 7, 1200, [rows[8], rows[9], rows[10], rows[11]]),
        ], ["time", "id", "volume", "window_past_50s"])

        new_pdf = vol.addWindows(windows.past_absolute_time("50s")).toPandas()
        assert_same(new_pdf, expected_pdf)
Example #2
0
    def test_addWindows(self):
        """Verify the ``window_past_50s`` column produced by ``addWindows``.

        Every expected row lists the ``Row(time, id, volume)`` values,
        taken from ``vol.collect()``, that make up its window.
        """
        from ts.flint import windows
        from pyspark.sql import Row

        vol = self.vol()
        VolRow = Row('time', 'id', 'volume')

        # ``strftime('%s')`` is not a documented Python format code and is
        # unavailable on some platforms; ``timestamp()`` gives the same
        # local-time epoch seconds portably.  The list is called ``rows``
        # instead of ``id`` to avoid shadowing the builtin.
        rows = [
            VolRow(int(r['time'].timestamp()), r['id'], r['volume'])
            for r in vol.collect()
        ]

        expected_pdf = make_pdf([
            (1000, 7, 100, [rows[0], rows[1]]),
            (1000, 3, 200, [rows[0], rows[1]]),
            (1050, 3, 300, [rows[0], rows[1], rows[2], rows[3]]),
            (1050, 7, 400, [rows[0], rows[1], rows[2], rows[3]]),
            (1100, 3, 500, [rows[2], rows[3], rows[4], rows[5]]),
            (1100, 7, 600, [rows[2], rows[3], rows[4], rows[5]]),
            (1150, 3, 700, [rows[4], rows[5], rows[6], rows[7]]),
            (1150, 7, 800, [rows[4], rows[5], rows[6], rows[7]]),
            (1200, 3, 900, [rows[6], rows[7], rows[8], rows[9]]),
            (1200, 7, 1000, [rows[6], rows[7], rows[8], rows[9]]),
            (1250, 3, 1100, [rows[8], rows[9], rows[10], rows[11]]),
            (1250, 7, 1200, [rows[8], rows[9], rows[10], rows[11]]),
        ], ["time", "id", "volume", "window_past_50s"])

        new_pdf = vol.addWindows(windows.past_absolute_time("50s")).toPandas()
        assert_same(new_pdf, expected_pdf)
Example #3
0
    def test_summarizeWindows(self):
        """Sum ``volume`` over a past-99ns window, unkeyed and keyed by id."""
        from ts.flint import windows
        from ts.flint import summarizers
        vol = self.vol()

        # Unkeyed: the expected sums combine the volumes of both ids.
        summed = vol.summarizeWindows(
            windows.past_absolute_time('99ns'),
            summarizers.sum("volume"),
        )
        actual_unkeyed = summed.toPandas()
        expected_unkeyed = test_utils.make_pdf([
            (1000, 7, 100, 300.0),
            (1000, 3, 200, 300.0),
            (1050, 3, 300, 1000.0),
            (1050, 7, 400, 1000.0),
            (1100, 3, 500, 1800.0),
            (1100, 7, 600, 1800.0),
            (1150, 3, 700, 2600.0),
            (1150, 7, 800, 2600.0),
            (1200, 3, 900, 3400.0),
            (1200, 7, 1000, 3400.0),
            (1250, 3, 1100, 4200.0),
            (1250, 7, 1200, 4200.0),
        ], ["time", "id", "volume", "volume_sum"])
        test_utils.assert_same(actual_unkeyed, expected_unkeyed)

        # Keyed by "id": each expected sum only covers rows sharing the id.
        summed_by_id = vol.summarizeWindows(
            windows.past_absolute_time('99ns'),
            summarizers.sum("volume"),
            key="id",
        )
        actual_keyed = summed_by_id.toPandas()
        expected_keyed = test_utils.make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 500.0),
            (1050, 7, 400, 500.0),
            (1100, 3, 500, 800.0),
            (1100, 7, 600, 1000.0),
            (1150, 3, 700, 1200.0),
            (1150, 7, 800, 1400.0),
            (1200, 3, 900, 1600.0),
            (1200, 7, 1000, 1800.0),
            (1250, 3, 1100, 2000.0),
            (1250, 7, 1200, 2200.0),
        ], ["time", "id", "volume", "volume_sum"])
        test_utils.assert_same(actual_keyed, expected_keyed)
Example #4
0
    def test_summarizeWindows_udf(self):
        """Apply a mean UDF over a past-99s window, keyed by id and unkeyed."""
        # The unused ``OrderedDict`` and ``LongType`` imports were dropped;
        # nothing in this test references them.
        from ts.flint import udf
        from ts.flint import windows
        from pyspark.sql.types import DoubleType

        vol = self.vol()
        w = windows.past_absolute_time('99s')

        @udf(DoubleType())
        def mean(v):
            return v.mean()

        # Keyed by "id": each expected mean only covers rows sharing the id.
        result7 = vol.summarizeWindows(
            w,
            {'mean': mean(vol['volume'])},
            key='id'
        ).toPandas()
        expected7 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result7, expected7)

        # Unkeyed: the expected means combine the volumes of both ids.
        result8 = vol.summarizeWindows(
            w,
            {'mean': mean(vol['volume'])}
        ).toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result8, expected8)
Example #5
0
    def test_summarizeWindows_udf(self):
        """Check a mean UDF in ``summarizeWindows`` with and without a key."""
        # ``OrderedDict`` and ``LongType`` were imported here but never used,
        # so those imports have been removed.
        from ts.flint import udf
        from ts.flint import windows
        from pyspark.sql.types import DoubleType

        vol = self.vol()
        w = windows.past_absolute_time('99s')

        @udf(DoubleType())
        def mean(v):
            return v.mean()

        # With key='id' the expected mean covers only rows of the same id.
        mean_by_id = {'mean': mean(vol['volume'])}
        result7 = vol.summarizeWindows(w, mean_by_id, key='id').toPandas()
        expected7 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result7, expected7)

        # Without a key the expected mean spans both ids.
        mean_overall = {'mean': mean(vol['volume'])}
        result8 = vol.summarizeWindows(w, mean_overall).toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result8, expected8)
Example #6
0
    def test_summarizeWindows(self):
        """Sum ``volume`` over a past-99s window, unkeyed and keyed by id."""
        from ts.flint import windows
        from ts.flint import summarizers

        vol = self.vol()
        w = windows.past_absolute_time('99s')

        # Unkeyed: the expected sums combine the volumes of both ids.
        unkeyed_sum = vol.summarizeWindows(w, summarizers.sum("volume"))
        actual_unkeyed = unkeyed_sum.toPandas()
        expected_unkeyed = make_pdf([
            (1000, 7, 100, 300.0),
            (1000, 3, 200, 300.0),
            (1050, 3, 300, 1000.0),
            (1050, 7, 400, 1000.0),
            (1100, 3, 500, 1800.0),
            (1100, 7, 600, 1800.0),
            (1150, 3, 700, 2600.0),
            (1150, 7, 800, 2600.0),
            (1200, 3, 900, 3400.0),
            (1200, 7, 1000, 3400.0),
            (1250, 3, 1100, 4200.0),
            (1250, 7, 1200, 4200.0),
        ], ["time", "id", "volume", "volume_sum"])
        assert_same(actual_unkeyed, expected_unkeyed)

        # Keyed by "id": each expected sum only covers rows sharing the id.
        keyed_sum = vol.summarizeWindows(
            w,
            summarizers.sum("volume"),
            key="id",
        )
        actual_keyed = keyed_sum.toPandas()
        expected_keyed = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 500.0),
            (1050, 7, 400, 500.0),
            (1100, 3, 500, 800.0),
            (1100, 7, 600, 1000.0),
            (1150, 3, 700, 1200.0),
            (1150, 7, 800, 1400.0),
            (1200, 3, 900, 1600.0),
            (1200, 7, 1000, 1800.0),
            (1250, 3, 1100, 2000.0),
            (1250, 7, 1200, 2200.0),
        ], ["time", "id", "volume", "volume_sum"])
        assert_same(actual_keyed, expected_keyed)
Example #7
0
 def test_addWindows(self):
     """Check that ``addWindows`` appends each row's past-50ns window."""
     from ts.flint import windows

     vol = self.vol()
     # The collected rows are referenced by position in the expected windows.
     rows = vol.collect()
     expected_pdf = test_utils.make_pdf([
         (1000, 7, 100, [rows[0], rows[1]]),
         (1000, 3, 200, [rows[0], rows[1]]),
         (1050, 3, 300, [rows[0], rows[1], rows[2], rows[3]]),
         (1050, 7, 400, [rows[0], rows[1], rows[2], rows[3]]),
         (1100, 3, 500, [rows[2], rows[3], rows[4], rows[5]]),
         (1100, 7, 600, [rows[2], rows[3], rows[4], rows[5]]),
         (1150, 3, 700, [rows[4], rows[5], rows[6], rows[7]]),
         (1150, 7, 800, [rows[4], rows[5], rows[6], rows[7]]),
         (1200, 3, 900, [rows[6], rows[7], rows[8], rows[9]]),
         (1200, 7, 1000, [rows[6], rows[7], rows[8], rows[9]]),
         (1250, 3, 1100, [rows[8], rows[9], rows[10], rows[11]]),
         (1250, 7, 1200, [rows[8], rows[9], rows[10], rows[11]]),
     ], ["time", "id", "volume", "window_past_50ns"])

     windowed = vol.addWindows(windows.past_absolute_time("50ns"))
     actual_pdf = windowed.toPandas()
     test_utils.assert_same(actual_pdf, expected_pdf)
Example #8
0
    def test_summarizeWindows_numpy_udf(self):
        """Exercise ``summarizeWindows`` with ``arg_type='numpy'`` UDFs.

        Covers single-column and multi-column UDF inputs, keyed and unkeyed
        windows, windows taken from another frame (``other=vol``) and a UDF
        returning two values.  Each UDF also asserts the Python types of the
        arguments it receives.
        """
        from ts.flint import windows
        from ts.flint.functions import udf
        from pyspark.sql.types import DoubleType, LongType

        vol = self.vol()
        df = self.flintContext.read.pandas(
            make_pdf([
                (1000, 3, 10.0),
                (1000, 7, 20.0),
                (1050, 3, 30.0),
                (1050, 7, 40.0),
                (1100, 3, 50.0),
                (1150, 3, 60.0),
                (1150, 7, 70.0),
                (1200, 3, 80.0),
                (1200, 7, 90.0),
                (1250, 7, 100.0),
            ], ['time', 'id', 'v']))

        @udf(DoubleType(), arg_type='numpy')
        def mean_np(v):
            assert isinstance(v, np.ndarray)
            return v.mean()

        @udf((DoubleType(), LongType()), arg_type='numpy')
        def mean_and_sum_np(v):
            assert isinstance(v, np.ndarray)
            return v.mean(), v.sum()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_df(window):
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return window[-1].mean()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_2(v, window):
            assert isinstance(v, np.float64)
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return v + window[-1].mean()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_df_2(left, window):
            assert isinstance(left, list)
            assert isinstance(left[0], np.float64)
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return window[-1].mean()

        w = windows.past_absolute_time('99s')

        # Unkeyed mean over a single column.
        result1 = vol.summarizeWindows(w, {
            'mean': mean_np(vol['volume'])
        }).toPandas()
        expected1 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result1, expected1)

        # Same UDF, keyed by id.
        result2 = vol.summarizeWindows(
            w, {
                'mean': mean_np(vol['volume'])
            },
            key='id').toPandas()
        expected2 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result2, expected2)

        # Multi-column selections reach the UDF as a list of arrays; the
        # UDF averages the last one (``volume``).
        result3 = vol.summarizeWindows(
            w,
            {
                'mean': mean_np_df(vol[['volume']])
            },
        ).toPandas()
        expected3 = expected1
        assert_same(result3, expected3)

        result4 = vol.summarizeWindows(
            w,
            {
                'mean': mean_np_df(vol[['time', 'volume']])
            },
        ).toPandas()
        expected4 = expected1
        assert_same(result4, expected4)

        # Left-row scalar plus the windowed mean taken from ``vol``.
        result5 = df.summarizeWindows(
            w, {
                'mean': mean_np_2(df['v'], vol[['time', 'volume']])
            },
            other=vol,
            key='id').toPandas()
        expected5 = make_pdf([
            (1000, 3, 10.0, 210.0),
            (1000, 7, 20.0, 120.0),
            (1050, 3, 30.0, 280.0),
            (1050, 7, 40.0, 290.0),
            (1100, 3, 50.0, 450.0),
            (1150, 3, 60.0, 660.0),
            (1150, 7, 70.0, 770.0),
            (1200, 3, 80.0, 880.0),
            (1200, 7, 90.0, 990.0),
            (1250, 7, 100.0, 1200.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result5, expected5)

        result6 = df.summarizeWindows(
            w, {
                'mean': mean_np_df_2(df[['v']], vol[['time', 'volume']])
            },
            other=vol,
            key='id').toPandas()
        # This used to be ``expected6 = result6``, which compared the result
        # with itself and so could never fail.  ``mean_np_df_2`` returns the
        # mean of the windowed ``volume`` values, so the expected output is
        # the per-id window mean pinned explicitly here.
        expected6 = make_pdf([
            (1000, 3, 10.0, 200.0),
            (1000, 7, 20.0, 100.0),
            (1050, 3, 30.0, 250.0),
            (1050, 7, 40.0, 250.0),
            (1100, 3, 50.0, 400.0),
            (1150, 3, 60.0, 600.0),
            (1150, 7, 70.0, 700.0),
            (1200, 3, 80.0, 800.0),
            (1200, 7, 90.0, 900.0),
            (1250, 7, 100.0, 1100.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result6, expected6)

        # Window-only UDF over ``other=vol``: same per-id window means.
        result7 = df.summarizeWindows(
            w, {
                'mean': mean_np_df(vol[['time', 'volume']])
            },
            other=vol,
            key='id').toPandas()
        expected7 = expected6
        assert_same(result7, expected7)

        # A UDF returning a tuple fills the two columns named by the key.
        result8 = vol.summarizeWindows(
            w, {
                ('mean', 'sum'): mean_and_sum_np(vol['volume'])
            }, key='id').toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 100.0, 100),
            (1000, 3, 200, 200.0, 200),
            (1050, 3, 300, 250.0, 500),
            (1050, 7, 400, 250.0, 500),
            (1100, 3, 500, 400.0, 800),
            (1100, 7, 600, 500.0, 1000),
            (1150, 3, 700, 600.0, 1200),
            (1150, 7, 800, 700.0, 1400),
            (1200, 3, 900, 800.0, 1600),
            (1200, 7, 1000, 900.0, 1800),
            (1250, 3, 1100, 1000.0, 2000),
            (1250, 7, 1200, 1100.0, 2200),
        ], ['time', 'id', 'volume', 'mean', 'sum'])
        assert_same(result8, expected8)
Example #9
0
    def test_summarizeWindows_udf(self):
        """Exercise ``summarizeWindows`` with UDF-defined output columns.

        Cases 1-6 summarize windows of ``vol`` (``other=vol``) for the rows
        of a smaller frame ``df``, keyed by id; cases 7-8 summarize ``vol``
        against itself, keyed and unkeyed.
        """
        from ts.flint import udf
        from ts.flint import windows
        from collections import OrderedDict
        from pyspark.sql.types import DoubleType, LongType

        vol = self.vol()
        w = windows.past_absolute_time('99s')

        left_pdf = make_pdf([
            (1000, 3, 10.0),
            (1000, 7, 20.0),
            (1050, 3, 30.0),
            (1050, 7, 40.0),
            (1100, 3, 50.0),
            (1150, 3, 60.0),
            (1150, 7, 70.0),
            (1200, 3, 80.0),
            (1200, 7, 90.0),
            (1250, 7, 100.0),
        ], ['time', 'id', 'v'])
        df = self.flintContext.read.pandas(left_pdf)

        # Case 1: UDF receives a left-row value and the windowed column.
        mean_ignoring_time = udf(lambda time, window: window.mean(),
                                 DoubleType())
        columns1 = OrderedDict([
            ('mean', mean_ignoring_time(df['time'], vol['volume'])),
        ])
        result1 = df.summarizeWindows(
            w, columns1, key="id", other=vol).toPandas()
        expected1 = make_pdf([
            (1000, 3, 10.0, 200.0),
            (1000, 7, 20.0, 100.0),
            (1050, 3, 30.0, 250.0),
            (1050, 7, 40.0, 250.0),
            (1100, 3, 50.0, 400.0),
            (1150, 3, 60.0, 600.0),
            (1150, 7, 70.0, 700.0),
            (1200, 3, 80.0, 800.0),
            (1200, 7, 90.0, 900.0),
            (1250, 7, 100.0, 1100.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result1, expected1)

        # Case 2: UDF receives only the windowed column; same expectation.
        window_mean = udf(lambda window: window.mean(), DoubleType())
        columns2 = OrderedDict([
            ('mean', window_mean(vol['volume'])),
        ])
        result2 = df.summarizeWindows(
            w, columns2, key='id', other=vol).toPandas()
        expected2 = expected1
        assert_same(result2, expected2)

        # Case 3: two UDF columns produced in a single call.
        mean_column = udf(lambda window: window.mean(),
                          DoubleType())(vol['volume'])
        count_column = udf(lambda time, window: len(window),
                           LongType())(df['time'], vol['volume'])
        columns3 = OrderedDict([
            ('mean', mean_column),
            ('count', count_column),
        ])
        result3 = df.summarizeWindows(
            w, columns3, key='id', other=vol).toPandas()
        expected3 = make_pdf([
            (1000, 3, 10.0, 200.0, 1),
            (1000, 7, 20.0, 100.0, 1),
            (1050, 3, 30.0, 250.0, 2),
            (1050, 7, 40.0, 250.0, 2),
            (1100, 3, 50.0, 400.0, 2),
            (1150, 3, 60.0, 600.0, 2),
            (1150, 7, 70.0, 700.0, 2),
            (1200, 3, 80.0, 800.0, 2),
            (1200, 7, 90.0, 900.0, 2),
            (1250, 7, 100.0, 1100.0, 2),
        ], ['time', 'id', 'v', 'mean', 'count'])
        assert_same(result3, expected3)

        # Case 4: the window argument is a two-column selection; the UDF
        # adds the mean time offset (in seconds) to the mean volume.
        @udf('double')
        def window_udf(time, window):
            return (time - window.time).mean().seconds + window.volume.mean()

        columns4 = OrderedDict([
            ('mean', window_udf(df['time'], vol[['time', 'volume']])),
        ])
        result4 = df.summarizeWindows(
            w, columns4, key='id', other=vol).toPandas()

        expected4 = make_pdf([
            (1000, 3, 10.0, 200.0),
            (1000, 7, 20.0, 100.0),
            (1050, 3, 30.0, 275.0),
            (1050, 7, 40.0, 275.0),
            (1100, 3, 50.0, 425.0),
            (1150, 3, 60.0, 625.0),
            (1150, 7, 70.0, 725.0),
            (1200, 3, 80.0, 825.0),
            (1200, 7, 90.0, 925.0),
            (1250, 7, 100.0, 1125.0),
        ], ['time', 'id', 'v', 'mean'])
        assert_same(result4, expected4)

        # Case 5: as case 4, but the left argument is a multi-column row
        # and the time is read from position 0.
        @udf(DoubleType())
        def foo5(row, window):
            return (row[0] - window.time).mean().seconds + window.volume.mean()

        columns5 = OrderedDict([
            ('mean', foo5(df[['time', 'v']], vol[['time', 'volume']])),
        ])
        result5 = df.summarizeWindows(
            w, columns5, key='id', other=vol).toPandas()
        expected5 = expected4
        assert_same(result5, expected5)

        # Case 6: one UDF returning a tuple fills both named columns.
        @udf((DoubleType(), LongType()))
        def mean_and_count(v):
            return v.mean(), len(v)

        columns6 = OrderedDict([
            [('mean', 'count'), mean_and_count(vol['volume'])],
        ])
        result6 = df.summarizeWindows(
            w, columns6, key='id', other=vol).toPandas()
        expected6 = expected3
        assert_same(result6, expected6)

        @udf(DoubleType())
        def mean(v):
            return v.mean()

        # Case 7: ``vol`` summarized against itself, keyed by id.
        columns7 = {'mean': mean(vol['volume'])}
        result7 = vol.summarizeWindows(w, columns7, key='id').toPandas()
        expected7 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result7, expected7)

        # Case 8: same UDF without a key.
        columns8 = {'mean': mean(vol['volume'])}
        result8 = vol.summarizeWindows(w, columns8).toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result8, expected8)
Example #10
0
    def test_summarizeWindows(self):
        """Sum ``volume`` over a past-99s window in three configurations.

        Unkeyed, keyed by id, and keyed by id for the rows of a separate
        (time, id) frame with the windows taken from ``vol``.
        """
        from ts.flint import windows
        from ts.flint import summarizers

        vol = self.vol()
        w = windows.past_absolute_time('99s')

        # Unkeyed: the expected sums combine the volumes of both ids.
        unkeyed_sum = vol.summarizeWindows(w, summarizers.sum("volume"))
        actual_unkeyed = unkeyed_sum.toPandas()
        expected_unkeyed = make_pdf([
            (1000, 7, 100, 300.0),
            (1000, 3, 200, 300.0),
            (1050, 3, 300, 1000.0),
            (1050, 7, 400, 1000.0),
            (1100, 3, 500, 1800.0),
            (1100, 7, 600, 1800.0),
            (1150, 3, 700, 2600.0),
            (1150, 7, 800, 2600.0),
            (1200, 3, 900, 3400.0),
            (1200, 7, 1000, 3400.0),
            (1250, 3, 1100, 4200.0),
            (1250, 7, 1200, 4200.0),
        ], ["time", "id", "volume", "volume_sum"])
        assert_same(actual_unkeyed, expected_unkeyed)

        # Keyed by "id": each expected sum only covers rows sharing the id.
        keyed_sum = vol.summarizeWindows(
            w,
            summarizers.sum("volume"),
            key="id",
        )
        actual_keyed = keyed_sum.toPandas()
        expected_keyed = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 500.0),
            (1050, 7, 400, 500.0),
            (1100, 3, 500, 800.0),
            (1100, 7, 600, 1000.0),
            (1150, 3, 700, 1200.0),
            (1150, 7, 800, 1400.0),
            (1200, 3, 900, 1600.0),
            (1200, 7, 1000, 1800.0),
            (1250, 3, 1100, 2000.0),
            (1250, 7, 1200, 2200.0),
        ], ["time", "id", "volume", "volume_sum"])
        assert_same(actual_keyed, expected_keyed)

        # A frame holding only (time, id) pairs; the volumes being summed
        # come from ``vol`` through ``other``.
        interval_pdf = make_pdf([
            (1000, 3),
            (1000, 7),
            (1050, 3),
            (1050, 7),
            (1100, 3),
            (1150, 3),
            (1150, 7),
            (1200, 3),
            (1200, 7),
            (1250, 7),
        ], ["time", "id"])
        interval_with_id = self.flintContext.read.pandas(interval_pdf)

        other_sum = interval_with_id.summarizeWindows(
            w,
            summarizers.sum("volume"),
            key="id",
            other=vol,
        )
        actual_other = other_sum.toPandas()
        expected_other = make_pdf([
            (1000, 3, 200.0),
            (1000, 7, 100.0),
            (1050, 3, 500.0),
            (1050, 7, 500.0),
            (1100, 3, 800.0),
            (1150, 3, 1200.0),
            (1150, 7, 1400.0),
            (1200, 3, 1600.0),
            (1200, 7, 1800.0),
            (1250, 7, 2200.0),
        ], ["time", "id", "volume_sum"])
        assert_same(actual_other, expected_other)
Example #11
0
# Build two lagged copies of df_control: shiftTime is called with a future
# 1-day / 7-day window and the result's columns are renamed to
# ('time', 'previous_day_val') and ('time', 'previous_wk_val').
# NOTE(review): presumably this moves each timestamp forward so the left
# joins below pair a row with the value from one day / one week earlier --
# confirm against Flint's shiftTime semantics.
df_control_previous_day_val = df_control.shiftTime(
    windows.future_absolute_time('1day')).toDF('time', 'previous_day_val')
df_control_previous_wk_val = df_control.shiftTime(
    windows.future_absolute_time('7day')).toDF('time', 'previous_wk_val')
# Left-join both lagged frames onto df_control and print the result.
df_control_joined = df_control.leftJoin(df_control_previous_day_val).leftJoin(
    df_control_previous_wk_val)
df_control_joined.show()

# COMMAND ----------

from ts.flint import summarizers

# Keep the rows matching "time > '2018-06-15'" and run the ewma summarizer
# (alpha=0.5) on 'previous_wk_val' over a past 42-day window.
df_control_decayed_return = df_control_joined.where(
    "time > '2018-06-15'").summarizeWindows(
        window=windows.past_absolute_time('42day'),
        summarizer=summarizers.ewma('previous_wk_val', alpha=0.5))

display(df_control_decayed_return)

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Combine the two input columns into a single "features" vector column.
# NOTE(review): "previous_wk_val_ewma" is presumably the column added by the
# ewma summarizer above -- confirm the generated column name.
assembler = VectorAssembler(
    inputCols=["previous_wk_val", "previous_wk_val_ewma"],
    outputCol="features")

# Keep ACTL_VAL and the assembled vector, renamed to ('label', 'features').
output = assembler.transform(df_control_decayed_return).select(
    'ACTL_VAL', 'features').toDF('label', 'features')
Example #12
0
    def test_summarizeWindows_numpy_udf(self):
        """Exercise ``summarizeWindows`` with ``arg_type='numpy'`` UDFs.

        Covers a single-column UDF (unkeyed and keyed by id), a UDF fed a
        multi-column selection, and a UDF returning two values.  Each UDF
        also asserts the Python types of the arguments it receives.
        """
        from ts.flint import windows
        from ts.flint.functions import udf
        from pyspark.sql.types import DoubleType, LongType

        vol = self.vol()
        # NOTE: ``df`` and the two-argument UDFs defined below are not
        # referenced by any assertion in this test.
        left_pdf = make_pdf([
            (1000, 3, 10.0),
            (1000, 7, 20.0),
            (1050, 3, 30.0),
            (1050, 7, 40.0),
            (1100, 3, 50.0),
            (1150, 3, 60.0),
            (1150, 7, 70.0),
            (1200, 3, 80.0),
            (1200, 7, 90.0),
            (1250, 7, 100.0),
        ], ['time', 'id', 'v'])
        df = self.flintContext.read.pandas(left_pdf)

        @udf(DoubleType(), arg_type='numpy')
        def mean_np(v):
            assert isinstance(v, np.ndarray)
            return v.mean()

        @udf((DoubleType(), LongType()), arg_type='numpy')
        def mean_and_sum_np(v):
            assert isinstance(v, np.ndarray)
            return v.mean(), v.sum()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_df(window):
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return window[-1].mean()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_2(v, window):
            assert isinstance(v, np.float64)
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return v + window[-1].mean()

        @udf(DoubleType(), arg_type='numpy')
        def mean_np_df_2(left, window):
            assert isinstance(left, list)
            assert isinstance(left[0], np.float64)
            assert isinstance(window, list)
            assert isinstance(window[-1], np.ndarray)
            return window[-1].mean()

        w = windows.past_absolute_time('99s')

        # Unkeyed mean over a single column.
        columns1 = {'mean': mean_np(vol['volume'])}
        result1 = vol.summarizeWindows(w, columns1).toPandas()
        expected1 = make_pdf([
            (1000, 7, 100, 150.0),
            (1000, 3, 200, 150.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 450.0),
            (1100, 7, 600, 450.0),
            (1150, 3, 700, 650.0),
            (1150, 7, 800, 650.0),
            (1200, 3, 900, 850.0),
            (1200, 7, 1000, 850.0),
            (1250, 3, 1100, 1050.0),
            (1250, 7, 1200, 1050.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result1, expected1)

        # Same UDF, keyed by id.
        columns2 = {'mean': mean_np(vol['volume'])}
        result2 = vol.summarizeWindows(w, columns2, key='id').toPandas()
        expected2 = make_pdf([
            (1000, 7, 100, 100.0),
            (1000, 3, 200, 200.0),
            (1050, 3, 300, 250.0),
            (1050, 7, 400, 250.0),
            (1100, 3, 500, 400.0),
            (1100, 7, 600, 500.0),
            (1150, 3, 700, 600.0),
            (1150, 7, 800, 700.0),
            (1200, 3, 900, 800.0),
            (1200, 7, 1000, 900.0),
            (1250, 3, 1100, 1000.0),
            (1250, 7, 1200, 1100.0),
        ], ['time', 'id', 'volume', 'mean'])
        assert_same(result2, expected2)

        # Multi-column selections reach the UDF as a list of arrays; the
        # UDF averages the last one (``volume``).
        columns3 = {'mean': mean_np_df(vol[['volume']])}
        result3 = vol.summarizeWindows(w, columns3).toPandas()
        expected3 = expected1
        assert_same(result3, expected3)

        columns4 = {'mean': mean_np_df(vol[['time', 'volume']])}
        result4 = vol.summarizeWindows(w, columns4).toPandas()
        expected4 = expected1
        assert_same(result4, expected4)

        # A UDF returning a tuple fills the two columns named by the key.
        columns8 = {('mean', 'sum'): mean_and_sum_np(vol['volume'])}
        result8 = vol.summarizeWindows(w, columns8, key='id').toPandas()
        expected8 = make_pdf([
            (1000, 7, 100, 100.0, 100),
            (1000, 3, 200, 200.0, 200),
            (1050, 3, 300, 250.0, 500),
            (1050, 7, 400, 250.0, 500),
            (1100, 3, 500, 400.0, 800),
            (1100, 7, 600, 500.0, 1000),
            (1150, 3, 700, 600.0, 1200),
            (1150, 7, 800, 700.0, 1400),
            (1200, 3, 900, 800.0, 1600),
            (1200, 7, 1000, 900.0, 1800),
            (1250, 3, 1100, 1000.0, 2000),
            (1250, 7, 1200, 1100.0, 2200),
        ], ['time', 'id', 'volume', 'mean', 'sum'])
        assert_same(result8, expected8)
Example #13
0
def test_summarizeWindows(flintContext, tests_utils, windows, summarizers,
                          vol):
    """Sum ``volume`` over a past-99ns window in three configurations.

    Unkeyed, keyed by id, and keyed by id for the rows of a separate
    (time, id) frame with the windows taken from ``vol``.
    """
    # Unkeyed: the expected sums combine the volumes of both ids.
    unkeyed_sum = vol.summarizeWindows(
        windows.past_absolute_time('99ns'),
        summarizers.sum("volume"),
    )
    actual_unkeyed = unkeyed_sum.toPandas()
    expected_unkeyed = make_pdf([
        (1000, 7, 100, 300.0),
        (1000, 3, 200, 300.0),
        (1050, 3, 300, 1000.0),
        (1050, 7, 400, 1000.0),
        (1100, 3, 500, 1800.0),
        (1100, 7, 600, 1800.0),
        (1150, 3, 700, 2600.0),
        (1150, 7, 800, 2600.0),
        (1200, 3, 900, 3400.0),
        (1200, 7, 1000, 3400.0),
        (1250, 3, 1100, 4200.0),
        (1250, 7, 1200, 4200.0),
    ], ["time", "id", "volume", "volume_sum"])
    tests_utils.assert_same(actual_unkeyed, expected_unkeyed)

    # Keyed by "id": each expected sum only covers rows sharing the id.
    keyed_sum = vol.summarizeWindows(
        windows.past_absolute_time('99ns'),
        summarizers.sum("volume"),
        key="id",
    )
    actual_keyed = keyed_sum.toPandas()
    expected_keyed = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 500.0),
        (1050, 7, 400, 500.0),
        (1100, 3, 500, 800.0),
        (1100, 7, 600, 1000.0),
        (1150, 3, 700, 1200.0),
        (1150, 7, 800, 1400.0),
        (1200, 3, 900, 1600.0),
        (1200, 7, 1000, 1800.0),
        (1250, 3, 1100, 2000.0),
        (1250, 7, 1200, 2200.0),
    ], ["time", "id", "volume", "volume_sum"])
    tests_utils.assert_same(actual_keyed, expected_keyed)

    # A frame holding only (time, id) pairs; the volumes being summed come
    # from ``vol`` through ``other``.
    interval_pdf = make_pdf([
        (1000, 3),
        (1000, 7),
        (1050, 3),
        (1050, 7),
        (1100, 3),
        (1150, 3),
        (1150, 7),
        (1200, 3),
        (1200, 7),
        (1250, 7),
    ], ["time", "id"])
    interval_with_id = flintContext.read.pandas(interval_pdf)

    other_sum = interval_with_id.summarizeWindows(
        windows.past_absolute_time('99ns'),
        summarizers.sum("volume"),
        key="id",
        other=vol,
    )
    actual_other = other_sum.toPandas()
    expected_other = make_pdf([
        (1000, 3, 200.0),
        (1000, 7, 100.0),
        (1050, 3, 500.0),
        (1050, 7, 500.0),
        (1100, 3, 800.0),
        (1150, 3, 1200.0),
        (1150, 7, 1400.0),
        (1200, 3, 1600.0),
        (1200, 7, 1800.0),
        (1250, 7, 2200.0),
    ], ["time", "id", "volume_sum"])
    tests_utils.assert_same(actual_other, expected_other)
Example #14
0
# Build a lagged copy of sp500_return: shiftTime is called with a future
# 1-day window and the columns are renamed to ('time', 'previous_day_return').
# NOTE(review): presumably this moves each timestamp forward one day so the
# left join pairs a row with the previous day's return -- confirm against
# Flint's shiftTime semantics.
sp500_previous_day_return = sp500_return.shiftTime(windows.future_absolute_time('1day')).toDF('time', 'previous_day_return')
sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return)
sp500_joined_return.show()

# COMMAND ----------

# Redo the left join with tolerance='3days' and drop rows containing nulls;
# this overwrites the sp500_joined_return built in the previous cell.
# NOTE(review): the tolerance presumably lets a row match a lagged row up to
# three days older -- confirm against Flint's leftJoin documentation.
sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return, tolerance='3days').dropna()
sp500_joined_return.show()

# COMMAND ----------

from ts.flint import summarizers

# Run the ewma summarizer (alpha=0.5) on 'previous_day_return' over a past
# 7-day window.
sp500_decayed_return = sp500_joined_return.summarizeWindows(
    window = windows.past_absolute_time('7day'),
    summarizer = summarizers.ewma('previous_day_return', alpha=0.5)
)

sp500_decayed_return.show()

# COMMAND ----------

from ts.flint import udf
import numpy as np

@udf('double', arg_type='numpy')
def decayed(columns):
    """Return the sum of ``columns[0]`` weighted by powers of 0.5.

    The last element gets weight 1 (``0.5 ** 0``) and each earlier element
    half the weight of the one after it.
    """
    values = columns[0]
    # Exponents run n-1, ..., 1, 0 so the final element is left unscaled.
    exponents = np.arange(len(values))[::-1]
    weights = np.power(0.5, exponents)
    return (values * weights).sum()