Example #1
0
    def test_summarizeIntervals(self):
        """Exercise ``summarizeIntervals`` with a ``sum("volume")`` summarizer.

        The clock frame carries the ticks 1000, 1100, 1200 and 1300.  The
        summary is verified twice: once without a key and once with
        ``key="id"``.
        """
        from ts.flint import summarizers

        vol = self.vol()
        ticks = [(1000, ), (1100, ), (1200, ), (1300, )]
        clock = self.flintContext.read.pandas(
            test_utils.make_pdf(ticks, ["time"]))

        # Without a key: one volume_sum per expected time.
        actual_all = vol.summarizeIntervals(
            clock, summarizers.sum("volume")).toPandas()
        expected_all = test_utils.make_pdf(
            [
                (1000, 1000.0),
                (1100, 2600.0),
                (1200, 4200.0),
            ],
            ["time", "volume_sum"],
        )
        test_utils.assert_same(actual_all, expected_all)

        # With key="id": one volume_sum per (time, id) pair.
        actual_by_id = vol.summarizeIntervals(
            clock, summarizers.sum("volume"), key="id").toPandas()
        expected_by_id = test_utils.make_pdf(
            [
                (1000, 7, 500.0),
                (1000, 3, 500.0),
                (1100, 3, 1200.0),
                (1100, 7, 1400.0),
                (1200, 3, 2000.0),
                (1200, 7, 2200.0),
            ],
            ["time", "id", "volume_sum"],
        )
        test_utils.assert_same(actual_by_id, expected_by_id)
Example #2
0
    def test_summarizeCycles(self):
        """Exercise ``summarizeCycles`` with a ``sum("volume")`` summarizer.

        The unkeyed call runs on ``self.vol()``; the ``key="id"`` call runs
        on ``self.vol2()``.
        """
        from ts.flint import summarizers

        vol = self.vol()
        vol2 = self.vol2()

        # Unkeyed: expected rows are (time, volume_sum).
        expected_all = test_utils.make_pdf(
            [
                (1000, 300.0),
                (1050, 700.0),
                (1100, 1100.0),
                (1150, 1500.0),
                (1200, 1900.0),
                (1250, 2300.0),
            ],
            ["time", "volume_sum"],
        )
        actual_all = vol.summarizeCycles(summarizers.sum("volume")).toPandas()
        test_utils.assert_same(actual_all, expected_all)

        # Keyed by id: expected rows are (time, id, volume_sum).
        expected_by_id = test_utils.make_pdf(
            [
                (1000, 7, 200.0),
                (1000, 3, 400.0),
                (1050, 3, 600.0),
                (1050, 7, 800.0),
                (1100, 3, 1000.0),
                (1100, 7, 1200.0),
                (1150, 3, 1400.0),
                (1150, 7, 1600.0),
                (1200, 3, 1800.0),
                (1200, 7, 2000.0),
                (1250, 3, 2200.0),
                (1250, 7, 2400.0),
            ],
            ["time", "id", "volume_sum"],
        )
        actual_by_id = vol2.summarizeCycles(
            summarizers.sum("volume"), key="id").toPandas()
        test_utils.assert_same(actual_by_id, expected_by_id)
Example #3
0
    def test_groupByInterval(self):
        """Exercise ``groupByInterval`` keyed on ``id``.

        The key is passed both as ``["id"]`` and as ``"id"``; the two results
        must match each other.  The list-key result is then compared against
        the expected frame separately for id 3 and id 7.
        """
        vol = self.vol()
        intervals = self.intervals()
        rows = vol.collect()

        expected_pdf = test_utils.make_pdf(
            [
                (1000, 7, [rows[0], rows[3]]),
                (1000, 3, [rows[1], rows[2]]),
                (1100, 7, [rows[5], rows[7]]),
                (1100, 3, [rows[4], rows[6]]),
                (1200, 7, [rows[9], rows[11]]),
                (1200, 3, [rows[8], rows[10]]),
            ],
            ["time", "id", "rows"],
        )

        by_list_key = vol.groupByInterval(intervals, key=["id"]).toPandas()
        by_str_key = vol.groupByInterval(intervals, key="id").toPandas()
        test_utils.assert_same(by_list_key, by_str_key)

        # XXX: should just do
        # test_utils.assert_same(by_list_key, expected_pdf)
        # once https://gitlab.twosigma.com/analytics/huohua/issues/26
        # gets resolved.
        for key in (3, 7):
            actual_slice = by_list_key[by_list_key['id'] == key]
            expected_slice = expected_pdf[expected_pdf['id'] == key]
            test_utils.assert_same(
                actual_slice.reset_index(drop=True),
                expected_slice.reset_index(drop=True),
            )
Example #4
0
    def test_futureLeftJoin(self):
        """Exercise ``futureLeftJoin`` of price against volume, keyed on ``id``.

        The right-hand side is ``vol`` with an extra ``time2`` column (its
        ``time`` cast to ``LongType``).  The join uses a tolerance of 100ns
        with ``strict_lookahead=True``.  The key is passed both as ``["id"]``
        and as ``"id"``; both results must agree, and the list-key result
        must match the expected frame.
        """
        import pyspark.sql.types as pyspark_types

        price = self.price()
        vol = self.vol()

        expected_pdf = test_utils.make_pdf(
            [
                (1000, 7, 0.5, 400, 1050),
                (1000, 3, 1.0, 300, 1050),
                (1050, 3, 1.5, 500, 1100),
                (1050, 7, 2.0, 600, 1100),
                (1100, 3, 2.5, 700, 1150),
                (1100, 7, 3.0, 800, 1150),
                (1150, 3, 3.5, 900, 1200),
                (1150, 7, 4.0, 1000, 1200),
                (1200, 3, 4.5, 1100, 1250),
                (1200, 7, 5.0, 1200, 1250),
                (1250, 3, 5.5, None, None),
                (1250, 7, 6.0, None, None),
            ],
            ["time", "id", "price", "volume", "time2"],
        )

        def join_with(key):
            # Build the right-hand frame afresh on every call.
            right = vol.withColumn(
                "time2", vol.time.cast(pyspark_types.LongType()))
            joined = price.futureLeftJoin(
                right,
                tolerance=pd.Timedelta("100ns"),
                key=key,
                strict_lookahead=True,
            )
            return joined.toPandas()

        by_list_key = join_with(["id"])
        by_str_key = join_with("id")
        test_utils.assert_same(by_list_key, by_str_key)
        test_utils.assert_same(by_list_key, expected_pdf)
Example #5
0
 def test_summary_min(self):
     """Check that summarizing ``forecast`` with ``min`` yields -9.6 at time 0."""
     from ts.flint import summarizers

     forecast = self.forecast()
     expected = test_utils.make_pdf([(0, -9.6)], ["time", "forecast_min"])

     summary = forecast.summarize(summarizers.min("forecast"))
     pdt.assert_frame_equal(summary.toPandas(), expected)
Example #6
0
 def test_summary_quantile(self):
     """Check the 0.2 and 0.5 quantile summaries of the ``forecast`` column."""
     from ts.flint import summarizers

     forecast = self.forecast()
     columns = ["time", "forecast_0.2quantile", "forecast_0.5quantile"]
     expected = test_utils.make_pdf([(0, -2.22, 1.75)], columns)

     summary = forecast.summarize(
         summarizers.quantile(self.sc, "forecast", (0.2, 0.5)))
     pdt.assert_frame_equal(summary.toPandas(), expected)
Example #7
0
    def test_summary_sum(self):
        """Exercise ``summarize`` with ``sum("volume")``, unkeyed and keyed on ``id``.

        For the keyed case the key is passed both as ``["id"]`` and as
        ``"id"``; the two results must agree.  The list-key result is then
        compared against the expected frame separately for id 3 and id 7.
        """
        from ts.flint import summarizers

        vol = self.vol()

        # Unkeyed total.
        expected_total = test_utils.make_pdf(
            [(0, 7800.0)], ["time", "volume_sum"])
        actual_total = vol.summarize(summarizers.sum("volume")).toPandas()
        test_utils.assert_same(actual_total, expected_total)

        # Per-id totals.
        expected_by_id = test_utils.make_pdf(
            [
                (0, 7, 4100.0),
                (0, 3, 3700.0),
            ],
            ["time", "id", "volume_sum"],
        )
        by_list_key = vol.summarize(
            summarizers.sum("volume"), key=["id"]).toPandas()
        by_str_key = vol.summarize(
            summarizers.sum("volume"), key="id").toPandas()
        test_utils.assert_same(by_list_key, by_str_key)

        # XXX: should just do:
        # test_utils.assert_same(by_list_key, expected_by_id, "by id")
        # once https://gitlab.twosigma.com/analytics/huohua/issues/26
        # gets resolved.
        for key, label in ((3, "by id 3"), (7, "by id 7")):
            actual_slice = by_list_key[by_list_key['id'] == key]
            expected_slice = expected_by_id[expected_by_id['id'] == key]
            test_utils.assert_same(
                actual_slice.reset_index(drop=True),
                expected_slice.reset_index(drop=True),
                label)
Example #8
0
    def test_summary_zscore(self):
        """Exercise the ``zscore`` summarizer on ``price``.

        Two cases are checked in order: ``in_sample=True`` and
        ``in_sample=False``.  Each case has its own expected ``price_zScore``
        value and its own label, which is passed as the third argument to
        ``test_utils.assert_same``.
        """
        from ts.flint import summarizers

        price = self.price()

        # (in_sample flag, expected z-score, assertion label)
        cases = (
            (True, 1.5254255396193801, "in-sample"),
            # The label used to read "out-of-sample)" with a stray
            # closing parenthesis.
            (False, 1.8090680674665818, "out-of-sample"),
        )
        for in_sample, zscore, label in cases:
            expected_pdf = test_utils.make_pdf(
                [(0, zscore)], ["time", "price_zScore"])
            new_pdf = price.summarize(
                summarizers.zscore("price", in_sample=in_sample)).toPandas()
            test_utils.assert_same(new_pdf, expected_pdf, label)
Example #9
0
 def test_summary_variance(self):
     """Check the ``variance`` summary of ``price`` after left-joining ``forecast`` on ``id``."""
     from ts.flint import summarizers

     price = self.price()
     forecast = self.forecast()
     expected = test_utils.make_pdf([(0, 3.25)], ["time", "price_variance"])

     price_and_forecast = price.leftJoin(forecast, key="id")
     summary = price_and_forecast.summarize(summarizers.variance("price"))
     pdt.assert_frame_equal(summary.toPandas(), expected)
Example #10
0
    def test_summarizeWindows(self):
        """Exercise ``summarizeWindows`` with a 99ns past window and ``sum("volume")``.

        The summary is verified without a key and with ``key="id"``.
        """
        from ts.flint import windows
        from ts.flint import summarizers

        vol = self.vol()

        # Unkeyed: volume_sum is the same for both ids at a given time.
        actual_all = vol.summarizeWindows(
            windows.past_absolute_time('99ns'),
            summarizers.sum("volume"),
        ).toPandas()
        expected_all = test_utils.make_pdf(
            [
                (1000, 7, 100, 300.0),
                (1000, 3, 200, 300.0),
                (1050, 3, 300, 1000.0),
                (1050, 7, 400, 1000.0),
                (1100, 3, 500, 1800.0),
                (1100, 7, 600, 1800.0),
                (1150, 3, 700, 2600.0),
                (1150, 7, 800, 2600.0),
                (1200, 3, 900, 3400.0),
                (1200, 7, 1000, 3400.0),
                (1250, 3, 1100, 4200.0),
                (1250, 7, 1200, 4200.0),
            ],
            ["time", "id", "volume", "volume_sum"],
        )
        test_utils.assert_same(actual_all, expected_all)

        # Keyed by id.
        actual_by_id = vol.summarizeWindows(
            windows.past_absolute_time('99ns'),
            summarizers.sum("volume"),
            key="id",
        ).toPandas()
        expected_by_id = test_utils.make_pdf(
            [
                (1000, 7, 100, 100.0),
                (1000, 3, 200, 200.0),
                (1050, 3, 300, 500.0),
                (1050, 7, 400, 500.0),
                (1100, 3, 500, 800.0),
                (1100, 7, 600, 1000.0),
                (1150, 3, 700, 1200.0),
                (1150, 7, 800, 1400.0),
                (1200, 3, 900, 1600.0),
                (1200, 7, 1000, 1800.0),
                (1250, 3, 1100, 2000.0),
                (1250, 7, 1200, 2200.0),
            ],
            ["time", "id", "volume", "volume_sum"],
        )
        test_utils.assert_same(actual_by_id, expected_by_id)
Example #11
0
 def test_addSummaryColumns(self):
     """Exercise ``addSummaryColumns`` with ``sum("volume")``.

     The first check passes no key.  The second passes ``"id"`` as the
     second positional argument and labels its assertion "with key".
     """
     from ts.flint import summarizers

     vol = self.vol()

     # No key.
     expected_all = test_utils.make_pdf(
         [
             (1000, 7, 100, 100.0),
             (1000, 3, 200, 300.0),
             (1050, 3, 300, 600.0),
             (1050, 7, 400, 1000.0),
             (1100, 3, 500, 1500.0),
             (1100, 7, 600, 2100.0),
             (1150, 3, 700, 2800.0),
             (1150, 7, 800, 3600.0),
             (1200, 3, 900, 4500.0),
             (1200, 7, 1000, 5500.0),
             (1250, 3, 1100, 6600.0),
             (1250, 7, 1200, 7800.0),
         ],
         ["time", "id", "volume", "volume_sum"],
     )
     actual_all = vol.addSummaryColumns(summarizers.sum("volume")).toPandas()
     test_utils.assert_same(actual_all, expected_all)

     # "id" passed positionally as the key.
     expected_by_id = test_utils.make_pdf(
         [
             (1000, 7, 100, 100.0),
             (1000, 3, 200, 200.0),
             (1050, 3, 300, 500.0),
             (1050, 7, 400, 500.0),
             (1100, 3, 500, 1000.0),
             (1100, 7, 600, 1100.0),
             (1150, 3, 700, 1700.0),
             (1150, 7, 800, 1900.0),
             (1200, 3, 900, 2600.0),
             (1200, 7, 1000, 2900.0),
             (1250, 3, 1100, 3700.0),
             (1250, 7, 1200, 4100.0),
         ],
         ["time", "id", "volume", "volume_sum"],
     )
     actual_by_id = vol.addSummaryColumns(
         summarizers.sum("volume"), "id").toPandas()
     test_utils.assert_same(actual_by_id, expected_by_id, "with key")
Example #12
0
 def test_summary_compose(self):
     """Check that ``summarize`` accepts a list of summarizers (max, min, mean, stddev) on ``price``."""
     from ts.flint import summarizers

     price = self.price()
     columns = ["time", "price_max", "price_min", "price_mean", "price_stddev"]
     expected = test_utils.make_pdf([(0, 6.0, 0.5, 3.25, 1.802775638)], columns)

     composed = [
         summarizers.max("price"),
         summarizers.min("price"),
         summarizers.mean("price"),
         summarizers.stddev("price"),
     ]
     summary = price.summarize(composed)
     pdt.assert_frame_equal(summary.toPandas(), expected)
Example #13
0
 def test_summary_weighted_mean(self):
     """Check the ``weighted_mean`` summary of ``price`` weighted by ``volume``.

     ``price`` is left-joined with ``vol`` on ``id`` before summarizing.
     """
     from ts.flint import summarizers

     price = self.price()
     vol = self.vol()

     columns = [
         "time",
         "price_volume_weightedMean",
         "price_volume_weightedStandardDeviation",
         "price_volume_weightedTStat",
         "price_volume_observationCount",
     ]
     expected = test_utils.make_pdf(
         [(0, 4.166667, 1.547494, 8.237545, 12)], columns)

     price_and_vol = price.leftJoin(vol, key="id")
     summary = price_and_vol.summarize(
         summarizers.weighted_mean("price", "volume"))
     pdt.assert_frame_equal(summary.toPandas(), expected)
Example #14
0
 def test_addWindows(self):
     """Exercise ``addWindows`` with a 50ns past window on ``vol``.

     The expected ``window_past_50ns`` column holds lists of the rows
     returned by ``vol.collect()``, selected by consecutive index ranges.
     """
     from ts.flint import windows

     vol = self.vol()
     rows = vol.collect()

     def window(start, stop):
         # Rows at indices start, start + 1, ..., stop - 1 as a new list.
         return [rows[i] for i in range(start, stop)]

     expected = test_utils.make_pdf(
         [
             (1000, 7, 100, window(0, 2)),
             (1000, 3, 200, window(0, 2)),
             (1050, 3, 300, window(0, 4)),
             (1050, 7, 400, window(0, 4)),
             (1100, 3, 500, window(2, 6)),
             (1100, 7, 600, window(2, 6)),
             (1150, 3, 700, window(4, 8)),
             (1150, 7, 800, window(4, 8)),
             (1200, 3, 900, window(6, 10)),
             (1200, 7, 1000, window(6, 10)),
             (1250, 3, 1100, window(8, 12)),
             (1250, 7, 1200, window(8, 12)),
         ],
         ["time", "id", "volume", "window_past_50ns"],
     )

     actual = vol.addWindows(windows.past_absolute_time("50ns")).toPandas()
     test_utils.assert_same(actual, expected)
Example #15
0
 def vol3(self):
     """Load ``VOL3_DATA`` (columns time, id, volume) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(VOL3_DATA, ["time", "id", "volume"])
     return reader.pandas(frame)
Example #16
0
 def forecast(self):
     """Load ``FORECAST_DATA`` (columns time, id, forecast) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(FORECAST_DATA, ["time", "id", "forecast"])
     return reader.pandas(frame)
Example #17
0
 def price2(self):
     """Load ``PRICE2_DATA`` (columns time, id, price) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(PRICE2_DATA, ["time", "id", "price"])
     return reader.pandas(frame)
Example #18
0
 def intervals(self):
     """Load ``INTERVALS_DATA`` (single ``time`` column) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(INTERVALS_DATA, ['time'])
     return reader.pandas(frame)
Example #19
0
    def test_leftJoin(self):
        """Exercise ``leftJoin`` of price against volume, keyed on ``id``.

        First the full ``vol`` frame is joined, with the key passed both as
        ``["id"]`` and as ``"id"``.  Then ``vol`` is filtered to drop
        ``time == 1050`` before joining; the expected ``volume`` for the
        1050 rows is ``None``.
        """
        price = self.price()
        vol = self.vol()

        # Full join: every price row has a volume.
        expected_full = test_utils.make_pdf(
            [
                (1000, 7, 0.5, 100),
                (1000, 3, 1.0, 200),
                (1050, 3, 1.5, 300),
                (1050, 7, 2.0, 400),
                (1100, 3, 2.5, 500),
                (1100, 7, 3.0, 600),
                (1150, 3, 3.5, 700),
                (1150, 7, 4.0, 800),
                (1200, 3, 4.5, 900),
                (1200, 7, 5.0, 1000),
                (1250, 3, 5.5, 1100),
                (1250, 7, 6.0, 1200),
            ],
            ["time", "id", "price", "volume"],
        )

        by_list_key = price.leftJoin(vol, key=["id"]).toPandas()
        test_utils.assert_same(by_list_key, expected_full)
        by_str_key = price.leftJoin(vol, key="id").toPandas()
        test_utils.assert_same(by_list_key, by_str_key)

        # Join against vol without the time == 1050 rows.
        expected_partial = test_utils.make_pdf(
            [
                (1000, 7, 0.5, 100),
                (1000, 3, 1.0, 200),
                (1050, 3, 1.5, None),
                (1050, 7, 2.0, None),
                (1100, 3, 2.5, 500),
                (1100, 7, 3.0, 600),
                (1150, 3, 3.5, 700),
                (1150, 7, 4.0, 800),
                (1200, 3, 4.5, 900),
                (1200, 7, 5.0, 1000),
                (1250, 3, 5.5, 1100),
                (1250, 7, 6.0, 1200),
            ],
            ["time", "id", "price", "volume"],
        )

        vol_without_1050 = vol.filter(vol.time != 1050)
        partial = price.leftJoin(vol_without_1050, key="id").toPandas()
        test_utils.assert_same(partial, expected_partial)
Example #20
0
    def test_groupByCycle(self):
        """Exercise ``groupByCycle`` on ``vol`` with no key.

        Each expected row pairs a time with a list of two
        ``(time, id, volume)`` tuples.
        """
        vol = self.vol()

        expected = test_utils.make_pdf(
            [
                (1000, [(1000, 7, 100), (1000, 3, 200)]),
                (1050, [(1050, 3, 300), (1050, 7, 400)]),
                (1100, [(1100, 3, 500), (1100, 7, 600)]),
                (1150, [(1150, 3, 700), (1150, 7, 800)]),
                (1200, [(1200, 3, 900), (1200, 7, 1000)]),
                (1250, [(1250, 3, 1100), (1250, 7, 1200)]),
            ],
            ["time", "rows"],
        )

        actual = vol.groupByCycle().toPandas()
        test_utils.assert_same(actual, expected)
Example #21
0
 def vol3(self):
     """Load ``VOL3_DATA`` (columns time, id, volume) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(VOL3_DATA, ["time", "id", "volume"])
     return reader.pandas(frame)
Example #22
0
 def price(self):
     """Load ``PRICE_DATA`` (columns time, id, price) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(PRICE_DATA, ["time", "id", "price"])
     return reader.pandas(frame)
Example #23
0
    def test_addColumnsForCycle(self):
        """Exercise ``addColumnsForCycle`` with and without a key.

        Unkeyed, on ``price``: adds ``adjustedPrice`` (DoubleType), computed
        as each row's price times the number of rows handed to the callback.

        Keyed on ``id``, on ``vol3``: adds ``totalVolume`` (LongType),
        computed as each row's volume plus the summed volume of the rows
        handed to the callback.  The key is passed both as ``["id"]`` and
        as ``"id"``.
        """
        import pyspark.sql.types as pyspark_types

        price = self.price()
        vol3 = self.vol3()

        expected_adjusted = test_utils.make_pdf(
            [
                [1000, 7, 0.5, 1.0],
                [1000, 3, 1.0, 2.0],
                [1050, 3, 1.5, 3.0],
                [1050, 7, 2.0, 4.0],
                [1100, 3, 2.5, 5.0],
                [1100, 7, 3.0, 6.0],
                [1150, 3, 3.5, 7.0],
                [1150, 7, 4.0, 8.0],
                [1200, 3, 4.5, 9.0],
                [1200, 7, 5.0, 10.0],
                [1250, 3, 5.5, 11.0],
                [1250, 7, 6.0, 12.0],
            ],
            ["time", "id", "price", "adjustedPrice"],
        )

        def scale_price_by_count(rows):
            # Map every row to its price multiplied by the row count.
            count = len(rows)
            return {row: row.price * count for row in rows}

        adjusted_spec = {
            "adjustedPrice": (pyspark_types.DoubleType(), scale_price_by_count)
        }
        actual_adjusted = price.addColumnsForCycle(adjusted_spec).toPandas()
        test_utils.assert_same(actual_adjusted, expected_adjusted)

        expected_total = test_utils.make_pdf(
            [
                [1000, 7, 100, 301],
                [1000, 7, 101, 302],
                [1000, 3, 200, 601],
                [1000, 3, 201, 602],
                [1050, 7, 400, 1201],
                [1050, 7, 401, 1202],
                [1050, 3, 300, 901],
                [1050, 3, 301, 902],
                [1100, 7, 600, 1801],
                [1100, 7, 601, 1802],
                [1100, 3, 500, 1501],
                [1100, 3, 501, 1502],
                [1150, 7, 800, 2401],
                [1150, 7, 801, 2402],
                [1150, 3, 700, 2101],
                [1150, 3, 701, 2102],
                [1200, 7, 1000, 3001],
                [1200, 7, 1001, 3002],
                [1200, 3, 900, 2701],
                [1200, 3, 901, 2702],
                [1250, 7, 1200, 3601],
                [1250, 7, 1201, 3602],
                [1250, 3, 1100, 3301],
                [1250, 3, 1101, 3302],
            ],
            ["time", "id", "volume", "totalVolume"],
        )

        def add_group_volume(rows):
            # Map every row to its own volume plus the total over all rows.
            group_total = sum(row.volume for row in rows)
            return {row: row.volume + group_total for row in rows}

        def total_volume_keyed_by(key):
            # A fresh column spec is built for every call.
            spec = {
                "totalVolume": (pyspark_types.LongType(), add_group_volume)
            }
            return vol3.addColumnsForCycle(spec, key=key).toPandas()

        by_list_key = total_volume_keyed_by(["id"])

        # The key may be given as a list or as a plain string.
        test_utils.assert_same(by_list_key, total_volume_keyed_by("id"))

        # XXX: should just do
        # test_utils.assert_same(by_list_key, expected_total, "with key")
        # once https://gitlab.twosigma.com/analytics/huohua/issues/26
        # gets resolved.
        for key, label in ((3, "with key 3"), (7, "with key 7")):
            actual_slice = by_list_key[by_list_key['id'] == key]
            expected_slice = expected_total[expected_total['id'] == key]
            test_utils.assert_same(
                actual_slice.reset_index(drop=True),
                expected_slice.reset_index(drop=True),
                label)
Example #24
0
 def intervals(self):
     """Load ``INTERVALS_DATA`` (single ``time`` column) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(INTERVALS_DATA, ['time'])
     return reader.pandas(frame)
Example #25
0
 def forecast(self):
     """Load ``FORECAST_DATA`` (columns time, id, forecast) through ``self.flintContext.read.pandas``."""
     reader = self.flintContext.read
     frame = test_utils.make_pdf(FORECAST_DATA, ["time", "id", "forecast"])
     return reader.pandas(frame)