def test_summarizeIntervals(self):
    """summarizeIntervals aggregates rows into clock intervals, un-keyed and keyed by "id"."""
    from ts.flint import summarizers

    vol = self.vol()
    # A clock dataframe defining the interval boundaries.
    clock = self.flintContext.read.pandas(
        test_utils.make_pdf([
            (1000, ),
            (1100, ),
            (1200, ),
            (1300, ),
        ], ["time"]))

    # Un-keyed: one summed volume per interval.
    result_unkeyed = vol.summarizeIntervals(
        clock, summarizers.sum("volume")).toPandas()
    expected_unkeyed = test_utils.make_pdf([
        (1000, 1000.0),
        (1100, 2600.0),
        (1200, 4200.0),
    ], ["time", "volume_sum"])
    test_utils.assert_same(result_unkeyed, expected_unkeyed)

    # Keyed by "id": one summed volume per (interval, id) pair.
    result_keyed = vol.summarizeIntervals(
        clock, summarizers.sum("volume"), key="id").toPandas()
    expected_keyed = test_utils.make_pdf([
        (1000, 7, 500.0),
        (1000, 3, 500.0),
        (1100, 3, 1200.0),
        (1100, 7, 1400.0),
        (1200, 3, 2000.0),
        (1200, 7, 2200.0),
    ], ["time", "id", "volume_sum"])
    test_utils.assert_same(result_keyed, expected_keyed)
def test_summarizeCycles(self):
    """summarizeCycles aggregates all rows sharing a timestamp, un-keyed and keyed by "id"."""
    from ts.flint import summarizers

    vol = self.vol()
    vol2 = self.vol2()

    # Un-keyed: one summed volume per cycle (distinct timestamp).
    expected_unkeyed = test_utils.make_pdf([
        (1000, 300.0),
        (1050, 700.0),
        (1100, 1100.0),
        (1150, 1500.0),
        (1200, 1900.0),
        (1250, 2300.0),
    ], ["time", "volume_sum"])
    result_unkeyed = vol.summarizeCycles(summarizers.sum("volume")).toPandas()
    test_utils.assert_same(result_unkeyed, expected_unkeyed)

    # Keyed by "id": one summed volume per (cycle, id) pair.
    expected_keyed = test_utils.make_pdf([
        (1000, 7, 200.0),
        (1000, 3, 400.0),
        (1050, 3, 600.0),
        (1050, 7, 800.0),
        (1100, 3, 1000.0),
        (1100, 7, 1200.0),
        (1150, 3, 1400.0),
        (1150, 7, 1600.0),
        (1200, 3, 1800.0),
        (1200, 7, 2000.0),
        (1250, 3, 2200.0),
        (1250, 7, 2400.0),
    ], ["time", "id", "volume_sum"])
    result_keyed = vol2.summarizeCycles(summarizers.sum("volume"),
                                        key="id").toPandas()
    test_utils.assert_same(result_keyed, expected_keyed)
def test_groupByInterval(self):
    """groupByInterval collects rows into per-interval lists; key may be a list or a string."""
    vol = self.vol()
    intervals = self.intervals()
    # Renamed from `id` — the original shadowed the `id` builtin.
    collected = vol.collect()
    expected_pdf = test_utils.make_pdf([
        (1000, 7, [collected[0], collected[3]]),
        (1000, 3, [collected[1], collected[2]]),
        (1100, 7, [collected[5], collected[7]]),
        (1100, 3, [collected[4], collected[6]]),
        (1200, 7, [collected[9], collected[11]]),
        (1200, 3, [collected[8], collected[10]]),
    ], ["time", "id", "rows"])

    # key=["id"] and key="id" must be equivalent.
    new_pdf = vol.groupByInterval(intervals, key=["id"]).toPandas()
    new_pdf1 = vol.groupByInterval(intervals, key="id").toPandas()
    test_utils.assert_same(new_pdf, new_pdf1)

    # XXX: should just do
    # test_utils.assert_same(new_pdf, expected_pdf)
    # once https://gitlab.twosigma.com/analytics/huohua/issues/26
    # gets resolved.
    test_utils.assert_same(
        new_pdf[new_pdf['id'] == 3].reset_index(drop=True),
        expected_pdf[expected_pdf['id'] == 3].reset_index(drop=True),
    )
    test_utils.assert_same(
        new_pdf[new_pdf['id'] == 7].reset_index(drop=True),
        expected_pdf[expected_pdf['id'] == 7].reset_index(drop=True),
    )
def test_futureLeftJoin(self):
    """futureLeftJoin matches each row with the next future row within tolerance, per id."""
    import pyspark.sql.types as pyspark_types

    price = self.price()
    vol = self.vol()

    # Last cycle has no strictly-future match, hence the Nones.
    expected_pdf = test_utils.make_pdf([
        (1000, 7, 0.5, 400, 1050),
        (1000, 3, 1.0, 300, 1050),
        (1050, 3, 1.5, 500, 1100),
        (1050, 7, 2.0, 600, 1100),
        (1100, 3, 2.5, 700, 1150),
        (1100, 7, 3.0, 800, 1150),
        (1150, 3, 3.5, 900, 1200),
        (1150, 7, 4.0, 1000, 1200),
        (1200, 3, 4.5, 1100, 1250),
        (1200, 7, 5.0, 1200, 1250),
        (1250, 3, 5.5, None, None),
        (1250, 7, 6.0, None, None),
    ], ["time", "id", "price", "volume", "time2"])

    # Carry the join-partner's timestamp along as a plain long column.
    vol_with_time2 = vol.withColumn(
        "time2", vol.time.cast(pyspark_types.LongType()))

    joined_list_key = price.futureLeftJoin(
        vol_with_time2,
        tolerance=pd.Timedelta("100ns"),
        key=["id"],
        strict_lookahead=True).toPandas()
    joined_str_key = price.futureLeftJoin(
        vol_with_time2,
        tolerance=pd.Timedelta("100ns"),
        key="id",
        strict_lookahead=True).toPandas()

    # key=["id"] and key="id" must be equivalent, and both must match expectations.
    test_utils.assert_same(joined_list_key, joined_str_key)
    test_utils.assert_same(joined_list_key, expected_pdf)
def test_summary_min(self):
    """summarize with the min summarizer reduces the frame to its smallest forecast."""
    from ts.flint import summarizers

    forecast = self.forecast()
    expected = test_utils.make_pdf(
        [(0, -9.6)], ["time", "forecast_min"])
    actual = forecast.summarize(summarizers.min("forecast")).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_quantile(self):
    """summarize with the quantile summarizer emits one column per requested quantile."""
    from ts.flint import summarizers

    forecast = self.forecast()
    expected = test_utils.make_pdf(
        [(0, -2.22, 1.75)],
        ["time", "forecast_0.2quantile", "forecast_0.5quantile"])
    actual = forecast.summarize(
        summarizers.quantile(self.sc, "forecast", (0.2, 0.5))).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_sum(self):
    """summarize with the sum summarizer, un-keyed and keyed (list-key == string-key)."""
    from ts.flint import summarizers

    vol = self.vol()

    # Un-keyed: a single grand total.
    expected_total = test_utils.make_pdf(
        [(0, 7800.0)], ["time", "volume_sum"])
    actual = vol.summarize(summarizers.sum("volume")).toPandas()
    test_utils.assert_same(actual, expected_total)

    # Keyed: one total per id.
    expected_by_id = test_utils.make_pdf([
        (0, 7, 4100.0),
        (0, 3, 3700.0),
    ], ["time", "id", "volume_sum"])
    actual_list_key = vol.summarize(summarizers.sum("volume"),
                                    key=["id"]).toPandas()
    actual_str_key = vol.summarize(summarizers.sum("volume"),
                                   key="id").toPandas()
    test_utils.assert_same(actual_list_key, actual_str_key)

    # XXX: should just do:
    # test_utils.assert_same(new_pdf, expected_pdf, "by id")
    # once https://gitlab.twosigma.com/analytics/huohua/issues/26
    # gets resolved.
    test_utils.assert_same(
        actual_list_key[actual_list_key['id'] == 3].reset_index(drop=True),
        expected_by_id[expected_by_id['id'] == 3].reset_index(drop=True),
        "by id 3")
    test_utils.assert_same(
        actual_list_key[actual_list_key['id'] == 7].reset_index(drop=True),
        expected_by_id[expected_by_id['id'] == 7].reset_index(drop=True),
        "by id 7")
def test_summary_zscore(self):
    """summarize with the zscore summarizer, both in-sample and out-of-sample."""
    from ts.flint import summarizers

    price = self.price()

    expected_pdf = test_utils.make_pdf(
        [(0, 1.5254255396193801)], ["time", "price_zScore"])
    new_pdf = price.summarize(summarizers.zscore(
        "price", in_sample=True)).toPandas()
    test_utils.assert_same(new_pdf, expected_pdf, "in-sample")

    expected_pdf = test_utils.make_pdf(
        [(0, 1.8090680674665818)], ["time", "price_zScore"])
    new_pdf = price.summarize(summarizers.zscore(
        "price", in_sample=False)).toPandas()
    # Fixed assertion message: was "out-of-sample)" with a stray paren.
    test_utils.assert_same(new_pdf, expected_pdf, "out-of-sample")
def test_summary_variance(self):
    """summarize with the variance summarizer on a left-joined frame."""
    from ts.flint import summarizers

    price = self.price()
    forecast = self.forecast()
    expected = test_utils.make_pdf(
        [(0, 3.25)], ["time", "price_variance"])

    # The join only widens the frame; variance is still over "price".
    joined = price.leftJoin(forecast, key="id")
    actual = joined.summarize(summarizers.variance("price")).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summarizeWindows(self):
    """summarizeWindows sums over a trailing 99ns window, un-keyed and keyed by "id"."""
    from ts.flint import windows
    from ts.flint import summarizers

    vol = self.vol()
    trailing = windows.past_absolute_time('99ns')

    # Un-keyed: window spans rows of every id.
    result_unkeyed = vol.summarizeWindows(
        trailing, summarizers.sum("volume")).toPandas()
    expected_unkeyed = test_utils.make_pdf([
        (1000, 7, 100, 300.0),
        (1000, 3, 200, 300.0),
        (1050, 3, 300, 1000.0),
        (1050, 7, 400, 1000.0),
        (1100, 3, 500, 1800.0),
        (1100, 7, 600, 1800.0),
        (1150, 3, 700, 2600.0),
        (1150, 7, 800, 2600.0),
        (1200, 3, 900, 3400.0),
        (1200, 7, 1000, 3400.0),
        (1250, 3, 1100, 4200.0),
        (1250, 7, 1200, 4200.0),
    ], ["time", "id", "volume", "volume_sum"])
    test_utils.assert_same(result_unkeyed, expected_unkeyed)

    # Keyed: each id's window only sees its own rows.
    result_keyed = vol.summarizeWindows(
        trailing, summarizers.sum("volume"), key="id").toPandas()
    expected_keyed = test_utils.make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 500.0),
        (1050, 7, 400, 500.0),
        (1100, 3, 500, 800.0),
        (1100, 7, 600, 1000.0),
        (1150, 3, 700, 1200.0),
        (1150, 7, 800, 1400.0),
        (1200, 3, 900, 1600.0),
        (1200, 7, 1000, 1800.0),
        (1250, 3, 1100, 2000.0),
        (1250, 7, 1200, 2200.0),
    ], ["time", "id", "volume", "volume_sum"])
    test_utils.assert_same(result_keyed, expected_keyed)
def test_addSummaryColumns(self):
    """addSummaryColumns appends a running cumulative sum, un-keyed and keyed by "id"."""
    from ts.flint import summarizers

    vol = self.vol()

    # Un-keyed: a single running total across every row.
    expected_running = test_utils.make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 300.0),
        (1050, 3, 300, 600.0),
        (1050, 7, 400, 1000.0),
        (1100, 3, 500, 1500.0),
        (1100, 7, 600, 2100.0),
        (1150, 3, 700, 2800.0),
        (1150, 7, 800, 3600.0),
        (1200, 3, 900, 4500.0),
        (1200, 7, 1000, 5500.0),
        (1250, 3, 1100, 6600.0),
        (1250, 7, 1200, 7800.0),
    ], ["time", "id", "volume", "volume_sum"])
    actual = vol.addSummaryColumns(summarizers.sum("volume")).toPandas()
    test_utils.assert_same(actual, expected_running)

    # Keyed: an independent running total per id.
    expected_running_keyed = test_utils.make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 500.0),
        (1050, 7, 400, 500.0),
        (1100, 3, 500, 1000.0),
        (1100, 7, 600, 1100.0),
        (1150, 3, 700, 1700.0),
        (1150, 7, 800, 1900.0),
        (1200, 3, 900, 2600.0),
        (1200, 7, 1000, 2900.0),
        (1250, 3, 1100, 3700.0),
        (1250, 7, 1200, 4100.0),
    ], ["time", "id", "volume", "volume_sum"])
    actual = vol.addSummaryColumns(summarizers.sum("volume"),
                                   "id").toPandas()
    test_utils.assert_same(actual, expected_running_keyed, "with key")
def test_summary_compose(self):
    """summarize accepts a list of summarizers and emits one column for each."""
    from ts.flint import summarizers

    price = self.price()
    expected = test_utils.make_pdf(
        [(0, 6.0, 0.5, 3.25, 1.802775638)],
        ["time", "price_max", "price_min", "price_mean", "price_stddev"])

    composed = [
        summarizers.max("price"),
        summarizers.min("price"),
        summarizers.mean("price"),
        summarizers.stddev("price"),
    ]
    actual = price.summarize(composed).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_summary_weighted_mean(self):
    """summarize with weighted_mean: price weighted by volume on a joined frame."""
    from ts.flint import summarizers

    price = self.price()
    vol = self.vol()
    expected = test_utils.make_pdf(
        [(0, 4.166667, 1.547494, 8.237545, 12)],
        [
            "time", "price_volume_weightedMean",
            "price_volume_weightedStandardDeviation",
            "price_volume_weightedTStat",
            "price_volume_observationCount"
        ])

    joined = price.leftJoin(vol, key="id")
    actual = joined.summarize(summarizers.weighted_mean(
        "price", "volume")).toPandas()
    pdt.assert_frame_equal(actual, expected)
def test_addWindows(self):
    """addWindows attaches, per row, the list of rows inside the trailing 50ns window."""
    from ts.flint import windows

    vol = self.vol()
    # Renamed from `id` — the original shadowed the `id` builtin.
    collected = vol.collect()
    expected_pdf = test_utils.make_pdf([
        (1000, 7, 100, [collected[0], collected[1]]),
        (1000, 3, 200, [collected[0], collected[1]]),
        (1050, 3, 300, [collected[0], collected[1], collected[2], collected[3]]),
        (1050, 7, 400, [collected[0], collected[1], collected[2], collected[3]]),
        (1100, 3, 500, [collected[2], collected[3], collected[4], collected[5]]),
        (1100, 7, 600, [collected[2], collected[3], collected[4], collected[5]]),
        (1150, 3, 700, [collected[4], collected[5], collected[6], collected[7]]),
        (1150, 7, 800, [collected[4], collected[5], collected[6], collected[7]]),
        (1200, 3, 900, [collected[6], collected[7], collected[8], collected[9]]),
        (1200, 7, 1000, [collected[6], collected[7], collected[8], collected[9]]),
        (1250, 3, 1100, [collected[8], collected[9], collected[10], collected[11]]),
        (1250, 7, 1200, [collected[8], collected[9], collected[10], collected[11]]),
    ], ["time", "id", "volume", "window_past_50ns"])
    new_pdf = vol.addWindows(windows.past_absolute_time("50ns")).toPandas()
    test_utils.assert_same(new_pdf, expected_pdf)
def vol3(self):
    """Flint dataframe built from VOL3_DATA with columns (time, id, volume)."""
    pdf = test_utils.make_pdf(VOL3_DATA, ["time", "id", "volume"])
    return self.flintContext.read.pandas(pdf)
def forecast(self):
    """Flint dataframe built from FORECAST_DATA with columns (time, id, forecast)."""
    pdf = test_utils.make_pdf(FORECAST_DATA, ["time", "id", "forecast"])
    return self.flintContext.read.pandas(pdf)
def price2(self):
    """Flint dataframe built from PRICE2_DATA with columns (time, id, price)."""
    pdf = test_utils.make_pdf(PRICE2_DATA, ["time", "id", "price"])
    return self.flintContext.read.pandas(pdf)
def intervals(self):
    """Flint dataframe built from INTERVALS_DATA with a single 'time' column."""
    pdf = test_utils.make_pdf(INTERVALS_DATA, ['time'])
    return self.flintContext.read.pandas(pdf)
def test_leftJoin(self):
    """leftJoin matches rows by (time, id); unmatched left rows keep None values."""
    price = self.price()
    vol = self.vol()

    # Full match: every price row finds its volume.
    expected_full = test_utils.make_pdf([
        (1000, 7, 0.5, 100),
        (1000, 3, 1.0, 200),
        (1050, 3, 1.5, 300),
        (1050, 7, 2.0, 400),
        (1100, 3, 2.5, 500),
        (1100, 7, 3.0, 600),
        (1150, 3, 3.5, 700),
        (1150, 7, 4.0, 800),
        (1200, 3, 4.5, 900),
        (1200, 7, 5.0, 1000),
        (1250, 3, 5.5, 1100),
        (1250, 7, 6.0, 1200),
    ], ["time", "id", "price", "volume"])
    joined = price.leftJoin(vol, key=["id"]).toPandas()
    test_utils.assert_same(joined, expected_full)
    # key=["id"] and key="id" must be equivalent.
    test_utils.assert_same(joined,
                           price.leftJoin(vol, key="id").toPandas())

    # Partial match: rows at t=1050 removed from the right side leave Nones.
    expected_partial = test_utils.make_pdf([
        (1000, 7, 0.5, 100),
        (1000, 3, 1.0, 200),
        (1050, 3, 1.5, None),
        (1050, 7, 2.0, None),
        (1100, 3, 2.5, 500),
        (1100, 7, 3.0, 600),
        (1150, 3, 3.5, 700),
        (1150, 7, 4.0, 800),
        (1200, 3, 4.5, 900),
        (1200, 7, 5.0, 1000),
        (1250, 3, 5.5, 1100),
        (1250, 7, 6.0, 1200),
    ], ["time", "id", "price", "volume"])
    joined = price.leftJoin(vol.filter(vol.time != 1050),
                            key="id").toPandas()
    test_utils.assert_same(joined, expected_partial)
def test_groupByCycle(self):
    """groupByCycle collects all rows sharing a timestamp into a single 'rows' list."""
    vol = self.vol()
    expected = test_utils.make_pdf([
        (1000, [(1000, 7, 100), (1000, 3, 200)]),
        (1050, [(1050, 3, 300), (1050, 7, 400)]),
        (1100, [(1100, 3, 500), (1100, 7, 600)]),
        (1150, [(1150, 3, 700), (1150, 7, 800)]),
        (1200, [(1200, 3, 900), (1200, 7, 1000)]),
        (1250, [(1250, 3, 1100), (1250, 7, 1200)]),
    ], ["time", "rows"])
    actual = vol.groupByCycle().toPandas()
    test_utils.assert_same(actual, expected)
def price(self):
    """Flint dataframe built from PRICE_DATA with columns (time, id, price)."""
    pdf = test_utils.make_pdf(PRICE_DATA, ["time", "id", "price"])
    return self.flintContext.read.pandas(pdf)
def test_addColumnsForCycle(self):
    """addColumnsForCycle computes new columns from all rows in a cycle, un-keyed and keyed."""
    import pyspark.sql.types as pyspark_types

    price = self.price()
    vol3 = self.vol3()

    # Un-keyed: scale each price by the number of rows in its cycle.
    expected_pdf = test_utils.make_pdf([
        [1000, 7, 0.5, 1.0],
        [1000, 3, 1.0, 2.0],
        [1050, 3, 1.5, 3.0],
        [1050, 7, 2.0, 4.0],
        [1100, 3, 2.5, 5.0],
        [1100, 7, 3.0, 6.0],
        [1150, 3, 3.5, 7.0],
        [1150, 7, 4.0, 8.0],
        [1200, 3, 4.5, 9.0],
        [1200, 7, 5.0, 10.0],
        [1250, 3, 5.5, 11.0],
        [1250, 7, 6.0, 12.0],
    ], ["time", "id", "price", "adjustedPrice"])

    def fn_1(rows):
        # Map each row to its price scaled by the cycle size.
        size = len(rows)
        return {row: row.price * size for row in rows}

    new_pdf = price.addColumnsForCycle({
        "adjustedPrice": (pyspark_types.DoubleType(), fn_1)
    }).toPandas()
    test_utils.assert_same(new_pdf, expected_pdf)

    # Keyed by "id": each row gets its volume plus the cycle's per-id total.
    expected_pdf = test_utils.make_pdf([
        [1000, 7, 100, 301],
        [1000, 7, 101, 302],
        [1000, 3, 200, 601],
        [1000, 3, 201, 602],
        [1050, 7, 400, 1201],
        [1050, 7, 401, 1202],
        [1050, 3, 300, 901],
        [1050, 3, 301, 902],
        [1100, 7, 600, 1801],
        [1100, 7, 601, 1802],
        [1100, 3, 500, 1501],
        [1100, 3, 501, 1502],
        [1150, 7, 800, 2401],
        [1150, 7, 801, 2402],
        [1150, 3, 700, 2101],
        [1150, 3, 701, 2102],
        [1200, 7, 1000, 3001],
        [1200, 7, 1001, 3002],
        [1200, 3, 900, 2701],
        [1200, 3, 901, 2702],
        [1250, 7, 1200, 3601],
        [1250, 7, 1201, 3602],
        [1250, 3, 1100, 3301],
        [1250, 3, 1101, 3302],
    ], ["time", "id", "volume", "totalVolume"])

    def fn_2(rows):
        # Map each row to its volume plus the group's total volume.
        volsum = sum([row.volume for row in rows])
        return {row: row.volume + volsum for row in rows}

    new_pdf = vol3.addColumnsForCycle(
        {
            "totalVolume": (pyspark_types.LongType(), fn_2)
        }, key=["id"]).toPandas()

    # Test API to support key as list.
    test_utils.assert_same(
        new_pdf,
        vol3.addColumnsForCycle(
            {
                "totalVolume": (pyspark_types.LongType(), fn_2)
            }, key="id").toPandas())

    # XXX: should just do
    # test_utils.assert_same(new_pdf, expected_pdf, "with key")
    # once https://gitlab.twosigma.com/analytics/huohua/issues/26
    # gets resolved.
    test_utils.assert_same(
        new_pdf[new_pdf['id'] == 3].reset_index(drop=True),
        expected_pdf[expected_pdf['id'] == 3].reset_index(drop=True),
        "with key 3")
    test_utils.assert_same(
        new_pdf[new_pdf['id'] == 7].reset_index(drop=True),
        expected_pdf[expected_pdf['id'] == 7].reset_index(drop=True),
        "with key 7")