Esempio n. 1
0
    def test_upsample(self):
        """Verify one-minute floor resampling and bar calculation on a TSDF.

        Seven trades for symbol ``S1`` are resampled with ``freq="min"`` and
        ``func="floor"`` and summarised with ``calc_bars``; each result is
        compared with a hand-written expected DataFrame.
        """

        def make_schema(*columns):
            # Each column is a (name, type class) pair; the type class is
            # instantiated here so call sites stay compact.
            return StructType(
                [StructField(name, dtype()) for name, dtype in columns])

        input_schema = make_schema(
            ("symbol", StringType),
            ("date", StringType),
            ("event_ts", StringType),
            ("trade_pr", FloatType),
            ("trade_pr_2", FloatType),
        )
        floor_schema = make_schema(
            ("symbol", StringType),
            ("event_ts", StringType),
            ("floor_trade_pr", FloatType),
            ("floor_date", StringType),
            ("floor_trade_pr_2", FloatType),
        )
        # Bar columns: close/high/low/open, each for trade_pr then trade_pr_2.
        bar_columns = [
            (stat + "_" + metric, FloatType)
            for stat in ("close", "high", "low", "open")
            for metric in ("trade_pr", "trade_pr_2")
        ]
        bars_schema = make_schema(
            ("symbol", StringType),
            ("event_ts", StringType),
            *bar_columns
        )

        # Input trades: two in minute 00:00 and three in minute 00:01 of
        # 2020-08-01, then two isolated trades on 2020-09-01.
        trades = [
            ["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
            ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
            ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0],
        ]

        # Expected floor output: one row per minute bucket, carrying the
        # values of the earliest trade in that bucket.
        floor_rows = [
            ["S1", "2020-08-01 00:00:00", 349.21, "SAME_DT", 10.0],
            ["S1", "2020-08-01 00:01:00", 353.32, "SAME_DT", 8.0],
            ["S1", "2020-09-01 00:01:00", 361.1, "SAME_DT", 5.0],
            ["S1", "2020-09-01 00:19:00", 362.1, "SAME_DT", 4.0],
        ]

        # Expected bars, laid out as:
        # symbol, event_ts, close x2, high x2, low x2, open x2.
        bar_rows = [
            ["S1", "2020-08-01 00:00:00",
             340.21, 9.0, 349.21, 10.0, 340.21, 9.0, 349.21, 10.0],
            ["S1", "2020-08-01 00:01:00",
             350.32, 6.0, 353.32, 8.0, 350.32, 6.0, 353.32, 8.0],
            ["S1", "2020-09-01 00:01:00",
             361.1, 5.0, 361.1, 5.0, 361.1, 5.0, 361.1, 5.0],
            ["S1", "2020-09-01 00:19:00",
             362.1, 4.0, 362.1, 4.0, 362.1, 4.0, 362.1, 4.0],
        ]

        # Build the input and expected DataFrames.
        trades_df = self.buildTestDF(input_schema, trades)
        expected_floor_df = self.buildTestDF(floor_schema, floor_rows)
        expected_bars_df = self.buildTestDF(bars_schema, bar_rows)

        # Wrap the input as a TSDF partitioned by symbol.
        trades_tsdf = TSDF(trades_df, partition_cols=["symbol"])

        # Resample to one-minute buckets with the "floor" function.
        floored = trades_tsdf.resample(freq="min", func="floor")
        actual_floor_df = floored.df

        # One-minute bars over both price columns.
        bars_tsdf = trades_tsdf.calc_bars(
            freq="min", metricCols=["trade_pr", "trade_pr_2"])
        actual_bars_df = bars_tsdf.df

        # The resampled frame must match the expected floor rows.
        self.assertDataFramesEqual(actual_floor_df, expected_floor_df)

        # The bars frame must match the expected bar rows.
        self.assertDataFramesEqual(actual_bars_df, expected_bars_df)
Esempio n. 2
0
    def test_upsample(self):
        """Verify 5-minute mean resampling with fill, and one-minute bars.

        Seven trades for symbol ``S1`` are resampled with
        ``freq="5 minutes"``, ``func="mean"`` and ``fill=True``; four selected
        buckets of that result are compared with hand-written expected rows.
        The same input is also summarised with ``calc_bars`` at one-minute
        frequency and compared with the expected bars.
        """
        schema = StructType([
            StructField("symbol", StringType()),
            StructField("date", StringType()),
            StructField("event_ts", StringType()),
            StructField("trade_pr", FloatType()),
            StructField("trade_pr_2", FloatType())
        ])

        # Every non-key column is expected as a double after the resample,
        # including "date", which is a string in the input.
        # NOTE(review): presumably the mean of a string column is null and
        # fill=True turns it into 0.0 -- confirm against TSDF.resample.
        expected_5m_schema = StructType([
            StructField("symbol", StringType()),
            StructField("event_ts", StringType()),
            StructField("date", DoubleType()),
            StructField("trade_pr", DoubleType()),
            StructField("trade_pr_2", DoubleType())
        ])

        expectedBarsSchema = StructType([
            StructField("symbol", StringType()),
            StructField("event_ts", StringType()),
            StructField("close_trade_pr", FloatType()),
            StructField("close_trade_pr_2", FloatType()),
            StructField("high_trade_pr", FloatType()),
            StructField("high_trade_pr_2", FloatType()),
            StructField("low_trade_pr", FloatType()),
            StructField("low_trade_pr_2", FloatType()),
            StructField("open_trade_pr", FloatType()),
            StructField("open_trade_pr_2", FloatType())
        ])

        data = [["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
                ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
                ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
                ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
                ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
                ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
                ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0]]

        # Row 1: mean of the five 2020-08-01 trades (348.876 -> 348.88 after
        # rounding; trade_pr_2 averages to 8.0).
        # Row 2: a bucket with no trades, expected to be all zeros.
        # Rows 3-4: buckets holding a single trade each.
        expected_data_5m = [["S1", "2020-08-01 00:00:00", 0.0, 348.88, 8.0],
                            ["S1", "2020-08-01 00:05:00", 0.0, 0.0, 0.0],
                            ["S1", "2020-09-01 00:00:00", 0.0, 361.1, 5.0],
                            ["S1", "2020-09-01 00:15:00", 0.0, 362.1, 4.0]]

        # Column order per row:
        # symbol, event_ts, close x2, high x2, low x2, open x2.
        expected_bars = [[
            'S1', '2020-08-01 00:00:00', 340.21, 9.0, 349.21, 10.0, 340.21,
            9.0, 349.21, 10.0
        ],
                         [
                             'S1', '2020-08-01 00:01:00', 350.32, 6.0, 353.32,
                             8.0, 350.32, 6.0, 353.32, 8.0
                         ],
                         [
                             'S1', '2020-09-01 00:01:00', 361.1, 5.0, 361.1,
                             5.0, 361.1, 5.0, 361.1, 5.0
                         ],
                         [
                             'S1', '2020-09-01 00:19:00', 362.1, 4.0, 362.1,
                             4.0, 362.1, 4.0, 362.1, 4.0
                         ]]

        # construct dataframes
        df = self.buildTestDF(schema, data)
        expected_5m_df = self.buildTestDF(expected_5m_schema,
                                          expected_data_5m)
        barsExpected = self.buildTestDF(expectedBarsSchema, expected_bars)

        # convert to TSDF
        tsdf_left = TSDF(df, partition_cols=["symbol"])

        # 5-minute mean resample with fill; trade_pr is rounded to two
        # decimals so it can be compared with the literal expected values.
        resampled_5m = tsdf_left.resample(freq="5 minutes",
                                          func="mean",
                                          fill=True).df.withColumn(
                                              "trade_pr",
                                              F.round(F.col('trade_pr'), 2))

        bars = tsdf_left.calc_bars(freq='min',
                                   metricCols=['trade_pr', 'trade_pr_2']).df

        # Only these four buckets are checked.
        # NOTE(review): with fill=True the result presumably contains many
        # more buckets between the first and last trade -- confirm.
        upsampled = resampled_5m.filter(
            F.col("event_ts").isin('2020-08-01 00:00:00',
                                   '2020-08-01 00:05:00',
                                   '2020-09-01 00:00:00',
                                   '2020-09-01 00:15:00'))

        # test upsample summary
        self.assertDataFramesEqual(upsampled, expected_5m_df)

        # test bars summary
        self.assertDataFramesEqual(bars, barsExpected)