def buildTestingDataFrame(self):
    schema = StructType([
        StructField("partition_a", StringType()),
        StructField("partition_b", StringType()),
        StructField("event_ts", StringType()),
        StructField("value_a", FloatType()),
        StructField("value_b", FloatType()),
    ])

    self.expected_schema = StructType([
        StructField("partition_a", StringType()),
        StructField("partition_b", StringType()),
        StructField("event_ts", StringType(), False),
        StructField("value_a", DoubleType()),
        StructField("value_b", DoubleType()),
        StructField("is_ts_interpolated", BooleanType(), False),
        StructField("is_interpolated_value_a", BooleanType(), False),
        StructField("is_interpolated_value_b", BooleanType(), False),
    ])

    # TODO: this multi-partition data set is not yet exercised by a test; add coverage for it.
    data = [
        ["A", "A-1", "2020-01-01 00:01:10", 349.21, None],
        ["A", "A-1", "2020-01-01 00:02:03", None, 4.0],
        ["A", "A-2", "2020-01-01 00:01:15", 340.21, 9.0],
        ["B", "B-1", "2020-01-01 00:01:15", 362.1, 4.0],
        ["A", "A-2", "2020-01-01 00:01:17", 353.32, 8.0],
        ["B", "B-2", "2020-01-01 00:02:14", None, 6.0],
        ["A", "A-1", "2020-01-01 00:03:02", 351.32, 7.0],
        ["B", "B-2", "2020-01-01 00:01:12", 361.1, 5.0],
    ]

    simple_data = [
        ["A", "A-1", "2020-01-01 00:00:10", 0.0, None],
        ["A", "A-1", "2020-01-01 00:01:10", 2.0, 2.0],
        ["A", "A-1", "2020-01-01 00:01:32", None, None],
        ["A", "A-1", "2020-01-01 00:02:03", None, None],
        ["A", "A-1", "2020-01-01 00:03:32", None, 7.0],
        ["A", "A-1", "2020-01-01 00:04:12", 8.0, 8.0],
        ["A", "A-1", "2020-01-01 00:05:31", 11.0, None],
    ]

    # construct dataframes
    self.input_df = self.buildTestDF(schema, data)
    self.simple_input_df = self.buildTestDF(schema, simple_data)

    # generate TSDFs
    self.input_tsdf = TSDF(
        self.input_df,
        partition_cols=["partition_a", "partition_b"],
        ts_col="event_ts",
    )
    self.simple_input_tsdf = TSDF(
        self.simple_input_df,
        partition_cols=["partition_a", "partition_b"],
        ts_col="event_ts",
    )

    # register interpolation helper
    self.interpolate_helper = Interpolation(is_resampled=False)
def test_resample_millis(self):
    """Test of resampling for millisecond windows"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("floor_trade_pr", FloatType()),
        StructField("floor_date", StringType()),
        StructField("floor_trade_pr_2", FloatType())
    ])
    expectedSchemaMS = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType(), True),
        StructField("date", DoubleType()),
        StructField("trade_pr", DoubleType()),
        StructField("trade_pr_2", DoubleType())
    ])

    data = [["S1", "SAME_DT", "2020-08-01 00:00:10.12345", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:10.123", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:10.124", 353.32, 8.0]]

    expected_data_ms = [["S1", "2020-08-01 00:00:10.123", None, 344.71, 9.5],
                        ["S1", "2020-08-01 00:00:10.124", None, 353.32, 8.0]]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchemaMS, expected_data_ms)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # resample to millisecond windows, taking the mean of the values in each window
    resample_ms = tsdf_left.resample(freq="ms", func="mean").df.withColumn(
        "trade_pr", F.round(F.col('trade_pr'), 2))

    int_df = TSDF(
        tsdf_left.df.withColumn("event_ts", F.col("event_ts").cast("timestamp")),
        partition_cols=['symbol'])
    interpolated = int_df.interpolate(freq='ms', func='floor', method='ffill')

    self.assertDataFramesEqual(resample_ms, dfExpected)
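# Sanity check on the expected millisecond-resample values above (hand-computed, not
# taken from the library): truncating event_ts to milliseconds puts the 00:00:10.12345
# and 00:00:10.123 rows in the same window, so its mean trade_pr is
# (349.21 + 340.21) / 2 = 344.71 and its mean trade_pr_2 is (10.0 + 9.0) / 2 = 9.5;
# the string "date" column has no numeric mean, hence None.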
def test_write_to_delta(self):
    """Test table write to delta format"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType())
    ])

    data = [["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
            ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
            ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0]]

    # construct dataframe
    df = self.buildTestDF(schema, data)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"], ts_col="event_ts")

    # test write to delta
    tsdf_left.write(self.spark, "my_table")
    logging.info('delta table count ' + str(self.spark.table("my_table").count()))

    # table count should equal the number of rows written
    assert self.spark.table("my_table").count() == 7
def test_fourier_transform(self):
    """Test of Fourier transform functionality in TSDF objects"""
    schema = StructType([
        StructField("group", StringType()),
        StructField("time", LongType()),
        StructField("val", DoubleType())
    ])
    expectedSchema = StructType([
        StructField("group", StringType()),
        StructField("time", LongType()),
        StructField("val", DoubleType()),
        StructField("freq", DoubleType()),
        StructField("ft_real", DoubleType()),
        StructField("ft_imag", DoubleType())
    ])

    data = [["Emissions", 1949, 2206.690829],
            ["Emissions", 1950, 2382.046176],
            ["Emissions", 1951, 2526.687327],
            ["Emissions", 1952, 2473.373964],
            ["WindGen", 1980, 0.0],
            ["WindGen", 1981, 0.0],
            ["WindGen", 1982, 0.0],
            ["WindGen", 1983, 0.029667962]]

    expected_data = [
        ["Emissions", 1949, 2206.690829, 0.0, 9588.798296, -0.0],
        ["Emissions", 1950, 2382.046176, 0.25, -319.996498, 91.32778800000006],
        ["Emissions", 1951, 2526.687327, -0.5, -122.0419839999995, -0.0],
        ["Emissions", 1952, 2473.373964, -0.25, -319.996498, -91.32778800000006],
        ["WindGen", 1980, 0.0, 0.0, 0.029667962, -0.0],
        ["WindGen", 1981, 0.0, 0.25, 0.0, 0.029667962],
        ["WindGen", 1982, 0.0, -0.5, -0.029667962, -0.0],
        ["WindGen", 1983, 0.029667962, -0.25, 0.0, -0.029667962]
    ]

    # construct dataframes
    df = self.buildTestDF(schema, data, ts_cols=['time'])
    dfExpected = self.buildTestDF(expectedSchema, expected_data, ts_cols=['time'])

    # convert to TSDF
    tsdf_left = TSDF(df, ts_col="time", partition_cols=["group"])
    result_tsdf = tsdf_left.fourier_transform(1, 'val')

    # should be equal to the expected dataframe
    self.assertDataFramesEqual(result_tsdf.df, dfExpected)
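# The expected freq/ft_real/ft_imag values above correspond to a plain discrete Fourier
# transform applied per group. A minimal sketch of how they could be reproduced outside
# the library (assumes numpy is available; not part of the tempo API):
#
#   import numpy as np
#   np.fft.fftfreq(4)   # -> [0.0, 0.25, -0.5, -0.25], the "freq" column
#   np.fft.fft([2206.690829, 2382.046176, 2526.687327, 2473.373964])
#   # real/imag parts -> "ft_real"/"ft_imag", e.g. 9588.798296 at freq 0.0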
def test_interpolation_using_custom_params(self):
    """Verify that specifying optional parameters changes the interpolation result accordingly."""
    self.buildTestingDataFrame()

    expected_data = [
        ["A", "A-1", "2020-01-01 00:00:00", 0.0, False, False],
        ["A", "A-1", "2020-01-01 00:00:30", 1.0, True, True],
        ["A", "A-1", "2020-01-01 00:01:00", 2.0, False, False],
        ["A", "A-1", "2020-01-01 00:01:30", 3.0, False, True],
        ["A", "A-1", "2020-01-01 00:02:00", 4.0, False, True],
        ["A", "A-1", "2020-01-01 00:02:30", 5.0, True, True],
        ["A", "A-1", "2020-01-01 00:03:00", 6.0, True, True],
        ["A", "A-1", "2020-01-01 00:03:30", 7.0, False, True],
        ["A", "A-1", "2020-01-01 00:04:00", 8.0, False, False],
        ["A", "A-1", "2020-01-01 00:04:30", 9.0, True, True],
        ["A", "A-1", "2020-01-01 00:05:00", 10.0, True, True],
        ["A", "A-1", "2020-01-01 00:05:30", 11.0, False, False],
    ]

    expected_schema = StructType([
        StructField("partition_a", StringType()),
        StructField("partition_b", StringType()),
        StructField("other_ts_col", StringType(), False),
        StructField("value_a", DoubleType()),
        StructField("is_ts_interpolated", BooleanType(), False),
        StructField("is_interpolated_value_a", BooleanType(), False),
    ])

    expected_df: DataFrame = self.buildTestDF(expected_schema, expected_data,
                                              ts_cols=["other_ts_col"])

    # Modify the input DataFrame to use a different ts_col
    input_tsdf = TSDF(
        self.simple_input_tsdf.df.withColumnRenamed("event_ts", "other_ts_col"),
        partition_cols=["partition_a", "partition_b"],
        ts_col="other_ts_col",
    )

    actual_df: DataFrame = input_tsdf.interpolate(
        ts_col="other_ts_col",
        show_interpolated=True,
        partition_cols=["partition_a", "partition_b"],
        target_cols=["value_a"],
        freq="30 seconds",
        func="mean",
        method="linear",
    ).df

    assert_df_equality(expected_df, actual_df, ignore_nullable=True)
def test_write_to_delta(self):
    """Test table write to delta format"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("date", StringType()),
        StructField("trade_pr_2", FloatType()),
        StructField("trade_pr", FloatType())
    ])

    data = [["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
            ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
            ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0]]

    expected_data = [["S1", "2020-08-01 00:00:00", "SAME_DT", 10.0, 349.21],
                     ["S1", "2020-08-01 00:01:00", "SAME_DT", 8.0, 353.32],
                     ["S1", "2020-09-01 00:01:00", "SAME_DT", 5.0, 361.1],
                     ["S1", "2020-09-01 00:19:00", "SAME_DT", 4.0, 362.1]]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # featured_df = tsdf_left.resample(freq="min", func="closest_lead").df

    # test write to delta
    tsdf_left.write(self.spark, "my_table")
    print('delta table count ' + str(self.spark.table("my_table").count()))

    # table count should equal the number of rows written
    assert self.spark.table("my_table").count() == 7
def buildTestingDataFrame(self):
    schema = StructType([
        StructField("partition_a", StringType()),
        StructField("partition_b", StringType()),
        StructField("event_ts", StringType()),
        StructField("value_a", FloatType()),
        StructField("value_b", FloatType()),
    ])

    simple_data = [
        ["A", "A-1", "2020-01-01 00:00:10", 0.0, None],
        ["A", "A-1", "2020-01-01 00:01:10", 2.0, 2.0],
        ["A", "A-1", "2020-01-01 00:01:32", None, None],
        ["A", "A-1", "2020-01-01 00:02:03", None, None],
        ["A", "A-1", "2020-01-01 00:03:32", None, 7.0],
        ["A", "A-1", "2020-01-01 00:04:12", 8.0, 8.0],
        ["A", "A-1", "2020-01-01 00:05:31", 11.0, None],
        ["A", "A-2", "2020-01-01 00:00:10", 0.0, None],
        ["A", "A-2", "2020-01-01 00:01:10", 2.0, 2.0],
        ["A", "A-2", "2020-01-01 00:01:32", None, None],
        ["A", "A-2", "2020-01-01 00:02:03", None, None],
        ["A", "A-2", "2020-01-01 00:04:12", 8.0, 8.0],
        ["A", "A-2", "2020-01-01 00:05:31", 11.0, None],
        ["B", "A-2", "2020-01-01 00:01:10", 2.0, 2.0],
        ["B", "A-2", "2020-01-01 00:01:32", None, None],
        ["B", "A-2", "2020-01-01 00:02:03", None, None],
        ["B", "A-2", "2020-01-01 00:03:32", None, 7.0],
        ["B", "A-2", "2020-01-01 00:04:12", 8.0, 8.0],
    ]

    # construct dataframes
    self.simple_input_df = self.buildTestDF(schema, simple_data)

    self.simple_input_tsdf = TSDF(
        self.simple_input_df,
        partition_cols=["partition_a", "partition_b"],
        ts_col="event_ts",
    )
def test_upsample(self):
    """Test of resampling with the floor function and bar (OHLC) calculation"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("floor_trade_pr", FloatType()),
        StructField("floor_date", StringType()),
        StructField("floor_trade_pr_2", FloatType())
    ])
    expectedBarsSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("close_trade_pr", FloatType()),
        StructField("close_trade_pr_2", FloatType()),
        StructField("high_trade_pr", FloatType()),
        StructField("high_trade_pr_2", FloatType()),
        StructField("low_trade_pr", FloatType()),
        StructField("low_trade_pr_2", FloatType()),
        StructField("open_trade_pr", FloatType()),
        StructField("open_trade_pr_2", FloatType())
    ])

    data = [["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
            ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
            ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0]]

    expected_data = [["S1", "2020-08-01 00:00:00", 349.21, "SAME_DT", 10.0],
                     ["S1", "2020-08-01 00:01:00", 353.32, "SAME_DT", 8.0],
                     ["S1", "2020-09-01 00:01:00", 361.1, "SAME_DT", 5.0],
                     ["S1", "2020-09-01 00:19:00", 362.1, "SAME_DT", 4.0]]

    expected_bars = [
        ['S1', '2020-08-01 00:00:00', 340.21, 9.0, 349.21, 10.0, 340.21, 9.0, 349.21, 10.0],
        ['S1', '2020-08-01 00:01:00', 350.32, 6.0, 353.32, 8.0, 350.32, 6.0, 353.32, 8.0],
        ['S1', '2020-09-01 00:01:00', 361.1, 5.0, 361.1, 5.0, 361.1, 5.0, 361.1, 5.0],
        ['S1', '2020-09-01 00:19:00', 362.1, 4.0, 362.1, 4.0, 362.1, 4.0, 362.1, 4.0]
    ]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data)
    barsExpected = self.buildTestDF(expectedBarsSchema, expected_bars)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # resample to minute buckets, keeping the earliest (floor) record in each
    featured_df = tsdf_left.resample(freq="min", func="floor").df
    bars = tsdf_left.calc_bars(freq='min', metricCols=['trade_pr', 'trade_pr_2']).df

    # should be equal to the expected dataframe
    self.assertDataFramesEqual(featured_df, dfExpected)

    # test bars summary
    self.assertDataFramesEqual(bars, barsExpected)
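# Reading the expected values above: with func="floor" each minute bucket keeps its
# earliest record (e.g. the 2020-08-01 00:00 bucket keeps 349.21 / 10.0 from the
# 00:00:10 row), while calc_bars emits open/high/low/close per bucket
# (open = first, close = last, high = max, low = min within the minute).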
def test_range_stats(self):
    """Test of range stats for 20 minute rolling window"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("mean_trade_pr", FloatType()),
        StructField("count_trade_pr", LongType(), nullable=False),
        StructField("min_trade_pr", FloatType()),
        StructField("max_trade_pr", FloatType()),
        StructField("sum_trade_pr", FloatType()),
        StructField("stddev_trade_pr", FloatType()),
        StructField("zscore_trade_pr", FloatType())
    ])

    data = [["S1", "2020-08-01 00:00:10", 349.21],
            ["S1", "2020-08-01 00:01:12", 351.32],
            ["S1", "2020-09-01 00:02:10", 361.1],
            ["S1", "2020-09-01 00:19:12", 362.1]]

    expected_data = [
        ["S1", "2020-08-01 00:00:10", 349.21, 1, 349.21, 349.21, 349.21, None, None],
        ["S1", "2020-08-01 00:01:12", 350.26, 2, 349.21, 351.32, 700.53, 1.49, 0.71],
        ["S1", "2020-09-01 00:02:10", 361.1, 1, 361.1, 361.1, 361.1, None, None],
        ["S1", "2020-09-01 00:19:12", 361.6, 2, 361.1, 362.1, 723.2, 0.71, 0.71]
    ]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # using lookback of 20 minutes
    featured_df = tsdf_left.withRangeStats(rangeBackWindowSecs=1200).df

    # cast to decimal with precision in cents for simplicity
    featured_df = featured_df.select(
        F.col("symbol"),
        F.col("event_ts"),
        F.col("mean_trade_pr").cast("decimal(5, 2)"),
        F.col("count_trade_pr"),
        F.col("min_trade_pr").cast("decimal(5,2)"),
        F.col("max_trade_pr").cast("decimal(5,2)"),
        F.col("sum_trade_pr").cast("decimal(5,2)"),
        F.col("stddev_trade_pr").cast("decimal(5,2)"),
        F.col("zscore_trade_pr").cast("decimal(5,2)"))

    # cast to decimal with precision in cents for simplicity
    dfExpected = dfExpected.select(
        F.col("symbol"),
        F.col("event_ts"),
        F.col("mean_trade_pr").cast("decimal(5, 2)"),
        F.col("count_trade_pr"),
        F.col("min_trade_pr").cast("decimal(5,2)"),
        F.col("max_trade_pr").cast("decimal(5,2)"),
        F.col("sum_trade_pr").cast("decimal(5,2)"),
        F.col("stddev_trade_pr").cast("decimal(5,2)"),
        F.col("zscore_trade_pr").cast("decimal(5,2)"))

    # should be equal to the expected dataframe
    self.assertDataFramesEqual(featured_df, dfExpected)
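# Hand-computed check of the expected rolling stats above (20-minute lookback):
# at 2020-08-01 00:01:12 the window holds 349.21 and 351.32, so
#   mean   = (349.21 + 351.32) / 2       = 350.265 (350.26 after the decimal(5,2) cast)
#   stddev = |351.32 - 349.21| / sqrt(2) ~= 1.49   (sample stddev of two points)
#   zscore = (351.32 - 350.265) / 1.492  ~= 0.71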
def test_partitioned_asof_join(self):
    """AS-OF Join with a time-partition"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("left_event_ts", StringType()),
        StructField("left_trade_pr", FloatType()),
        StructField("right_event_ts", StringType()),
        StructField("right_bid_pr", FloatType()),
        StructField("right_ask_pr", FloatType())
    ])

    left_data = [["S1", "2020-08-01 00:00:02", 349.21],
                 ["S1", "2020-08-01 00:00:08", 351.32],
                 ["S1", "2020-08-01 00:00:11", 361.12],
                 ["S1", "2020-08-01 00:00:18", 364.31],
                 ["S1", "2020-08-01 00:00:19", 362.94],
                 ["S1", "2020-08-01 00:00:21", 364.27],
                 ["S1", "2020-08-01 00:00:23", 367.36]]

    right_data = [["S1", "2020-08-01 00:00:01", 345.11, 351.12],
                  ["S1", "2020-08-01 00:00:09", 348.10, 353.13],
                  ["S1", "2020-08-01 00:00:12", 358.93, 365.12],
                  ["S1", "2020-08-01 00:00:19", 359.21, 365.31]]

    expected_data = [
        ["S1", "2020-08-01 00:00:02", 349.21, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:00:08", 351.32, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:00:11", 361.12, "2020-08-01 00:00:09", 348.10, 353.13],
        ["S1", "2020-08-01 00:00:18", 364.31, "2020-08-01 00:00:12", 358.93, 365.12],
        ["S1", "2020-08-01 00:00:19", 362.94, "2020-08-01 00:00:19", 359.21, 365.31],
        ["S1", "2020-08-01 00:00:21", 364.27, "2020-08-01 00:00:19", 359.21, 365.31],
        ["S1", "2020-08-01 00:00:23", 367.36, "2020-08-01 00:00:19", 359.21, 365.31]
    ]

    # construct dataframes
    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data,
                                  ["left_event_ts", "right_event_ts"])

    # perform the join
    tsdf_left = TSDF(dfLeft, ts_col="event_ts", partition_cols=["symbol"])
    tsdf_right = TSDF(dfRight, ts_col="event_ts", partition_cols=["symbol"])

    joined_df = tsdf_left.asofJoin(tsdf_right,
                                   left_prefix="left",
                                   right_prefix="right",
                                   tsPartitionVal=10,
                                   fraction=0.1).df

    self.assertDataFramesEqual(joined_df, dfExpected)
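# The expected rows above match plain as-of join semantics; tsPartitionVal=10
# (presumably 10-second time buckets) together with fraction=0.1 only changes how the
# join is executed to mitigate skew, not its result.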
def test_sequence_number_sort(self):
    """AS-OF Join with a sequence-number column used to break timestamp ties"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_id", IntegerType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType()),
        StructField("seq_nb", LongType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_id", IntegerType()),
        StructField("right_event_ts", StringType()),
        StructField("right_bid_pr", FloatType()),
        StructField("right_ask_pr", FloatType()),
        StructField("right_seq_nb", LongType())
    ])

    left_data = [["S1", "2020-08-01 00:00:10", 349.21, 1],
                 ["S1", "2020-08-01 00:01:12", 351.32, 2],
                 ["S1", "2020-09-01 00:02:10", 361.1, 3],
                 ["S1", "2020-09-01 00:19:12", 362.1, 4]]

    right_data = [["S1", "2020-08-01 00:00:01", 345.11, 351.12, 1],
                  ["S1", "2020-08-01 00:01:05", 348.10, 1000.13, 3],
                  ["S1", "2020-08-01 00:01:05", 348.10, 100.13, 2],
                  ["S1", "2020-09-01 00:02:01", 358.93, 365.12, 4],
                  ["S1", "2020-09-01 00:15:01", 359.21, 365.31, 5]]

    expected_data = [
        ["S1", "2020-08-01 00:00:10", 349.21, 1, "2020-08-01 00:00:01", 345.11, 351.12, 1],
        ["S1", "2020-08-01 00:01:12", 351.32, 2, "2020-08-01 00:01:05", 348.10, 1000.13, 3],
        ["S1", "2020-09-01 00:02:10", 361.1, 3, "2020-09-01 00:02:01", 358.93, 365.12, 4],
        ["S1", "2020-09-01 00:19:12", 362.1, 4, "2020-09-01 00:15:01", 359.21, 365.31, 5]
    ]

    # construct dataframes
    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data,
                                  ["right_event_ts", "event_ts"])

    # perform the join
    tsdf_left = TSDF(dfLeft, partition_cols=["symbol"])
    tsdf_right = TSDF(dfRight, partition_cols=["symbol"], sequence_col="seq_nb")
    joined_df = tsdf_left.asofJoin(tsdf_right, right_prefix='right').df

    # joined dataframe should equal the expected dataframe
    self.assertDataFramesEqual(joined_df, dfExpected)
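# Note on the expected rows above: the right side has two quotes at
# 2020-08-01 00:01:05 (seq_nb 2 and 3); the join resolves the timestamp tie with the
# sequence column and keeps the row with the higher seq_nb (3, ask_pr 1000.13).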
def test_asof_join(self):
    """AS-OF Join without a time-partition"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("left_event_ts", StringType()),
        StructField("left_trade_pr", FloatType()),
        StructField("right_event_ts", StringType()),
        StructField("right_bid_pr", FloatType()),
        StructField("right_ask_pr", FloatType())
    ])

    left_data = [["S1", "2020-08-01 00:00:10", 349.21],
                 ["S1", "2020-08-01 00:01:12", 351.32],
                 ["S1", "2020-09-01 00:02:10", 361.1],
                 ["S1", "2020-09-01 00:19:12", 362.1]]

    right_data = [["S1", "2020-08-01 00:00:01", 345.11, 351.12],
                  ["S1", "2020-08-01 00:01:05", 348.10, 353.13],
                  ["S1", "2020-09-01 00:02:01", 358.93, 365.12],
                  ["S1", "2020-09-01 00:15:01", 359.21, 365.31]]

    expected_data = [
        ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13],
        ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12],
        ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31]
    ]

    # construct dataframes
    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data,
                                  ["left_event_ts", "right_event_ts"])

    # perform the join
    tsdf_left = TSDF(dfLeft, ts_col="event_ts", partition_cols=["symbol"])
    tsdf_right = TSDF(dfRight, ts_col="event_ts", partition_cols=["symbol"])
    joined_df = tsdf_left.asofJoin(tsdf_right,
                                   left_prefix="left",
                                   right_prefix="right").df

    # joined dataframe should equal the expected dataframe
    self.assertDataFramesEqual(joined_df, dfExpected)
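# As illustrated by the expected rows above, the as-of join pairs each left trade with
# the most recent right quote at or before its event_ts (e.g. the 00:01:12 trade picks
# up the 00:01:05 quote).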
def test_describe(self):
    """Test of the describe() summary output on a TSDF"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("left_event_ts", StringType()),
        StructField("left_trade_pr", FloatType()),
        StructField("right_event_ts", StringType()),
        StructField("right_bid_pr", FloatType()),
        StructField("right_ask_pr", FloatType())
    ])

    left_data = [["S1", "2020-08-01 00:00:10", 349.21],
                 ["S1", "2020-08-01 00:01:12", 351.32],
                 ["S1", "2020-09-01 00:02:10", 361.1],
                 ["S1", "2020-09-01 00:19:12", 362.1]]

    right_data = [["S1", "2020-08-01 00:00:01", 345.11, 351.12],
                  ["S1", "2020-08-01 00:01:05", 348.10, 353.13],
                  ["S1", "2020-09-01 00:02:01", 358.93, 365.12],
                  ["S1", "2020-09-01 00:15:01", 359.21, 365.31]]

    expected_data = [
        ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 348.10, 353.13],
        ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 358.93, 365.12],
        ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31]
    ]

    # construct dataframes
    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data,
                                  ["left_event_ts", "right_event_ts"])

    # convert to TSDF and produce the describe summary
    tsdf_left = TSDF(dfLeft, ts_col="event_ts", partition_cols=["symbol"])
    res = tsdf_left.describe()

    # summary output should contain the expected global stats
    # self.assertDataFramesEqual(res, dfExpected)
    assert res.count() == 7
    assert res.filter(F.col("unique_ts_count") != " ").select(
        F.max(F.col('unique_ts_count'))).collect()[0][0] == "1"
    assert res.filter(F.col("min_ts") != " ").select(
        F.col('min_ts').cast("string")).collect()[0][0] == '2020-08-01 00:00:10'
    assert res.filter(F.col("max_ts") != " ").select(
        F.col('max_ts').cast("string")).collect()[0][0] == '2020-09-01 00:19:12'
def test_asof_join_nanos(self):
    """AS-OF join with nanosecond timestamps"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("left_event_ts", StringType()),
        StructField("left_trade_pr", FloatType()),
        StructField("right_event_ts", StringType()),
        StructField("right_ask_pr", FloatType()),
        StructField("right_bid_pr", FloatType())
    ])

    left_data = [
        ["S1", "2022-01-01 09:59:59.123456789", 349.21],
        ["S1", "2022-01-01 10:00:00.123456788", 351.32],
        ["S1", "2022-01-01 10:00:00.123456789", 361.12],
        ["S1", "2022-01-01 10:00:01.123456789", 364.31],
    ]

    right_data = [["S1", "2022-01-01 10:00:00.1234567", 345.11, 351.12],
                  ["S1", "2022-01-01 10:00:00.12345671", 348.10, 353.13],
                  ["S1", "2022-01-01 10:00:00.12345675", 358.93, 365.12],
                  ["S1", "2022-01-01 10:00:00.12345677", 358.91, 365.33],
                  ["S1", "2022-01-01 10:00:01.10000001", 359.21, 365.31]]

    expected_data = [
        ["S1", "2022-01-01 09:59:59.123456789", 349.21, None, None, None],
        ["S1", "2022-01-01 10:00:00.123456788", 351.32, "2022-01-01 10:00:00.12345677", 365.33, 358.91],
        ["S1", "2022-01-01 10:00:00.123456789", 361.12, "2022-01-01 10:00:00.12345677", 365.33, 358.91],
        ["S1", "2022-01-01 10:00:01.123456789", 364.31, "2022-01-01 10:00:01.10000001", 365.31, 359.21]
    ]

    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data,
                                  ts_cols=["left_event_ts"])

    tsdf_left = TSDF(dfLeft, ts_col="event_ts", partition_cols=["symbol"])
    tsdf_right = TSDF(dfRight, ts_col="event_ts", partition_cols=["symbol"])

    joined_df = tsdf_left.asofJoin(tsdf_right,
                                   left_prefix="left",
                                   right_prefix="right").df

    self.assertDataFramesEqual(joined_df, dfExpected)
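# In the expected rows above, the 09:59:59.123456789 trade has no earlier quote and so
# joins to nulls; the remaining trades match the latest quote at or before them, with
# sub-second precision respected (both 10:00:00.1234567xx trades match the
# 10:00:00.12345677 quote rather than the later 10:00:01.10000001 one).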
def test_asof_join_skip_nulls_disabled(self):
    """AS-OF Join with skip nulls disabled"""
    leftSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType())
    ])
    rightSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("bid_pr", FloatType()),
        StructField("ask_pr", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("left_event_ts", StringType()),
        StructField("left_trade_pr", FloatType()),
        StructField("right_event_ts", StringType()),
        StructField("right_bid_pr", FloatType()),
        StructField("right_ask_pr", FloatType())
    ])

    left_data = [["S1", "2020-08-01 00:00:10", 349.21],
                 ["S1", "2020-08-01 00:01:12", 351.32],
                 ["S1", "2020-09-01 00:02:10", 361.1],
                 ["S1", "2020-09-01 00:19:12", 362.1]]

    right_data = [["S1", "2020-08-01 00:00:01", 345.11, 351.12],
                  ["S1", "2020-08-01 00:01:05", None, 353.13],
                  ["S1", "2020-09-01 00:02:01", None, None],
                  ["S1", "2020-09-01 00:15:01", 359.21, 365.31]]

    expected_data_skip_nulls = [
        ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", 345.11, 353.13],
        ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", 345.11, 353.13],
        ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31]
    ]

    expected_data_skip_nulls_disabled = [
        ["S1", "2020-08-01 00:00:10", 349.21, "2020-08-01 00:00:01", 345.11, 351.12],
        ["S1", "2020-08-01 00:01:12", 351.32, "2020-08-01 00:01:05", None, 353.13],
        ["S1", "2020-09-01 00:02:10", 361.1, "2020-09-01 00:02:01", None, None],
        ["S1", "2020-09-01 00:19:12", 362.1, "2020-09-01 00:15:01", 359.21, 365.31]
    ]

    # construct dataframes
    dfLeft = self.buildTestDF(leftSchema, left_data)
    dfRight = self.buildTestDF(rightSchema, right_data)
    dfExpectedSkipNulls = self.buildTestDF(
        expectedSchema, expected_data_skip_nulls,
        ["left_event_ts", "right_event_ts"])
    dfExpectedSkipNullsDisabled = self.buildTestDF(
        expectedSchema, expected_data_skip_nulls_disabled,
        ["left_event_ts", "right_event_ts"])

    tsdf_left = TSDF(dfLeft, ts_col="event_ts", partition_cols=["symbol"])
    tsdf_right = TSDF(dfRight, ts_col="event_ts", partition_cols=["symbol"])

    # perform the join with skip nulls enabled (default)
    joined_df = tsdf_left.asofJoin(tsdf_right,
                                   left_prefix="left",
                                   right_prefix="right").df

    # joined dataframe should equal the expected dataframe with nulls skipped
    self.assertDataFramesEqual(joined_df, dfExpectedSkipNulls)

    # perform the join with skip nulls disabled
    joined_df = tsdf_left.asofJoin(tsdf_right,
                                   left_prefix="left",
                                   right_prefix="right",
                                   skipNulls=False).df

    # joined dataframe should equal the expected dataframe without nulls skipped
    self.assertDataFramesEqual(joined_df, dfExpectedSkipNullsDisabled)
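# The two expected data sets above differ only where the matched quote had nulls: with
# the default skipNulls=True the last non-null value per column is carried forward
# (bid_pr 345.11 at 00:01:12 and 00:02:10), while skipNulls=False keeps the nulls from
# the matched row as-is.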
def test_upsample(self):
    """Test of upsampling (resample with fill) and bar (OHLC) calculation"""
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType())
    ])
    expectedSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("floor_trade_pr", FloatType()),
        StructField("floor_date", StringType()),
        StructField("floor_trade_pr_2", FloatType())
    ])
    expected_30m_Schema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("date", DoubleType()),
        StructField("trade_pr", DoubleType()),
        StructField("trade_pr_2", DoubleType())
    ])
    expectedBarsSchema = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType()),
        StructField("close_trade_pr", FloatType()),
        StructField("close_trade_pr_2", FloatType()),
        StructField("high_trade_pr", FloatType()),
        StructField("high_trade_pr_2", FloatType()),
        StructField("low_trade_pr", FloatType()),
        StructField("low_trade_pr_2", FloatType()),
        StructField("open_trade_pr", FloatType()),
        StructField("open_trade_pr_2", FloatType())
    ])

    data = [["S1", "SAME_DT", "2020-08-01 00:00:10", 349.21, 10.0],
            ["S1", "SAME_DT", "2020-08-01 00:00:11", 340.21, 9.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:12", 353.32, 8.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:13", 351.32, 7.0],
            ["S1", "SAME_DT", "2020-08-01 00:01:14", 350.32, 6.0],
            ["S1", "SAME_DT", "2020-09-01 00:01:12", 361.1, 5.0],
            ["S1", "SAME_DT", "2020-09-01 00:19:12", 362.1, 4.0]]

    expected_data = [["S1", "2020-08-01 00:00:00", 349.21, "SAME_DT", 10.0],
                     ["S1", "2020-08-01 00:01:00", 353.32, "SAME_DT", 8.0],
                     ["S1", "2020-09-01 00:01:00", 361.1, "SAME_DT", 5.0],
                     ["S1", "2020-09-01 00:19:00", 362.1, "SAME_DT", 4.0]]

    expected_data_30m = [["S1", "2020-08-01 00:00:00", 0.0, 348.88, 8.0],
                         ["S1", "2020-08-01 00:05:00", 0.0, 0.0, 0.0],
                         ["S1", "2020-09-01 00:00:00", 0.0, 361.1, 5.0],
                         ["S1", "2020-09-01 00:15:00", 0.0, 362.1, 4.0]]

    expected_bars = [
        ['S1', '2020-08-01 00:00:00', 340.21, 9.0, 349.21, 10.0, 340.21, 9.0, 349.21, 10.0],
        ['S1', '2020-08-01 00:01:00', 350.32, 6.0, 353.32, 8.0, 350.32, 6.0, 353.32, 8.0],
        ['S1', '2020-09-01 00:01:00', 361.1, 5.0, 361.1, 5.0, 361.1, 5.0, 361.1, 5.0],
        ['S1', '2020-09-01 00:19:00', 362.1, 4.0, 362.1, 4.0, 362.1, 4.0, 362.1, 4.0]
    ]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchema, expected_data)
    expected_30s_df = self.buildTestDF(expected_30m_Schema, expected_data_30m)
    barsExpected = self.buildTestDF(expectedBarsSchema, expected_bars)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # resample to 5-minute buckets, taking the mean and filling empty buckets
    resample_30m = tsdf_left.resample(freq="5 minutes", func="mean", fill=True).df.withColumn(
        "trade_pr", F.round(F.col('trade_pr'), 2))

    bars = tsdf_left.calc_bars(freq='min', metricCols=['trade_pr', 'trade_pr_2']).df

    upsampled = resample_30m.filter(
        F.col("event_ts").isin('2020-08-01 00:00:00', '2020-08-01 00:05:00',
                               '2020-09-01 00:00:00', '2020-09-01 00:15:00'))

    # test upsample summary
    self.assertDataFramesEqual(upsampled, expected_30s_df)

    # test bars summary
    self.assertDataFramesEqual(bars, barsExpected)
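# Hand-computed check of the expected 5-minute means above: the 2020-08-01 00:00:00
# bucket averages five trades, (349.21 + 340.21 + 353.32 + 351.32 + 350.32) / 5 = 348.88
# and (10 + 9 + 8 + 7 + 6) / 5 = 8.0; the all-zero 00:05:00 row is an empty bucket
# populated with 0.0 because fill=True.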