def test_resample_millis(self):
    """Check that millisecond resampling averages rows that share a bucket.

    Three trades for symbol ``S1`` are resampled with ``freq="ms"`` and
    ``func="mean"``. The expected frame holds one row at ``...10.123``
    carrying the mean of the first two trades (344.71 / 9.5) and one row at
    ``...10.124`` carrying the third trade unchanged. ``trade_pr`` is
    rounded to two decimals before the comparison.
    """
    schema = StructType([
        StructField("symbol", StringType()),
        StructField("date", StringType()),
        StructField("event_ts", StringType()),
        StructField("trade_pr", FloatType()),
        StructField("trade_pr_2", FloatType()),
    ])

    # Schema of the resampled output; the expected rows below carry a null
    # "date" value, typed here as a double.
    expectedSchemaMS = StructType([
        StructField("symbol", StringType()),
        StructField("event_ts", StringType(), True),
        StructField("date", DoubleType()),
        StructField("trade_pr", DoubleType()),
        StructField("trade_pr_2", DoubleType()),
    ])

    data = [
        ["S1", "SAME_DT", "2020-08-01 00:00:10.12345", 349.21, 10.0],
        ["S1", "SAME_DT", "2020-08-01 00:00:10.123", 340.21, 9.0],
        ["S1", "SAME_DT", "2020-08-01 00:00:10.124", 353.32, 8.0],
    ]

    expected_data_ms = [
        ["S1", "2020-08-01 00:00:10.123", None, 344.71, 9.5],
        ["S1", "2020-08-01 00:00:10.124", None, 353.32, 8.0],
    ]

    # construct dataframes
    df = self.buildTestDF(schema, data)
    dfExpected = self.buildTestDF(expectedSchemaMS, expected_data_ms)

    # convert to TSDF
    tsdf_left = TSDF(df, partition_cols=["symbol"])

    # millisecond aggregation (mean), rounding trade_pr to 2 decimals so the
    # comparison against the literal expected values is stable
    resample_ms = tsdf_left.resample(freq="ms", func="mean").df.withColumn(
        "trade_pr", F.round(F.col('trade_pr'), 2))

    # The interpolate call is kept so that path is still invoked at
    # millisecond frequency; its result is not asserted on.
    int_df = TSDF(
        tsdf_left.df.withColumn(
            "event_ts", F.col("event_ts").cast("timestamp")),
        partition_cols=['symbol'])
    int_df.interpolate(freq='ms', func='floor', method='ffill')

    self.assertDataFramesEqual(resample_ms, dfExpected)
def test_interpolation_using_custom_params(self):
    """Verify that optional parameters passed to ``interpolate`` are honoured.

    The input TSDF has its ``event_ts`` column renamed to ``other_ts_col``,
    and ``interpolate`` is called with explicit ``ts_col``,
    ``partition_cols`` and ``target_cols`` plus ``show_interpolated=True``.
    The result is compared with a hand-written frame of 30-second rows.
    """
    self.buildTestingDataFrame()

    partition_cols = ["partition_a", "partition_b"]
    ts_col_name = "other_ts_col"

    # (timestamp, value_a, is_ts_interpolated, is_interpolated_value_a)
    observations = [
        ("2020-01-01 00:00:00", 0.0, False, False),
        ("2020-01-01 00:00:30", 1.0, True, True),
        ("2020-01-01 00:01:00", 2.0, False, False),
        ("2020-01-01 00:01:30", 3.0, False, True),
        ("2020-01-01 00:02:00", 4.0, False, True),
        ("2020-01-01 00:02:30", 5.0, True, True),
        ("2020-01-01 00:03:00", 6.0, True, True),
        ("2020-01-01 00:03:30", 7.0, False, True),
        ("2020-01-01 00:04:00", 8.0, False, False),
        ("2020-01-01 00:04:30", 9.0, True, True),
        ("2020-01-01 00:05:00", 10.0, True, True),
        ("2020-01-01 00:05:30", 11.0, False, False),
    ]
    # Every expected row belongs to the same ("A", "A-1") partition.
    expected_rows = [
        ["A", "A-1", stamp, value, ts_flag, value_flag]
        for stamp, value, ts_flag, value_flag in observations
    ]

    expected_fields = [
        StructField("partition_a", StringType()),
        StructField("partition_b", StringType()),
        StructField("other_ts_col", StringType(), False),
        StructField("value_a", DoubleType()),
        StructField("is_ts_interpolated", BooleanType(), False),
        StructField("is_interpolated_value_a", BooleanType(), False),
    ]
    expected_struct = StructType(expected_fields)

    want: DataFrame = self.buildTestDF(
        expected_struct, expected_rows, ts_cols=[ts_col_name]
    )

    # Build the input with a non-default timestamp column name.
    renamed_input = self.simple_input_tsdf.df.withColumnRenamed(
        "event_ts", ts_col_name
    )
    source_tsdf = TSDF(
        renamed_input,
        partition_cols=["partition_a", "partition_b"],
        ts_col=ts_col_name,
    )

    interpolated_tsdf = source_tsdf.interpolate(
        ts_col=ts_col_name,
        show_interpolated=True,
        partition_cols=partition_cols,
        target_cols=["value_a"],
        freq="30 seconds",
        func="mean",
        method="linear",
    )
    got: DataFrame = interpolated_tsdf.df

    assert_df_equality(want, got, ignore_nullable=True)