def __calc_linear_spark(self, df: DataFrame, ts_col: str, target_col: str):
    """
    Native Spark function for calculating linear interpolation on a DataFrame.

    :param df: prepared dataframe to be interpolated
    :param ts_col: timeseries column name
    :param target_col: column to be interpolated
    """
    # SQL CASE expression with three branches:
    #   1. the row was not flagged for interpolation -> keep the value as-is
    #   2. the value is null -> interpolate linearly between the previous and
    #      next known values for this target column (per-column helper columns)
    #   3. otherwise -> interpolate from the current value toward the next one
    #      using the shared previous/next timestamp columns
    interpolation_expr = f"""
    case when is_interpolated_{target_col} = false then {target_col}
    when {target_col} is null then
    (next_null_{target_col} - previous_{target_col})
    /(unix_timestamp(next_timestamp_{target_col})-unix_timestamp(previous_timestamp_{target_col}))
    *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp_{target_col}))
    + previous_{target_col}
    else
    (next_{target_col}-{target_col})
    /(unix_timestamp(next_timestamp)-unix_timestamp(previous_timestamp))
    *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp))
    + {target_col}
    end as {target_col}
    """

    # Drop the original target column so the CASE expression's alias does not
    # collide with it during selectExpr.
    remaining_cols: List[str] = df.columns
    remaining_cols.remove(target_col)

    interpolated_df: DataFrame = df.selectExpr(*remaining_cols, interpolation_expr)

    # Restore the dataframe's original column ordering before returning.
    return interpolated_df.select(*df.columns)
def convert_types_for_ml(df: DataFrame) -> DataFrame:
    """
    Decode the raw ``value`` column to a string, parse it as JSON against the
    module-level ``schema``, and flatten the parsed struct into top-level
    columns.

    :param df: dataframe with a binary/raw ``value`` column (presumably a
        streaming source payload — verify against the caller)
    :return: dataframe whose columns are the fields of ``schema``
    """
    stringified = df.selectExpr("CAST(value AS STRING)")
    parsed = stringified.select(from_json("value", schema=schema).alias("data"))
    return parsed.select("data.*")
def convert_types_elastic_for_ml(df: DataFrame) -> DataFrame:
    """
    Decode the raw ``value`` column to a string, parse it as JSON against the
    module-level ``schema_elastic``, flatten the parsed struct, and append an
    integer-typed copy of ``radiant_win`` as ``radiant_win_int``.

    :param df: dataframe with a binary/raw ``value`` column
    :return: flattened dataframe plus the ``radiant_win_int`` column
    """
    parsed = (
        df.selectExpr("CAST(value AS STRING)")
        .select(from_json("value", schema=schema_elastic).alias("data"))
        .select("data.*")
    )
    # Boolean flag cast to an integer so downstream ML code can consume it.
    radiant_win_as_int = parsed.radiant_win.cast(IntegerType())
    return parsed.withColumn("radiant_win_int", radiant_win_as_int)