Example 1
    def __calc_linear_spark(self, df: DataFrame, ts_col: str, target_col: str):
        """
        Linearly interpolate ``target_col`` using a native Spark SQL expression.

        :param df: prepared dataframe to be interpolated
        :param ts_col: timeseries column name
        :param target_col: column to be interpolated
        """
        # SQL is kept verbatim. Three branches: rows not flagged as
        # interpolated keep their value; null rows interpolate between the
        # tracked previous/next observations; remaining rows interpolate
        # toward the next timestamped value.
        expr_sql = f"""
        case when is_interpolated_{target_col} = false then {target_col}
            when {target_col} is null then 
            (next_null_{target_col} - previous_{target_col})
            /(unix_timestamp(next_timestamp_{target_col})-unix_timestamp(previous_timestamp_{target_col}))
            *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp_{target_col}))
            + previous_{target_col}
        else 
            (next_{target_col}-{target_col})
            /(unix_timestamp(next_timestamp)-unix_timestamp(previous_timestamp))
            *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp)) 
            + {target_col}
        end as {target_col}
        """

        # Drop the target column before selecting so the aliased expression
        # result does not collide with the existing column.
        remaining = list(df.columns)
        remaining.remove(target_col)
        result: DataFrame = df.selectExpr(*remaining, expr_sql)
        # Restore the original column ordering before returning.
        return result.select(*df.columns)
Example 2
def convert_types_for_ml(df: DataFrame) -> DataFrame:
    """
    Decode a raw Kafka-style ``value`` column into typed columns.

    Casts ``value`` to a string, parses it as JSON against the module-level
    ``schema``, and flattens the parsed struct into top-level columns.

    :param df: dataframe with a binary/string ``value`` column
    :return: dataframe with one column per field of ``schema``
    """
    as_text = df.selectExpr("CAST(value AS STRING)")
    parsed = as_text.select(from_json("value", schema=schema).alias("data"))
    return parsed.select("data.*")
Example 3
def convert_types_elastic_for_ml(df: DataFrame) -> DataFrame:
    """
    Decode a raw ``value`` column using the Elastic schema and add an
    integer view of the ``radiant_win`` flag.

    Casts ``value`` to a string, parses it as JSON against the module-level
    ``schema_elastic``, flattens the struct, then appends
    ``radiant_win_int`` as ``radiant_win`` cast to an integer.

    :param df: dataframe with a binary/string ``value`` column
    :return: flattened dataframe with the extra ``radiant_win_int`` column
    """
    as_text = df.selectExpr("CAST(value AS STRING)")
    parsed = as_text.select(
        from_json("value", schema=schema_elastic).alias("data")
    )
    flattened = parsed.select("data.*")
    win_as_int = flattened.radiant_win.cast(IntegerType())
    return flattened.withColumn("radiant_win_int", win_as_int)