def add_increment(
    current_df: pyspark.sql.DataFrame,
    increment_df: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    """Merge an incremental batch into the current dataset, keeping one row per link.

    Rows from both frames are unioned and deduplicated on ``link``: for each
    link only the most recently scraped row (largest ``scraped_at``) survives.

    Args:
        current_df: Existing dataset. NOTE(review): ``union`` matches columns
            by position, not name — both frames must share column order.
        increment_df: New batch of rows to fold in.

    Returns:
        DataFrame with at most one row per ``link``.
    """
    union_df = current_df.union(increment_df)
    # BUG FIX: the original ordered ascending, which kept the OLDEST row per
    # link — incremental updates for existing links were silently discarded.
    # Order scraped_at descending so the newest row gets _row_number == 1.
    dedup_window = Window.partitionBy(union_df['link']).orderBy(
        F.col('scraped_at').desc()
    )
    return (
        union_df
        .withColumn('_row_number', F.row_number().over(dedup_window))
        .where(F.col('_row_number') == 1)
        .drop('_row_number')
    )
def my_spark(df: pyspark.sql.DataFrame) -> my_schema:
    """Append a single hard-coded ``("Bob", 10)`` row to ``df``.

    A one-row DataFrame is built on the Flyte task's active Spark session,
    using the column names declared by ``my_schema``, and then unioned
    (position-based) onto the input frame.
    """
    spark = flytekit.current_context().spark_session
    extra_row = spark.createDataFrame([("Bob", 10)], my_schema.column_names())
    return df.union(extra_row)