# Test fixtures for the streaming pipeline. The yield-plus-teardown pattern
# below is the pytest fixture idiom; the @pytest.fixture decorators and the
# exact import locations are assumed, since the excerpt does not show them.
import pytest
from shutil import rmtree

from pyspark.sql import DataFrame, SparkSession

from pipelines.config import paths, schemas
from pipelines.utility import load_dataframe, until_stream_is_ready


@pytest.fixture
def full_silver_df(spark: SparkSession) -> DataFrame:
    stream_name = "create_silver"
    silver_json_df = load_dataframe(
        spark,
        format="json",
        path=paths.test_silver,
        schema=schemas.silver,
        streaming=True,
    )
    # Stream the json test data into the silver Delta table, partitioned by
    # event date, and wait for the query to make progress before yielding.
    (
        silver_json_df.writeStream.format("delta")
        .partitionBy("p_eventdate")
        .outputMode("append")
        .option("checkpointLocation", paths.silver_checkpoint)
        .option("path", paths.silver)
        .queryName(stream_name)
        .start()
    )
    until_stream_is_ready(spark, stream_name)
    yield load_dataframe(spark, format="delta", path=paths.silver)
    # Teardown: remove the silver table and its checkpoint directory.
    rmtree(paths.silver)
    rmtree(paths.silver_checkpoint)
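# `until_stream_is_ready` blocks until the named streaming query is up. Its
# implementation is not shown in this excerpt; a minimal sketch, assuming it
# polls the active queries until one with the expected name has recorded a
# few progress updates (the interval and threshold here are illustrative):
import time

from pyspark.sql import SparkSession


def until_stream_is_ready(
    spark: SparkSession, stream_name: str, min_progressions: int = 3
) -> bool:
    queries = [q for q in spark.streams.active if q.name == stream_name]
    while len(queries) == 0 or len(queries[0].recentProgress) < min_progressions:
        time.sleep(5)
        queries = [q for q in spark.streams.active if q.name == stream_name]
    return True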
@pytest.fixture
def loaded_raw_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="text",
        path=paths.test_raw,
        schema=schemas.raw,
        streaming=True,
    )
@pytest.fixture
def full_bronze_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="json",
        path=paths.test_bronze,
        schema=schemas.bronze,
        streaming=True,
    )
@pytest.fixture
def loaded_silver_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="delta",
        path=paths.silver,
        schema=schemas.silver,
        streaming=True,
    )
    # Teardown: remove the silver table and its checkpoint directory.
    rmtree(paths.silver)
    rmtree(paths.silver_checkpoint)
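# `load_dataframe` is the shared reader used by every fixture above. A
# minimal sketch of what it might look like, assuming it switches between
# batch and streaming readers on the `streaming` flag and applies the
# schema only when one is supplied (Delta sources carry their own):
from typing import Optional

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StructType


def load_dataframe(
    spark: SparkSession,
    format: str,
    path: str,
    schema: Optional[StructType] = None,
    streaming: bool = False,
) -> DataFrame:
    reader = spark.readStream if streaming else spark.read
    if schema is not None:
        # File sources such as json and text need an explicit schema when
        # read as a stream; Delta reads its schema from the table itself.
        reader = reader.schema(schema)
    return reader.format(format).load(path)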
def update_silver_table(spark: SparkSession, silverPath: str) -> bool:
    from delta.tables import DeltaTable

    silver_df = load_dataframe(spark, format="delta", path=silverPath)
    silverTable = DeltaTable.forPath(spark, silverPath)

    # Match silver records to their interpolated updates on event time and
    # device id, and overwrite only the heartrate column on matched rows.
    update_match = """
    health_tracker.eventtime = updates.eventtime
    AND health_tracker.device_id = updates.device_id
    """
    update = {"heartrate": "updates.heartrate"}

    updates_df = prepare_interpolated_updates_dataframe(spark, silver_df)
    (
        silverTable.alias("health_tracker")
        .merge(updates_df.alias("updates"), update_match)
        .whenMatchedUpdate(set=update)
        .execute()
    )
    return True
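# A hedged usage sketch: a test might run the merge and re-read the Delta
# table to verify its effect. The import locations and the assertion (that
# no null heart rates survive the interpolation) are assumptions about the
# surrounding repo, not shown in this excerpt.
from pyspark.sql import SparkSession

from pipelines.config import paths
from pipelines.operations import update_silver_table
from pipelines.utility import load_dataframe


def test_update_silver_table(spark: SparkSession) -> None:
    assert update_silver_table(spark, paths.silver)
    updated_df = load_dataframe(spark, format="delta", path=paths.silver)
    assert updated_df.filter("heartrate IS NULL").count() == 0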
# Databricks notebook source
from shutil import rmtree

from pipelines.config import paths, schemas
from pipelines.operations import create_batch_writer, transform_bronze, transform_raw
from pipelines.utility import generate_spark_session, load_dataframe

if __name__ == "__main__":
    spark = generate_spark_session()

    # Start from a clean slate: drop any test data left by previous runs.
    rmtree(paths.test_bronze, ignore_errors=True)
    rmtree(paths.test_silver, ignore_errors=True)

    # Raw to bronze: parse the raw text records and write them as json test
    # data, partitioned by ingest date.
    raw_df = load_dataframe(
        spark, format="text", path=paths.test_raw, schema=schemas.raw
    )
    transformed_raw_df = transform_raw(spark, raw_df)
    raw_to_bronze_json_writer = create_batch_writer(
        dataframe=transformed_raw_df,
        path=paths.test_bronze,
        partition_column="p_ingestdate",
        format="json",
    )
    raw_to_bronze_json_writer.save()

    # Bronze to silver: reuse the in-memory bronze frame rather than
    # re-reading the json just written, then write silver test data
    # partitioned by event date, matching the full_silver_df fixture.
    bronze_df = transformed_raw_df
    transformed_bronze_df = transform_bronze(spark, bronze_df)
    bronze_to_silver_json_writer = create_batch_writer(
        dataframe=transformed_bronze_df,
        path=paths.test_silver,
        partition_column="p_eventdate",
        format="json",
    )
    bronze_to_silver_json_writer.save()
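# `create_batch_writer` is imported from pipelines.operations above but its
# body is not shown. A minimal sketch, assuming it merely configures a batch
# DataFrameWriter with the format, mode, partitioning, and target path used
# by this script (the "append" default is an assumption):
from pyspark.sql import DataFrame, DataFrameWriter


def create_batch_writer(
    dataframe: DataFrame,
    path: str,
    partition_column: str,
    format: str = "delta",
    mode: str = "append",
) -> DataFrameWriter:
    # Returning the configured writer, rather than saving immediately, lets
    # the caller decide when to invoke .save(), as the script above does.
    return (
        dataframe.write.format(format)
        .mode(mode)
        .partitionBy(partition_column)
        .option("path", path)
    )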