# Pytest fixtures for the medallion-pipeline tests. The imports and the
# @pytest.fixture decorators below are inferred from the yield/teardown
# pattern; the home of until_stream_is_ready is assumed to be
# pipelines.utility, alongside load_dataframe.
import pytest
from shutil import rmtree

from pyspark.sql import DataFrame, SparkSession

from pipelines.config import paths, schemas
from pipelines.utility import load_dataframe, until_stream_is_ready


@pytest.fixture
def full_silver_df(spark: SparkSession) -> DataFrame:
    stream_name = "create_silver"
    silver_json_df = load_dataframe(
        spark,
        format="json",
        path=paths.test_silver,
        schema=schemas.silver,
        streaming=True,
    )
    # Stream the test JSON into a Delta table partitioned by event date.
    (
        silver_json_df.writeStream.format("delta")
        .partitionBy("p_eventdate")
        .outputMode("append")
        .option("checkpointLocation", paths.silver_checkpoint)
        .option("path", paths.silver)
        .queryName(stream_name)
        .start()
    )
    until_stream_is_ready(spark, stream_name)
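    # Hand the materialized silver table to the test; teardown runs after.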
    yield load_dataframe(spark, format="delta", path=paths.silver)
    rmtree(paths.silver)
    rmtree(paths.silver_checkpoint)
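

# until_stream_is_ready is not shown in this snippet. A minimal sketch of
# what it plausibly does — block until the named streaming query exists and
# has made some progress — using only the public Structured Streaming API
# (name and signature are assumptions):
def until_stream_is_ready_sketch(
    spark: SparkSession, name: str, min_batches: int = 2
) -> None:
    import time

    while True:
        matches = [q for q in spark.streams.active if q.name == name]
        # Consider the stream ready once it has processed a few micro-batches.
        if matches and len(matches[0].recentProgress) >= min_batches:
            return
        time.sleep(5)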


@pytest.fixture
def loaded_raw_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="text",
        path=paths.test_raw,
        schema=schemas.raw,
        streaming=True,
    )


@pytest.fixture
def full_bronze_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="json",
        path=paths.test_bronze,
        schema=schemas.bronze,
        streaming=True,
    )


@pytest.fixture
def loaded_silver_df(spark: SparkSession) -> DataFrame:
    yield load_dataframe(
        spark,
        format="delta",
        path=paths.silver,
        schema=schemas.silver,
        streaming=True,
    )
    rmtree(paths.silver)
    rmtree(paths.silver_checkpoint)
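

# load_dataframe is likewise external to this snippet. A plausible sketch,
# assuming it is a thin wrapper that picks a batch or streaming reader and
# applies an optional schema (signature inferred from the call sites above):
from typing import Optional

from pyspark.sql.types import StructType


def load_dataframe_sketch(
    spark: SparkSession,
    format: str,
    path: str,
    schema: Optional[StructType] = None,
    streaming: bool = False,
) -> DataFrame:
    reader = spark.readStream if streaming else spark.read
    if schema is not None:
        reader = reader.schema(schema)
    return reader.format(format).load(path)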


def update_silver_table(spark: SparkSession, silverPath: str) -> bool:
    from delta.tables import DeltaTable

    # Load the current silver data and a DeltaTable handle for the in-place merge.
    silver_df = load_dataframe(spark, format="delta", path=silverPath)
    silverTable = DeltaTable.forPath(spark, silverPath)

    update_match = """
    health_tracker.eventtime = updates.eventtime
    AND
    health_tracker.device_id = updates.device_id
    """

    update = {"heartrate": "updates.heartrate"}

    updates_df = prepare_interpolated_updates_dataframe(spark, silver_df)

    (
        silverTable.alias("health_tracker")
        .merge(updates_df.alias("updates"), update_match)
        .whenMatchedUpdate(set=update)
        .execute()
    )

    return True
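

# prepare_interpolated_updates_dataframe is defined elsewhere. A minimal
# sketch of the idea — repair broken heart-rate readings by averaging the
# neighboring readings per device — assuming negative values flag bad rows
# (the column names come from the merge condition above; everything else
# here is an assumption):
from pyspark.sql import functions as F
from pyspark.sql.window import Window


def prepare_interpolated_updates_dataframe_sketch(
    spark: SparkSession, silver_df: DataFrame
) -> DataFrame:
    w = Window.partitionBy("device_id").orderBy("eventtime")
    # Midpoint of the previous and next reading for the same device.
    interpolated = (
        F.lag("heartrate", 1).over(w) + F.lead("heartrate", 1).over(w)
    ) / 2
    return (
        silver_df.withColumn("interpolated", interpolated)
        .where(F.col("heartrate") < 0)
        .withColumn("heartrate", F.col("interpolated"))
        .drop("interpolated")
    )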


# Databricks notebook source
from shutil import rmtree
from pipelines.config import paths, schemas
from pipelines.operations import create_batch_writer, transform_bronze, transform_raw
from pipelines.utility import generate_spark_session, load_dataframe
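
# create_batch_writer and the transform_* functions live in
# pipelines.operations (not shown here). For orientation, a plausible sketch
# of create_batch_writer, assuming it returns a partitioned, append-mode
# batch DataFrameWriter bound to a target path:
def create_batch_writer_sketch(dataframe, path, partition_column, format="delta"):
    return (
        dataframe.write.format(format)
        .mode("append")
        .partitionBy(partition_column)
        .option("path", path)  # caller then invokes .save() on the result
    )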

if __name__ == "__main__":
    spark = generate_spark_session()

    # Start from a clean slate: clear output left by any previous run.
    rmtree(paths.test_bronze, ignore_errors=True)
    rmtree(paths.test_silver, ignore_errors=True)

    raw_df = load_dataframe(spark,
                            format="text",
                            path=paths.test_raw,
                            schema=schemas.raw)

    transformed_raw_df = transform_raw(spark, raw_df)

    raw_to_bronze_json_writer = create_batch_writer(
        dataframe=transformed_raw_df,
        path=paths.test_bronze,
        partition_column="p_ingestdate",
        format="json",
    )
    raw_to_bronze_json_writer.save()

    # Reuse the in-memory transformed frame as the bronze input rather than
    # re-reading the JSON just written.
    bronze_df = transformed_raw_df
    transformed_bronze_df = transform_bronze(spark, bronze_df)

    bronze_to_silver_json_writer = create_batch_writer(