Example #1
    def test_isDeltaTable(self) -> None:
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("delta").save(tempFile2)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile), False)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)
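A hedged companion check, not part of the original test: isDeltaTable is also expected to return False for a path that holds no Delta table at all, which is why the later examples use it as a create-if-missing guard. The method and path name below are illustrative only.

    def test_isDeltaTable_missing_path(self) -> None:
        # Illustrative assumption: a path with no Delta log is reported as
        # not being a Delta table, so callers can safely create one there.
        missing_path = self.tempFile + '_missing'
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, missing_path), False)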
Example #2
def createDeltaBackedState(tableName, overwrite=False):

    from delta.tables import DeltaTable
    import pyspark.sql.types as T

    db_location = "dbfs:/home/[email protected]/streamingWorkshop/db"
    db_table_name = "sw_db." + tableName
    checkpoint_location = db_location + "/checkpointTables/" + db_table_name

    delta_schema = (T.StructType([
        T.StructField("item_id", T.LongType()),
        T.StructField("timestamp", T.TimestampType()),
        T.StructField("sales", T.LongType())
    ]))

    # Create an empty Delta table if it does not exist. This is required for the MERGE to work in the first mini batch.
    if overwrite or not DeltaTable.isDeltaTable(
            spark, db_location + "/" + db_table_name):
        (spark.createDataFrame([], delta_schema)
         .write.mode("overwrite")
         .option("overwriteSchema", "true")
         .format("delta")
         .saveAsTable(db_table_name))
        spark.sql(
            f"ALTER TABLE {db_table_name} SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = false)"
        )
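A possible call site for the helper above; the table name is an assumption, not taken from the workshop code.

# Hypothetical usage: make sure the Delta-backed state table exists before
# the first streaming micro-batch tries to MERGE into it.
createDeltaBackedState("itemSalesState")

# Recreate it from scratch, e.g. after a schema change.
createDeltaBackedState("itemSalesState", overwrite=True)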
Example #3
    def _merge_into_table(self, df, destination_path, checkpoints_path, condition):
        """ Merges data from the given dataframe into the Delta table at the specified destination_path, based on the given condition.
            If no Delta table exists at the specified destination_path, a new Delta table is created and the data from the given dataframe is inserted.
            e.g. _merge_into_table(df_lookup, np_destination_path, source_path + '/_checkpoints/delta_np', "current.id_pseudonym = updates.id_pseudonym")
        """
        if DeltaTable.isDeltaTable(spark, destination_path):
            dt = DeltaTable.forPath(spark, destination_path)

            def upsert(batch_df, batchId):
                dt.alias("current").merge(
                    batch_df.alias("updates"), condition).whenMatchedUpdateAll(
                    ).whenNotMatchedInsertAll().execute()

            query = df.writeStream.format("delta").foreachBatch(
                upsert).outputMode("update").trigger(once=True).option(
                    "checkpointLocation", checkpoints_path)
        else:
            logger.info(
                f'Delta table does not yet exist at {destination_path} - creating one now and inserting initial data.'
            )
            query = df.writeStream.format("delta").outputMode(
                "append").trigger(once=True).option("checkpointLocation",
                                                    checkpoints_path)
        query = query.start(destination_path)
        # Block until the query terminates (via stop() or an error); a
        # StreamingQueryException is raised if an exception occurs.
        query.awaitTermination()
        logger.info(query.lastProgress)
Example #4
    def merge_write(logger, df_dict: Dict[str, DataFrame],
                    rules: Dict[str, str], output_path: str,
                    spark: SparkSession):
        """
        Write data if the dataset doesn't exist or merge it to the existing dataset
        Args:
            logger: Logger instance used to log events
            df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
            rules: Matching rules used to merge
            output_path: Path to write the data
            spark: Spark instance

        Returns:

        """
        try:
            from delta.tables import DeltaTable
            for df_name, df in df_dict.items():
                file_path = path.join(output_path, df_name)
                if DeltaTable.isDeltaTable(spark, file_path):
                    delta_table = DeltaTable.forPath(spark, file_path)
                    delta_table.alias("old").merge(
                        df.alias("new"), rules.get(df_name)
                    ).whenMatchedUpdateAll().whenNotMatchedInsertAll()
                else:
                    df.write.format("delta").save(file_path)

        except Exception as e:
            logger.error(
                "Writing sanitized data couldn't be performed: {}\n{}".format(
                    e, traceback.format_exc()))
            raise e
        else:
            logger.info("Sanitized dataframes written in {} folder".format(
                output_path))
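A sketch of how merge_write might be invoked, assuming it is reachable as a plain function or static helper; the dataframe, rule, and output path are assumptions chosen to match the "old"/"new" aliases used in the merge above.

# Hypothetical call site: df_customers, logger, and spark are assumed to
# exist; rule keys must match the keys of df_dict.
merge_write(
    logger,
    df_dict={"customers": df_customers},
    rules={"customers": "old.customer_id = new.customer_id"},
    output_path="/mnt/sanitized",
    spark=spark,
)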
Example #5
def get_delta_table(
        spark: SparkSession,
        schema: StructType,
        delta_library_jar: str,
        delta_path: str):
    # load delta library jar, so we can use delta module
    spark.sparkContext.addPyFile(delta_library_jar)
    from delta.tables import DeltaTable

    # check existence of delta table
    if not DeltaTable.isDeltaTable(spark, delta_path):
        print(f">>> Delta table: {delta_path} is not initialized, performing initialization..")
        df = spark.createDataFrame([], schema=schema)
        df.write.format("delta").save(delta_path)

    return DeltaTable.forPath(spark, delta_path)
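A short usage sketch for get_delta_table; the schema, jar path, table path, and updates_df below are all assumptions made for illustration.

from pyspark.sql.types import LongType, StringType, StructField, StructType

# Hypothetical schema and locations.
schema = StructType([
    StructField("id", LongType()),
    StructField("value", StringType()),
])
events = get_delta_table(spark, schema, "/jars/delta-core.jar", "/data/events")

# The returned handle supports the usual DeltaTable operations, e.g. an upsert
# from an updates_df DataFrame assumed to exist:
(events.alias("t")
 .merge(updates_df.alias("u"), "t.id = u.id")
 .whenMatchedUpdateAll()
 .whenNotMatchedInsertAll()
 .execute())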