def test_isDeltaTable(self) -> None:
    df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
    # Write one copy as plain parquet and one as Delta.
    df.write.format("parquet").save(self.tempFile)
    tempFile2 = self.tempFile + '_2'
    df.write.format("delta").save(tempFile2)
    # Only the Delta-formatted path should be recognized as a Delta table.
    self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile), False)
    self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)
def createDeltaBackedState(tableName, overwrite=False):
    from delta.tables import DeltaTable
    import pyspark.sql.types as T

    db_location = "dbfs:/home/[email protected]/streamingWorkshop/db"
    db_table_name = "sw_db." + tableName
    checkpoint_location = db_location + "/checkpointTables/" + db_table_name
    delta_schema = T.StructType([
        T.StructField("item_id", T.LongType()),
        T.StructField("timestamp", T.TimestampType()),
        T.StructField("sales", T.LongType())
    ])

    # Create an empty Delta table if it does not exist. This is required for
    # the MERGE to work in the first mini batch.
    if overwrite or not DeltaTable.isDeltaTable(spark, db_location + "/" + db_table_name):
        (spark.createDataFrame([], delta_schema)
            .write.mode("overwrite")
            .option("overwriteSchema", "true")
            .format("delta")
            .saveAsTable(db_table_name))
        spark.sql(
            f"ALTER TABLE {db_table_name} SET TBLPROPERTIES ("
            "delta.autoOptimize.optimizeWrite = true, "
            "delta.autoOptimize.autoCompact = false)"
        )
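A minimal sketch of how the pre-created state table could then receive per-batch MERGEs, which is what the "required for the MERGE to work in the first mini batch" comment alludes to. It assumes an active spark session and the sw_db database already exist; the table name and join key are illustrative only.

from delta.tables import DeltaTable

createDeltaBackedState("item_sales", overwrite=True)

def upsert_batch(batch_df, batch_id):
    # MERGE each micro-batch into the pre-created Delta table, keyed on item_id.
    target = DeltaTable.forName(spark, "sw_db.item_sales")
    (target.alias("t")
        .merge(batch_df.alias("s"), "t.item_id = s.item_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())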
def _merge_into_table(self, df, destination_path, checkpoints_path, condition):
    """
    Merges data from the given dataframe into the delta table at the specified
    destination_path, based on the given condition. If no delta table exists at
    the specified destination_path, a new delta table is created and the data
    from the given dataframe is inserted.

    e.g. merge_into_table(df_lookup, np_destination_path,
                          source_path + '/_checkpoints/delta_np',
                          "current.id_pseudonym = updates.id_pseudonym")
    """
    if DeltaTable.isDeltaTable(spark, destination_path):
        dt = DeltaTable.forPath(spark, destination_path)

        def upsert(batch_df, batchId):
            (dt.alias("current")
               .merge(batch_df.alias("updates"), condition)
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

        query = (df.writeStream.format("delta")
                   .foreachBatch(upsert)
                   .outputMode("update")
                   .trigger(once=True)
                   .option("checkpointLocation", checkpoints_path))
    else:
        logger.info(
            f'Delta table does not yet exist at {destination_path} - '
            'creating one now and inserting initial data.')
        query = (df.writeStream.format("delta")
                   .outputMode("append")
                   .trigger(once=True)
                   .option("checkpointLocation", checkpoints_path))

    query = query.start(destination_path)
    # Block until the query terminates, either via stop() or with an error;
    # a StreamingQueryException is thrown if an exception occurs.
    query.awaitTermination()
    logger.info(query.lastProgress)
def merge_write(logger, df_dict: Dict[str, DataFrame], rules: Dict[str, str],
                output_path: str, spark: SparkSession):
    """
    Write the data if the dataset doesn't exist, or merge it into the existing dataset.

    Args:
        logger: Logger instance used to log events
        df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
        rules: Matching rules used to merge
        output_path: Path to write the data
        spark: Spark instance

    Returns:
        None
    """
    try:
        from delta.tables import DeltaTable

        for df_name, df in df_dict.items():
            file_path = path.join(output_path, df_name)
            if DeltaTable.isDeltaTable(spark, file_path):
                delta_table = DeltaTable.forPath(spark, file_path)
                # execute() is required to actually run the merge.
                (delta_table.alias("old")
                    .merge(df.alias("new"), rules.get(df_name))
                    .whenMatchedUpdateAll()
                    .whenNotMatchedInsertAll()
                    .execute())
            else:
                df.write.format("delta").save(file_path)
    except Exception as e:
        logger.error("Writing sanitized data couldn't be performed: {}".format(e),
                     traceback.format_exc())
        raise e
    else:
        logger.info("Sanitized dataframes written in {} folder".format(output_path))
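A minimal usage sketch, assuming an active spark session; the dataset name, join key, and output location are illustrative only. Each key in df_dict becomes a sub-folder under output_path, and the matching rule registered under the same name drives the MERGE condition.

import logging

logger = logging.getLogger(__name__)
customers_df = spark.createDataFrame([(1, "alice")], ["id", "name"])

merge_write(
    logger,
    df_dict={"customers": customers_df},
    rules={"customers": "old.id = new.id"},
    output_path="/tmp/sanitized",  # hypothetical output location
    spark=spark,
)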
def get_delta_table(
        spark: SparkSession,
        schema: StructType,
        delta_library_jar: str,
        delta_path: str):
    # Load the delta library jar so we can use the delta module.
    spark.sparkContext.addPyFile(delta_library_jar)
    from delta.tables import DeltaTable

    # If no Delta table exists at the path yet, initialize it with an empty
    # DataFrame using the provided schema.
    if not DeltaTable.isDeltaTable(spark, delta_path):
        print(f">>> Delta table: {delta_path} is not initialized, performing initialization..")
        df = spark.createDataFrame([], schema=schema)
        df.write.format("delta").save(delta_path)

    return DeltaTable.forPath(spark, delta_path)
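A minimal sketch of calling get_delta_table, assuming an active spark session; the jar path, table path, and schema are illustrative only. The returned DeltaTable can be read back as a DataFrame or used as a MERGE target.

from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([
    StructField("id", LongType()),
    StructField("value", StringType()),
])

dt = get_delta_table(spark, schema, "/path/to/delta-core.jar", "/tmp/events_delta")
print(dt.toDF().count())  # 0 on first initialization, since the table starts empty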