def test_convertToDelta(self) -> None:
    """Exercise DeltaTable.convertToDelta on an unpartitioned table and on
    partitioned tables, with the partition schema given both as a
    StructType and as a DDL string.
    """
    rows = [('a', 1), ('b', 2), ('c', 3)]
    source_df = self.spark.createDataFrame(rows, ["key", "value"])

    # Unpartitioned parquet -> delta.
    source_df.write.format("parquet").save(self.tempFile)
    converted = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % self.tempFile)
    self.__checkAnswer(
        self.spark.read.format("delta").load(self.tempFile), rows)

    # Partitioned parquet -> delta, partition schema as a StructType.
    struct_schema_path = self.tempFile + "_2"
    source_df.write.partitionBy("value").format("parquet").save(struct_schema_path)
    partition_schema = StructType().add("value", IntegerType(), True)
    converted = DeltaTable.convertToDelta(
        self.spark, "parquet.`%s`" % struct_schema_path, partition_schema)
    self.__checkAnswer(
        self.spark.read.format("delta").load(struct_schema_path), rows)
    self.assertEqual(type(converted), DeltaTable)

    # Partitioned parquet -> delta, partition schema as a DDL string.
    ddl_schema_path = self.tempFile + "_3"
    source_df.write.partitionBy("value").format("parquet").save(ddl_schema_path)
    converted = DeltaTable.convertToDelta(
        self.spark, "parquet.`%s`" % ddl_schema_path, "value int")
    self.__checkAnswer(
        self.spark.read.format("delta").load(ddl_schema_path), rows)
    self.assertEqual(type(converted), DeltaTable)
def test_convertToDelta(self):
    """Test DeltaTable.convertToDelta on an unpartitioned parquet table and
    on a partitioned one (partition schema given as a StructType).

    Fix: the original never verified the value returned by convertToDelta;
    now we assert it is a DeltaTable after each conversion, consistent with
    the other convertToDelta test in this file.
    """
    df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
    df.write.format("parquet").save(self.tempFile)
    # Kept as an instance attribute rather than a local — presumably so
    # tearDown can clean the directory up; TODO confirm against the fixture.
    self.tempFile2 = self.tempFile + "_"

    # Unpartitioned parquet -> delta.
    dt = DeltaTable.convertToDelta(self.spark, "parquet.`" + self.tempFile + "`")
    self.__checkAnswer(
        self.spark.read.format("delta").load(self.tempFile),
        [('a', 1), ('b', 2), ('c', 3)])
    self.assertEqual(type(dt), DeltaTable)

    # Partitioned parquet -> delta: the partition columns must be supplied
    # as a schema because parquet does not record them itself.
    df.write.partitionBy("value").format("parquet").save(self.tempFile2)
    schema = StructType()
    schema.add("value", IntegerType(), True)
    dt = DeltaTable.convertToDelta(self.spark, "parquet.`" + self.tempFile2 + "`", schema)
    self.__checkAnswer(
        self.spark.read.format("delta").load(self.tempFile2),
        [('a', 1), ('b', 2), ('c', 3)])
    self.assertEqual(type(dt), DeltaTable)
.getOrCreate()  # completes the SparkSession builder chain started above this chunk

# Clear previous run's delta-tables
try:
    shutil.rmtree("/tmp/delta-table")
# NOTE(review): bare `except` swallows everything (including KeyboardInterrupt);
# catching FileNotFoundError would be safer — left unchanged here.
except:
    pass

# Create a table
print("########### Create a Parquet table ##############")
data = spark.range(0, 5)
data.write.format("parquet").save("/tmp/delta-table")

# Convert to delta: rewrites the parquet directory's metadata in place so it
# becomes a Delta table readable via format("delta").
print("########### Convert to Delta ###########")
DeltaTable.convertToDelta(spark, "parquet.`/tmp/delta-table`")

# Read the table
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

print("######## Vacuum the table ########")
deltaTable.vacuum()

print("######## Describe history for the table ######")
deltaTable.history().show()

# Generate manifest (symlink manifest files for engines such as Presto/Athena)
print("######## Generating manifest ######")
deltaTable.generate("SYMLINK_FORMAT_MANIFEST")
# MAGIC # MAGIC DESCRIBE DETAIL health_tracker_processed # COMMAND ---------- # MAGIC %md # MAGIC Convert parquet table to delta table # COMMAND ---------- from delta.tables import DeltaTable parquet_table = f"parquet.`{health_tracker}processed`" partitioning_scheme = "p_device_id int" DeltaTable.convertToDelta(spark, parquet_table, partitioning_scheme) # COMMAND ---------- # MAGIC %md # MAGIC Register delta table in the metastore # COMMAND ---------- # MAGIC %sql # MAGIC # MAGIC DROP TABLE IF EXISTS health_tracker_processed; # MAGIC # MAGIC CREATE TABLE health_tracker_processed # MAGIC USING DELTA # MAGIC LOCATION "/dbacademy/$username/DLRS/healthtracker/processed"