def test_standardize_sensordata(spark):
    """Test data transform"""
    # Arrange: load the raw sensor sample with the declared input schema.
    input_schema = standardize.get_schema("in_sensordata_schema")
    raw_sensordata_sdf = spark.read.json(
        "./data/MelbParkingSensorData.json", multiLine=True, schema=input_schema)
    run_id = 1
    run_started_at = datetime.datetime.now()

    # Act: run the standardization transform under test.
    standardized_sdf, malformed_sdf = standardize.standardize_sensordata(
        raw_sensordata_sdf, run_id, run_started_at)

    # Assert: rows were produced, none were malformed, and every
    # standardized row carries a non-null bay_id.
    assert standardized_sdf.count() != 0
    assert malformed_sdf.count() == 0
    assert standardized_sdf.filter(isnull("bay_id")).count() == 0
import datetime

# For testing
# infilefolder = 'datalake/data/lnd/2019_03_11_01_38_00/'
load_id = loadid
loaded_on = datetime.datetime.now()
base_path = os.path.join('dbfs:/mnt/datalake/data/lnd/', infilefolder)
parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json")
sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json")

# COMMAND ----------

import ddo_transform.standardize as s

# Retrieve schema
parkingbay_schema = s.get_schema("in_parkingbay_schema")
sensordata_schema = s.get_schema("in_sensordata_schema")


def _read_multiline_json(filepath, schema, corrupt_subdir):
    """Read a multi-line JSON file with an explicit schema.

    Records that fail to parse against *schema* are diverted to a
    per-dataset badRecordsPath under ``<base_path>/__corrupt/``
    instead of aborting the read.
    """
    return (spark.read
            .schema(schema)
            .option("badRecordsPath", os.path.join(base_path, "__corrupt", corrupt_subdir))
            .option("multiLine", True)
            .json(filepath))


# Read data — both datasets share the same reader configuration,
# so the helper above keeps the two reads consistent.
parkingbay_sdf = _read_multiline_json(
    parkingbay_filepath, parkingbay_schema, "MelbParkingBayData")
sensordata_sdf = _read_multiline_json(
    sensors_filepath, sensordata_schema, "MelbParkingSensorData")

# Standardize