def test_auto_mapper_complex_with_extension(spark_session: SparkSession) -> None:
    """Map a complex type carrying a nested extension list and verify the output."""
    # Arrange: register source ("patients") and destination ("members") views.
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", 45), (2, "Vidal", "Michael", 35)],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act: build a complex mapping that includes a processing-status extension.
    extensions = AutoMapperList(
        [
            MyProcessingStatusExtension(
                processing_status=A.text("foo"),
                request_id=A.text("bar"),
                date_processed=A.date("2021-01-01"),
            )
        ]
    )
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=extensions,
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: both the generated column specs and the transformed data match.
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age")
    )

    result_df.printSchema()
    result_df.show(truncate=False)

    assert (
        result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"
    )
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
    """A source column that is already date-typed should pass through A.date() unparsed."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", "1970-01-01"), (2, "Vidal", "Michael", "1970-02-02")],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    # Convert the string column to a real date so the mapper sees a typed column.
    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd")
    )
    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    # Already a date: the spec is a plain aliased column, no format parsing.
    assert str(sql_expressions["birthDate"]) == str(
        col("b.date_of_birth").alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    first_row = result_df.where("member_id == 1").select("birthDate").collect()
    assert first_row[0][0] == date(1970, 1, 1)
    second_row = result_df.where("member_id == 2").select("birthDate").collect()
    assert second_row[0][0] == date(1970, 2, 2)
    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_date_literal(spark_session: SparkSession) -> None:
    """A literal date string should be parsed through the coalesced format list."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran"), (2, "Vidal", "Michael")],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(birthDate=A.date("1970-01-01"))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    # The literal is tried against each supported date format in turn.
    expected = coalesce(
        to_date(lit("1970-01-01"), format="y-M-d"),
        to_date(lit("1970-01-01"), format="yyyyMMdd"),
        to_date(lit("1970-01-01"), format="M/d/y"),
    ).alias("birthDate")
    assert str(sql_expressions["birthDate"]) == str(expected)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: every row carries the same literal date.
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
    assert (
        result_df.where("member_id == 2").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
def test_auto_mapper_date_column(spark_session: SparkSession) -> None:
    """String dates in mixed formats should each parse to a proper date value."""
    # Arrange: note the two rows use different date formats on purpose.
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", "1970-01-01"), (2, "Vidal", "Michael", "12/31/2020")],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    # The column is tried against each supported format until one parses.
    expected = coalesce(
        to_date(col("b.date_of_birth"), format="y-M-d"),
        to_date(col("b.date_of_birth"), format="yyyyMMdd"),
        to_date(col("b.date_of_birth"), format="M/d/y"),
    ).alias("birthDate")
    assert str(sql_expressions["birthDate"]) == str(expected)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
    assert (
        result_df.where("member_id == 2").select("birthDate").collect()[0][0]
        == date(2020, 12, 31)
    )
    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_schema_pruning_with_extension(
    spark_session: SparkSession,
) -> None:
    """With schema pruning enabled, the output schema keeps only populated fields."""
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", 45), (2, "Vidal", "Michael", 35)],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")

    # Act: complex mapping with a nested extension, schema pruning on.
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        enable_schema_pruning=True,
        skip_schema_validation=[],
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=AutoMapperList(
                [
                    MyProcessingStatusExtension(
                        processing_status=A.text("foo"),
                        request_id=A.text("bar"),
                        date_processed=A.date("2021-01-01"),
                    )
                ]
            ),
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert: column specs and transformed values.
    assert_compare_expressions(
        sql_expressions["name"], col("b.last_name").cast("string").alias("name")
    )
    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("long").alias("age")
    )
    result_df.printSchema()
    result_df.show(truncate=False)

    assert (
        result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"
    )
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

    # Confirm the pruned schema matches the expected nested extension structure.
    inner_extension = StructType(
        [
            StructField("url", StringType()),
            StructField("valueString", StringType()),
        ]
    )
    outer_extension = StructType(
        [
            StructField("url", StringType()),
            StructField("extension", ArrayType(inner_extension)),
        ]
    )
    expected_schema: StructType = StructType(
        [
            StructField("name", StringType(), False),
            StructField("age", LongType(), True),
            StructField("extension", ArrayType(outer_extension), True),
        ]
    )
    result: SchemaComparerResult = SchemaComparer.compare_schema(
        parent_column_name=None,
        source_schema=result_df.schema,
        desired_schema=expected_schema,
    )
    assert result.errors == [], str(result)
def test_auto_mapper_fhir_patient_resource(spark_session: SparkSession) -> None:
    """Map raw member rows to a FHIR Patient resource and spot-check the fields."""
    # Arrange: member 2 has a null gender to exercise the if_not_null branch.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))]
            ),
            gender=A.if_not_null(
                A.column("my_gender"),
                AdministrativeGenderCode(A.column("my_gender")),
            ),
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for key, expr in sql_expressions.items():
        print(f"{key}: {expr}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: one spec per mapped field (id, resourceType, birthDate, name, gender).
    assert len(sql_expressions) == 5
    # FhirId sanitizes the key to the FHIR id charset and truncates to 63 chars.
    assert str(sql_expressions["id"]) == str(
        substring(regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"), 0, 63)
        .cast("string")
        .alias("id")
    )
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").cast("string").alias("resourceType")
    )
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        )
        .cast("date")
        .alias("birthDate")
    )
    # NOTE(review): expression-level checks for "name" and "gender" are
    # intentionally omitted here; the originals were disabled — presumably the
    # lambda-based filter/when expressions do not compare stably as strings.

    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0]
        == "Vidal"
    )