def test_automapper_null_remover(spark_session: SparkSession) -> None:
    """Removes null or empty entries from a nested array column (address.line)."""
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line").select(A.current().sanitize()).remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )

    result_df: DataFrame = mapper.transform(df=source_df)

    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]

    result_df.show(truncate=False)

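# The test above calls a clean_spark_session() helper imported from the shared test
# utilities. The sketch below only illustrates what such a helper typically does
# (drop leftover temp views/tables and clear the cache); the name
# _clean_spark_session_sketch and its exact behavior are assumptions, not the
# library's actual implementation.
def _clean_spark_session_sketch(session: SparkSession) -> None:
    # Drop any tables/views left over from earlier tests so view names can be reused.
    for table in session.catalog.listTables("default"):
        session.sql(f"DROP TABLE IF EXISTS default.{table.name}")
        session.sql(f"DROP VIEW IF EXISTS default.{table.name}")
        session.sql(f"DROP VIEW IF EXISTS {table.name}")
    session.catalog.clearCache()
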
def test_automapper_if_not_null(spark_session: SparkSession) -> None:
    """Maps age from my_age when it is not null, otherwise falls back to a default of 100."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_not_null(
            A.column("my_age"),
            A.number(A.column("my_age")),
            A.number(A.text("100")),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        when(
            col("b.my_age").isNull(),
            lit("100").cast(StringType()).cast(LongType()),
        )
        .otherwise(col("b.my_age").cast(LongType()))
        .alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

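# assert_compare_expressions() used above comes from the test helpers. As a rough,
# assumed illustration of its intent, it checks that two Spark Column expressions are
# equivalent; a minimal version could simply compare their rendered string forms.
# The helper name below and this simplistic comparison are assumptions for clarity,
# not the library's actual implementation.
def _assert_compare_expressions_sketch(actual: Column, expected: Column) -> None:
    # Column objects are not directly comparable, so compare their rendered expressions.
    assert str(actual) == str(expected), f"{actual} != {expected}"
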
def test_auto_mapper_fhir_patient_resource(spark_session: SparkSession) -> None:
    """Maps source rows to a FHIR Patient resource, including an optional gender."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))]
            ),
            gender=A.if_not_null(
                A.column("my_gender"), AdministrativeGenderCode(A.column("my_gender"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert len(sql_expressions) == 5
    assert str(sql_expressions["id"]) == str(
        substring(
            regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"), 0, 63
        )
        .cast("string")
        .alias("id")
    )
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").cast("string").alias("resourceType")
    )
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        )
        .cast("date")
        .alias("birthDate")
    )
    # assert str(sql_expressions["name"]) == str(
    #     filter(
    #         array(
    #             struct(
    #                 lit("usual").alias("use"),
    #                 col("b.last_name").alias("family"),
    #             )
    #         ),
    #         lambda x: x.isNotNull(),
    #     ).alias("name")
    # )
    # assert str(sql_expressions["gender"]) == str(
    #     when(col("b.my_gender").isNull(), None)
    #     .otherwise(col("b.my_gender"))
    #     .alias("gender")
    # )

    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0]
        == "Vidal"
    )

def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """Applies one mapper to sources with different columns; first_valid_column picks whichever exists."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df_1: DataFrame = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on sources
    # with different columns, and they both work as expected.
    # (A plain-PySpark sketch of the idea follows this test.)

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.column("age")),
            A.number(A.column("my_age")),
            A.text(None),
        ),
        age2=A.first_valid_column(
            A.if_not_null(
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.number(A.text("100")),
            ),
            A.number(A.column("age")),
            A.number(A.column("his_age")),
            A.number(99999),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions_1: Dict[str, Column] = mapper.get_column_specs(source_df=source_df_1)
    for column_name, sql_expression in sql_expressions_1.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions_1["age"]) == str(
        col("b.my_age").cast("long").alias("age")
    )

    result_df_1: DataFrame = mapper.transform(df=df)

    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "age"],
    ).createOrReplaceTempView("patients")

    source_df_2 = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    sql_expressions_2: Dict[str, Column] = mapper.get_column_specs(source_df=source_df_2)
    assert str(sql_expressions_2["age"]) == str(
        col("b.age").cast("long").alias("___age")
    )
    result_df_2 = mapper.transform(df=df)

    # Assert
    result_df_1.printSchema()
    result_df_1.show()
    result_df_2.printSchema()
    result_df_2.show()

    assert result_df_1.where("member_id == 1").select("age", "age2").collect()[0][:] == (54, 54)
    assert result_df_1.where("member_id == 2").select("age", "age2").collect()[0][:] == (33, 33)
    assert result_df_2.where("member_id == 1").select("age", "age2").collect()[0][:] == (54, 54)
    assert result_df_2.where("member_id == 2").select("age", "age2").collect()[0][:] == (None, 100)

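# Hypothetical illustration (not part of spark-auto-mapper): the behavior exercised
# above can be thought of as "use the first candidate whose source column actually
# exists in the incoming DataFrame". In plain PySpark that idea might look like the
# sketch below; the function name and the cast to long are assumptions for clarity.
def _first_existing_column_sketch(df: DataFrame, *candidate_names: str) -> Column:
    for name in candidate_names:
        if name in df.columns:
            # The first candidate present in this particular source wins,
            # mirroring what A.first_valid_column does for this test.
            return col(name).cast("long")
    # No candidate exists in the source: fall back to a typed null literal.
    return lit(None).cast("long")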