def test_automapper_null_remover(spark_session: SparkSession) -> None:
    """Verify that ``remove_null_or_empty`` strips null/empty entries from a
    nested ``address.line`` array while sanitizing each remaining element."""
    clean_spark_session(spark_session)

    # Sample patient data lives in data.json next to this test module.
    data_json_file: Path = Path(__file__).parent / "data.json"

    source_df: DataFrame = spark_session.read.json(
        str(data_json_file), multiLine=True
    )
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act: for each address, sanitize every line and drop null/empty lines.
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line")
                    .select(A.current().sanitize())
                    .remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for name, expression in sql_expressions.items():
        print(f"{name}: {expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert: the first address keeps only the two sanitized, non-empty lines.
    first_address = result_df.select("address").collect()[0][0]
    print(first_address)
    assert first_address[0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]
    result_df.show(truncate=False)
# Example 2
def test_automapper_if_not_null(spark_session: SparkSession) -> None:
    """``if_not_null`` should map the column value when present and fall back
    to the literal "100" when the source column is null."""
    # Arrange: two patients, one with a null age.
    rows = [
        (1, "Qureshi", "Imran", "54"),
        (2, "Vidal", "Michael", None),
    ]
    spark_session.createDataFrame(
        rows, ["member_id", "last_name", "first_name", "my_age"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
    ).columns(
        age=A.if_not_null(
            A.column("my_age"),
            A.number(A.column("my_age")),
            A.number(A.text("100")),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for name, expression in sql_expressions.items():
        print(f"{name}: {expression}")

    # The generated expression is a null-check with a cast fallback.
    expected = (
        when(
            col("b.my_age").isNull(),
            lit("100").cast(StringType()).cast(LongType()),
        )
        .otherwise(col("b.my_age").cast(LongType()))
        .alias("age")
    )
    assert_compare_expressions(sql_expressions["age"], expected)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    )
    assert (
        result_df.where("member_id == 2").select("age").collect()[0][0] == 100
    )

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_fhir_patient_resource(
        spark_session: SparkSession) -> None:
    """Map raw patient rows to a FHIR Patient resource and check the
    generated column expressions and the transformed values."""
    # Arrange: two patients, one with a null gender.
    rows = [
        (1, "Qureshi", "Imran", "1970-01-01", "female"),
        (2, "Vidal", "Michael", "1970-02-02", None),
    ]
    spark_session.createDataFrame(
        rows,
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act: build the FHIR Patient mapping.
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [
                    HumanName(
                        use=NameUseCode("usual"),
                        family=A.column("last_name"),
                    )
                ]
            ),
            gender=A.if_not_null(
                A.column("my_gender"),
                AdministrativeGenderCode(A.column("my_gender")),
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for name, expression in sql_expressions.items():
        print(f"{name}: {expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: id, resourceType, birthDate, name, gender -> 5 columns.
    assert len(sql_expressions) == 5

    # FhirId sanitizes the key into the FHIR id character set, max 63 chars.
    assert str(sql_expressions["id"]) == str(
        substring(regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"),
                  0, 63).cast("string").alias("id"))
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").cast("string").alias("resourceType"))
    # Dates are parsed by trying several formats in order.
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        ).cast("date").alias("birthDate"))
    # NOTE: the "name" and "gender" expressions embed lambdas and cannot be
    # reliably compared as strings, so they are validated via the data below.

    result_df.printSchema()
    result_df.show()

    first = result_df.where("member_id == 1")
    second = result_df.where("member_id == 2")

    assert first.selectExpr("name[0].use").collect()[0][0] == "usual"
    assert first.selectExpr("name[0].family").collect()[0][0] == "Qureshi"

    assert second.selectExpr("name[0].use").collect()[0][0] == "usual"
    assert second.selectExpr("name[0].family").collect()[0][0] == "Vidal"
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """``first_valid_column`` should pick the first column that exists in the
    source schema — the same mapper must work against two sources that expose
    different column names ("my_age" vs "age")."""
    # Arrange: first source exposes "my_age".
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df_1: DataFrame = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on sources with different columns, and they both
    # work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.column("age")),
            A.number(A.column("my_age")),
            A.text(None),
        ),
        # age2 nests first_valid_column inside if_not_null and vice versa.
        age2=A.first_valid_column(
            A.if_not_null(
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.number(A.text("100")),
            ),
            A.number(A.column("age")),
            A.number(A.column("his_age")),
            A.number(99999),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions_1: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_1
    )
    for name, expression in sql_expressions_1.items():
        print(f"{name}: {expression}")

    # Against source 1, "my_age" is the first valid candidate.
    assert str(sql_expressions_1["age"]) == str(
        col("b.my_age").cast("long").alias("age"))
    result_df_1: DataFrame = mapper.transform(df=df)

    # Second source renames the column to "age" (with a null for member 2).
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "age"],
    ).createOrReplaceTempView("patients")

    source_df_2 = spark_session.table("patients")

    # NOTE(review): source_df_1 is a lazy reference to the "patients" view,
    # so this resolves against the redefined view — presumably intentional.
    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    sql_expressions_2: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_2
    )
    assert str(sql_expressions_2["age"]) == str(
        col("b.age").cast("long").alias("___age"))

    result_df_2 = mapper.transform(df=df)

    # Assert
    result_df_1.printSchema()
    result_df_1.show()

    result_df_2.printSchema()
    result_df_2.show()

    assert result_df_1.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    assert result_df_1.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (33, 33)

    assert result_df_2.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    # Null age falls through to the if_not_null fallback of 100 for age2.
    assert result_df_2.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (None, 100)