def test_auto_mapper_complex_with_extension(
        spark_session: SparkSession) -> None:
    """
    Verifies that a complex mapping with a nested extension list generates
    the expected column expressions and transformed values.
    """
    # Arrange: build the source "patients" view with two rows
    rows = [
        (1, "Qureshi", "Imran", 45),
        (2, "Vidal", "Michael", 35),
    ]
    headers = ["member_id", "last_name", "first_name", "my_age"]
    spark_session.createDataFrame(rows, headers).createOrReplaceTempView(
        "patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act: map name/age plus a processing-status extension list
    extensions = AutoMapperList([
        MyProcessingStatusExtension(
            processing_status=A.text("foo"),
            request_id=A.text("bar"),
            date_processed=A.date("2021-01-01"),
        )
    ])
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=extensions,
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for name, expr in sql_expressions.items():
        print(f"{name}: {expr}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: generated expressions carry the expected casts and aliases
    expected_name = col("b.last_name").cast("string").alias("name")
    assert str(sql_expressions["name"]) == str(expected_name)
    expected_age = col("b.my_age").cast("long").alias("age")
    assert str(sql_expressions["age"]) == str(expected_age)

    result_df.printSchema()
    result_df.show(truncate=False)

    name_value = result_df.where("member_id == 1").select(
        "name").collect()[0][0]
    assert name_value == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
    """
    Verifies that a source column that is already date-typed is mapped
    straight through without any to_date conversion expression.
    """
    # Arrange: two rows whose birth dates start out as strings
    rows = [
        (1, "Qureshi", "Imran", "1970-01-01"),
        (2, "Vidal", "Michael", "1970-02-02"),
    ]
    headers = ["member_id", "last_name", "first_name", "date_of_birth"]
    spark_session.createDataFrame(rows, headers).createOrReplaceTempView(
        "patients")

    source_df: DataFrame = spark_session.table("patients")

    # Convert the string column to a real date so the mapper sees a typed input
    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd")
    )

    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for name, expr in sql_expressions.items():
        print(f"{name}: {expr}")

    # A date-typed column needs only an alias, not a parsing coalesce
    expected_birth_date = col("b.date_of_birth").alias("birthDate")
    assert str(sql_expressions["birthDate"]) == str(expected_birth_date)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    first_birth_date = result_df.where("member_id == 1").select(
        "birthDate").collect()[0][0]
    assert first_birth_date == date(1970, 1, 1)
    second_birth_date = result_df.where("member_id == 2").select(
        "birthDate").collect()[0][0]
    assert second_birth_date == date(1970, 2, 2)

    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_date_literal(spark_session: SparkSession) -> None:
    """
    Verifies that A.date() applied to a literal string builds the
    multi-format to_date coalesce expression and yields the same date
    value for every row.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(birthDate=A.date("1970-01-01"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # The literal is parsed by trying several date formats in order
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(lit("1970-01-01"), format="y-M-d"),
            to_date(lit("1970-01-01"), format="yyyyMMdd"),
            to_date(lit("1970-01-01"), format="M/d/y"),
        ).alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("birthDate").collect()[0][
        0
    ] == date(1970, 1, 1)
    assert result_df.where("member_id == 2").select("birthDate").collect()[0][
        0
    ] == date(1970, 1, 1)

    # Consistency with the sibling date tests: the mapped column must be
    # a true date type, not a string.
    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_date_column(spark_session: SparkSession) -> None:
    """
    Verifies that A.date() on a string column tries multiple input
    formats and parses both ISO and US-style dates.
    """
    # Arrange: one ISO date and one M/d/y date in the same column
    rows = [
        (1, "Qureshi", "Imran", "1970-01-01"),
        (2, "Vidal", "Michael", "12/31/2020"),
    ]
    headers = ["member_id", "last_name", "first_name", "date_of_birth"]
    spark_session.createDataFrame(rows, headers).createOrReplaceTempView(
        "patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for name, expr in sql_expressions.items():
        print(f"{name}: {expr}")

    # The string column is parsed by trying several formats in order
    expected_birth_date = coalesce(
        to_date(col("b.date_of_birth"), format='y-M-d'),
        to_date(col("b.date_of_birth"), format='yyyyMMdd'),
        to_date(col("b.date_of_birth"), format='M/d/y'),
    ).alias("birthDate")
    assert str(sql_expressions["birthDate"]) == str(expected_birth_date)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    first_value = result_df.where("member_id == 1").select(
        "birthDate").collect()[0][0]
    assert first_value == date(1970, 1, 1)
    second_value = result_df.where("member_id == 2").select(
        "birthDate").collect()[0][0]
    assert second_value == date(2020, 12, 31)

    assert dict(result_df.dtypes)["birthDate"] == "date"
# Beispiel #5
# 0
def test_auto_mapper_schema_pruning_with_extension(
    spark_session: SparkSession, ) -> None:
    """
    Verifies that, with schema pruning enabled, the mapped output schema
    matches the expected pruned structure for the nested extension list.
    """
    # Arrange
    clean_spark_session(spark_session)

    rows = [
        (1, "Qureshi", "Imran", 45),
        (2, "Vidal", "Michael", 35),
    ]
    headers = ["member_id", "last_name", "first_name", "my_age"]
    spark_session.createDataFrame(rows, headers).createOrReplaceTempView(
        "patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act: schema pruning on, no validations skipped
    extensions = AutoMapperList([
        MyProcessingStatusExtension(
            processing_status=A.text("foo"),
            request_id=A.text("bar"),
            date_processed=A.date("2021-01-01"),
        )
    ])
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        enable_schema_pruning=True,
        skip_schema_validation=[],
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=extensions,
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for name, expr in sql_expressions.items():
        print(f"{name}: {expr}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert: generated expressions carry the expected casts and aliases
    assert_compare_expressions(
        sql_expressions["name"],
        col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(
        sql_expressions["age"],
        col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show(truncate=False)

    name_value = result_df.where("member_id == 1").select(
        "name").collect()[0][0]
    assert name_value == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

    # Confirm schema: build the expected structure bottom-up
    inner_extension = StructType([
        StructField("url", StringType()),
        StructField("valueString", StringType()),
    ])
    outer_extension = StructType([
        StructField("url", StringType()),
        StructField("extension", ArrayType(inner_extension)),
    ])
    expected_schema: StructType = StructType([
        StructField("name", StringType(), False),
        StructField("age", LongType(), True),
        StructField("extension", ArrayType(outer_extension), True),
    ])

    result: SchemaComparerResult = SchemaComparer.compare_schema(
        parent_column_name=None,
        source_schema=result_df.schema,
        desired_schema=expected_schema,
    )

    assert result.errors == [], str(result)
def test_auto_mapper_fhir_patient_resource(
        spark_session: SparkSession) -> None:
    """
    Verifies mapping source rows into a FHIR Patient resource: id,
    resourceType, birthDate, name, and gender columns.
    """
    # Arrange: second row has no gender to exercise the if_not_null branch
    rows = [
        (1, "Qureshi", "Imran", "1970-01-01", "female"),
        (2, "Vidal", "Michael", "1970-02-02", None),
    ]
    headers = [
        "member_id", "last_name", "first_name", "date_of_birth", "my_gender"
    ]
    spark_session.createDataFrame(rows, headers).createOrReplaceTempView(
        "patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act: build the Patient resource mapping
    patient = Patient(
        id_=FhirId(A.column("member_id")),
        birthDate=A.date(A.column("date_of_birth")),
        name=FhirList([
            HumanName(use=NameUseCode("usual"),
                      family=A.column("last_name"))
        ]),
        gender=A.if_not_null(
            A.column("my_gender"),
            AdministrativeGenderCode(A.column("my_gender"))),
    )
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).complex(patient)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for name, expr in sql_expressions.items():
        print(f"{name}: {expr}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: five mapped columns in total
    assert len(sql_expressions) == 5

    # id is sanitized (non-FHIR-safe chars replaced) and truncated to 63 chars
    expected_id = substring(
        regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"), 0,
        63).cast("string").alias("id")
    assert str(sql_expressions["id"]) == str(expected_id)

    expected_resource_type = lit("Patient").cast("string").alias(
        "resourceType")
    assert str(sql_expressions["resourceType"]) == str(expected_resource_type)

    # birthDate parsing tries several formats in order
    expected_birth_date = coalesce(
        to_date(col("b.date_of_birth"), "y-M-d"),
        to_date(col("b.date_of_birth"), "yyyyMMdd"),
        to_date(col("b.date_of_birth"), "M/d/y"),
    ).cast("date").alias("birthDate")
    assert str(sql_expressions["birthDate"]) == str(expected_birth_date)

    # NOTE(review): the name/gender expression strings are not asserted
    # here; only their transformed values are checked below.

    result_df.printSchema()
    result_df.show()

    # Assert: each HumanName carries the constant use and the mapped family
    first_row = result_df.where("member_id == 1")
    assert first_row.selectExpr("name[0].use").collect()[0][0] == "usual"
    assert first_row.selectExpr("name[0].family").collect()[0][0] == "Qureshi"

    second_row = result_df.where("member_id == 2")
    assert second_row.selectExpr("name[0].use").collect()[0][0] == "usual"
    assert second_row.selectExpr("name[0].family").collect()[0][0] == "Vidal"