# Consolidated spark_auto_mapper test snippets. The imports below are a
# best-effort reconstruction so the snippets run as one module: the pyspark,
# stdlib, and pytest imports are standard, and the spark_auto_mapper paths
# follow the library's documented layout. Test-suite helpers
# (clean_spark_session, assert_compare_expressions), the MyClass/MyObject
# fixture classes, and the FHIR types (Patient, Group, FhirId, FhirList,
# Reference, FhirReference, etc.) are assumed to come from the repo's own
# test fixtures and the spark_auto_mapper_fhir package.
from datetime import date, datetime
from decimal import Decimal
from os import environ
from pathlib import Path
from typing import Dict

from pyspark.sql import Column, DataFrame, SparkSession
# filter, transform, exists, and hash intentionally shadow Python builtins:
# these are the Spark SQL functions the snippets compare expressions against.
from pyspark.sql.functions import (array, coalesce, col, date_format, exists,
                                   filter, from_unixtime, hash, lit, lpad,
                                   struct, to_date, to_timestamp, transform,
                                   when)
from pyspark.sql.types import (ArrayType, IntegerType, LongType, StringType,
                               StructField, StructType)
from pytest import approx

from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.data_types.complex.complex_base import (
    AutoMapperDataTypeComplexBase, )
from spark_auto_mapper.data_types.list import AutoMapperList
from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral
from spark_auto_mapper.data_types.number import AutoMapperNumberDataType
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A


def test_auto_mapper_datetime_regex_replace_format(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1/13/1995"),
            (2, "1/3/1995"),
            (3, "11/3/1995"),
        ],
        ["member_id", "opening_date"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    # Act
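    # The pattern \b(\d)(?=/) matches a lone digit that is followed by a
    # slash, so replacing it with "0$1" zero-pads single-digit months/days
    # (e.g. "1/3/1995" -> "01/03/1995") before parsing with M/dd/yyyy.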
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(formatted_date=A.datetime(
                            value=A.regex_replace(A.column("opening_date"),
                                                  pattern=r"\b(\d)(?=/)",
                                                  replacement="0$1"),
                            formats=["M/dd/yyyy"],
                        ).to_date_format("yyyy-M-dd"))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=source_df)

    assert (result_df.where("member_id == 1").select(
        "formatted_date").collect()[0][0] == "1995-1-13")
    assert (result_df.where("member_id == 2").select(
        "formatted_date").collect()[0][0] == "1995-1-03")
    assert (result_df.where("member_id == 3").select(
        "formatted_date").collect()[0][0] == "1995-11-03")
def test_automapper_flatten(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)

    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"
    source_df = spark_session.createDataFrame([([[1], [2, 3, 4], [3, 5]], )],
                                              ["column"])
    source_df.createOrReplaceTempView(source_view_name)

    # Act
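    # A.flatten maps to Spark's flatten(), concatenating an array of arrays
    # into a single array in element order (duplicates such as 3 are kept).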
    mapper = AutoMapper(view=result_view_name,
                        source_view=source_view_name).columns(
                            column_flat=A.flatten(A.column("column")))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [
        1, 2, 3, 4, 3, 5
    ]
def test_automapper_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
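    # A.transform applies the inner expression to every element of the
    # identifier array, producing structs whose bar/bar2 fields are taken
    # from each element's value/system fields.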
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(age=A.transform(
            A.column("identifier"),
            A.complex(bar=A.field("value"), bar2=A.field("system")))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            "b.identifier", lambda x: struct(
                col("x[value]").alias("bar"),
                col("x[system]").alias("bar2"))).alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0][0][0] == "123"
def test_auto_mapper_handles_duplicates(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame([
        (1, 'Qureshi', 'Imran'),
        (2, 'Qureshi', 'Imran'),
        (3, 'Qureshi', 'Imran2'),
        (4, 'Vidal', 'Michael'),
    ], ['member_id', 'last_name', 'first_name'
        ]).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
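    # dst1 maps to the literal "src1", so rows 1 and 2 produce identical
    # output; the mapper de-duplicates them, which is why only 3 of the 4
    # input rows survive (see the count assertion below).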
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"
                              ]).columns(dst1="src1",
                                         dst2=A.column("last_name"),
                                         dst3=A.column("first_name"))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 3
def test_automapper_filter(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
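    # A.filter wraps Spark's higher-order filter() function, keeping only the
    # identifier entries whose "use" field equals "usual".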
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.filter(column=A.column("identifier"),
                     func=lambda x: x["use"] == lit("usual")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter("b.identifier",
               lambda x: x["use"] == lit("usual")).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)
def test_automapper_field(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
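    # select_one picks a single value out of the identifier array; the field
    # path "type.coding[0].code" drills into each element's nested structs,
    # yielding the scalar "PRN" asserted below.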
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]
    #            ) == str(col("b.identifier[0]").alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "PRN"
def test_auto_mapper_amount(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
            (3, "Alex", "Hearn", "1286782.17"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
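    # A.amount casts its input to Spark's double type (see the expression
    # assertions below), and a null literal stays null after the cast.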
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.amount(A.column("my_age")),
        null_col=A.amount(AutoMapperDataTypeLiteral(None)),
    )

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("double").alias("age")
    )

    assert_compare_expressions(
        sql_expressions["null_col"], lit(None).cast("double").alias("null_col")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert approx(
        result_df.where("member_id == 1").select("age", "null_col").collect()[0][:]
    ) == (approx(54.45), None)
    assert approx(
        result_df.where("member_id == 2").select("age", "null_col").collect()[0][:]
    ) == (approx(67.67), None)
    # Ensuring exact match in situations in which float arithmetic errors might occur
    assert (
        str(result_df.where("member_id == 3").select("age").collect()[0][0])
        == "1286782.17"
    )

    assert dict(result_df.dtypes)["age"] == "double"
    assert dict(result_df.dtypes)["null_col"] == "double"
def test_automapper_flatten_with_null(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)

    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"
    schema = StructType([
        StructField(
            "column",
            ArrayType(elementType=ArrayType(elementType=IntegerType())))
    ])
    source_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5], None], )], schema=schema)
    source_df.printSchema()
    source_df.createOrReplaceTempView(source_view_name)

    # Act
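    # Plain Spark flatten() returns NULL when any element of the outer array
    # is NULL; the assertion below shows A.flatten instead drops the null
    # inner array and flattens the rest.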
    mapper = AutoMapper(view=result_view_name,
                        source_view=source_view_name).columns(
                            column_flat=A.flatten(A.column("column")))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [
        1, 2, 3, 4, 3, 5
    ]
def test_automapper_complex_with_skip_if_null(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
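    # skip_if_columns_null_or_empty wraps every mapped column in a when()
    # guard on first_name (see the expression assertions below), so row 2,
    # whose first_name is empty, is dropped from the output entirely.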
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=True,
        skip_if_columns_null_or_empty=["first_name"],
    ).complex(
        MyClass(
            id_=A.column("member_id"),
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.last_name")).cast(
                StringType()).alias("name"))
    assert str(sql_expressions["age"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.my_age")).cast(
                LongType()).alias("age"))

    result_df.printSchema()

    result_df.show()

    assert result_df.count() == 1
    assert result_df.where("id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame([
        ('Qureshi', 'Imran'),
        ('Vidal', 'Michael'),
    ], ['last_name', 'first_name']).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]))

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]),
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst5").collect()[0][0][0][1] == "Imran"
def test_automapper_map(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
            (4, "Qureshi", "Imran", None),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
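    # A.map compiles the dictionary into a chain of when(...eqNullSafe...)
    # branches with "unknown" as the otherwise() default; eqNullSafe lets the
    # None key match actual NULLs (member 4 -> "Unspecified").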
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(has_kids=A.map(
                            A.column("has_kids"),
                            {
                                None: "Unspecified",
                                "Y": "Yes",
                                "N": "No"
                            },
                            "unknown",
                        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit(None)), lit("Unspecified")).when(
            col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes")).when(
                col("b.has_kids").eqNullSafe(lit("N")),
                lit("No")).otherwise(lit("unknown")).alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select(
        "has_kids").collect()[0][0] == "No"
    assert (result_df.where("member_id == 3").select("has_kids").collect()[0]
            [0] == "unknown")
    assert (result_df.where("member_id == 4").select("has_kids").collect()[0]
            [0] == "Unspecified")
def test_auto_mapper_datetime_column_default(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
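    # A.unix_timestamp converts epoch seconds to a timestamp. Assuming a UTC
    # session timezone: 18922 s -> 1970-01-01 05:15:22 and
    # 1609390500 s -> 2020-12-31 04:55:00, matching the assertions below.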
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        timestamp=A.unix_timestamp(A.column("ts")),
        literal_val=A.unix_timestamp("1609390500"),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["literal_val"]) == str(
        to_timestamp(
            from_unixtime("1609390500", "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"
        ).alias("literal_val")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 2

    assert result_df.where("member_id == 1").select("timestamp").collect()[0][
        0
    ] == datetime(1970, 1, 1, 5, 15, 22)
    assert result_df.where("member_id == 2").select("timestamp").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)

    assert result_df.where("member_id == 1").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)
    assert result_df.where("member_id == 2").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)

    assert dict(result_df.dtypes)["timestamp"] == "timestamp"
    assert dict(result_df.dtypes)["literal_val"] == "timestamp"
def test_auto_mapper_number(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) in (
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    )

    assert str(sql_expressions["null_field"]) == str(
        lit(None).cast("long").alias("null_field"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 67
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            131026061001)
    assert (
        result_df.where("member_id == 1").select("null_field").collect()[0][0]
        is None)

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_array_multiple_items_structs(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(
        [
            AutoMapperDataTypeComplexBase(a=A.column("first_name"),
                                          b=A.column("last_name")),
            AutoMapperDataTypeComplexBase(a=A.column("first_name"), b=None),
        ],
        include_null_properties=True,
    ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["dst2"]) == str(
    #     filter(
    #         array(lit("address1"), lit("address2")), lambda x: x.isNotNull()
    #     ).alias("dst2")
    # )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [0] == "Imran")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [1] == "Qureshi")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            [0] == "Michael")
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1]
        is None)
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
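    # A.coalesce returns the first non-null argument per row, falling back to
    # the literal "last_resort" when both source columns are NULL (row 3).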
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        coalesce(
            col("b.last_name"),
            col("b.date_of_birth"),
            lit("last_resort").cast(StringType()),
        ).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "1970-02-02"
    )
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "last_resort"
    )
def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "54"),
            (2, 'Vidal', 'Michael', ""),
            (3, 'Vidal3', 'Michael', None),
        ], ['member_id', 'last_name', 'first_name', "my_age"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
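    # if_not_null_or_empty(check, value_if_set, value_if_missing): rows whose
    # my_age is NULL or "" fall through to the literal "100" (see the
    # generated when()/otherwise() expression asserted below).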
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType())
        ).otherwise(col("b.my_age")).alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age"
                                                    ).collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age"
                                                    ).collect()[0][0] == "100"
    assert result_df.where("member_id == 3").select("age"
                                                    ).collect()[0][0] == "100"

    assert dict(result_df.dtypes)["age"] == "string"
def test_auto_mapper_cast(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn("an_array", array())
    source_df.createOrReplaceTempView("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
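    # .cast() with AutoMapper data types compiles to plain Spark casts: the
    # assertions below show name cast to string and age cast to long.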
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.column("my_age").cast(AutoMapperNumberDataType),
            my_array=A.column("an_array").cast(
                AutoMapperList[AutoMapperNumberDataType]
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(sql_expressions["age"]) == str(col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_fhir_group_resource(spark_session: SparkSession) -> None:
    spark_session.createDataFrame(
        [(1, "practitioner", "affiliated practitioner", 2)],
        ["practitioner_id", "type", "name", "affiliated_id"],
    ).createOrReplaceTempView("groups")

    source_df: DataFrame = spark_session.table("groups")

    df = source_df.select("practitioner_id")
    df.createOrReplaceTempView("view_group")

    mapper = AutoMapper(
        view="view_group", source_view="groups",
        keys=["practitioner_id"]).complex(
            Group(
                id_=FhirId(A.column("practitioner_id")),
                meta=Meta(source="http://medstarhealth.org/provider"),
                identifier=FhirList([
                    Identifier(
                        value=A.column("practitioner_id"),
                        type_=CodeableConcept(coding=FhirList([
                            Coding(
                                system=IdentifierTypeCodesCode.codeset,
                                code=IdentifierTypeCodesCode(
                                    A.text("PractitionerAffiliation")),
                            )
                        ])),
                        system="http://medstarhealth.org",
                    )
                ]),
                type_=GroupTypeCodeValues.Practitioner,
                actual=True,
                name=A.text("Medstar Affiliated Practitioner"),
                member=FhirList([
                    GroupMember(entity=Reference(reference=FhirReference(
                        "Practitioner",
                        A.column("affiliated_id"),
                    )),
                                # inactive=False,
                                ),
                ]),
            ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    result_df.printSchema()
    result_df.show(truncate=False)
def test_auto_mapper_decimal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "123467.678"),
            (3, "Paul", "Kyle", "13"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
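    # A.decimal casts to decimal(10,2); Spark rounds half-up on the cast, so
    # "123467.678" becomes 123467.68 and "13" gains trailing zeros (13.00).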
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.decimal(A.column("my_age"), 10, 2))

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("decimal(10,2)").alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == Decimal(
        "54.45"
    )
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == Decimal(
        "123467.68"
    )
    assert result_df.where("member_id == 3").select("age").collect()[0][0] == Decimal(
        "13.00"
    )

    assert dict(result_df.dtypes)["age"] == "decimal(10,2)"
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn("my_age", col("my_age").cast("int"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
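    # A.hash wraps Spark's Murmur3 hash(), which starts from seed 42 and
    # skips NULL inputs -- hence the all-NULL row 4 hashes to "42" below.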
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"
              ]).columns(age=A.hash(A.column("my_age"), A.column("last_name")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("age").collect()[0][0] ==
            "-543157534")
    assert (result_df.where("member_id == 2").select("age").collect()[0][0] ==
            "2048196121")
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            "-80001407")
    assert result_df.where("member_id == 4").select(
        "age").collect()[0][0] == "42"

    assert dict(result_df.dtypes)["age"] == "string"
def test_auto_mapper_date_format(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01 12:30"),
            (2, "Vidal", "Michael", "1970-02-02 06:30"),
        ],
        ["member_id", "last_name", "first_name", "opening_time"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn(
        "opening_time", to_timestamp("opening_time",
                                     format="yyyy-MM-dd hh:mm"))

    assert dict(source_df.dtypes)["opening_time"] == "timestamp"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
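    # Note: "hh" is the 12-hour-clock pattern, so 12:30 formats as "12:30:00";
    # the output of to_date_format is a plain string, not a timestamp.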
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"]).columns(openingTime=A.datetime(
            A.column("opening_time")).to_date_format("hh:mm:ss"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["openingTime"],
        date_format(coalesce(to_timestamp(col("b.opening_time"))),
                    "hh:mm:ss").alias("openingTime"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("openingTime").collect()
            [0][0] == "12:30:00")
    assert (result_df.where("member_id == 2").select("openingTime").collect()
            [0][0] == "06:30:00")

    # check type
    assert dict(result_df.dtypes)["openingTime"] == "string"
def test_auto_mapper_boolean(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "0"),
            (2, "Vidal", "Michael", "1"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
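    # Spark's string-to-boolean cast accepts "0"/"1" and "true"/"false"
    # (case-insensitively), so "0" -> False, "1" -> True, "False" -> False.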
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(
                            age=A.boolean(A.column("my_age")),
                            is_active=A.boolean("False"),
                        )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("boolean").alias("age"))
    assert_compare_expressions(sql_expressions["is_active"],
                               lit("False").cast("boolean").alias("is_active"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age",
        "is_active",
    ).collect()[0][:] == (False, False)
    assert result_df.where("member_id == 2").select(
        "age",
        "is_active",
    ).collect()[0][:] == (True, False)

    assert dict(result_df.dtypes)["age"] == "boolean"
    assert dict(result_df.dtypes)["is_active"] == "boolean"
def test_auto_mapper_multiple_columns(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame([
        (1, 'Qureshi', 'Imran'),
        (2, 'Vidal', 'Michael'),
    ], ['member_id', 'last_name', 'first_name'
        ]).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False).columns(dst1="src1").columns(
            dst2=AutoMapperList(["address1"])).columns(dst3=AutoMapperList(
                ["address1", "address2"])).columns(dst4=AutoMapperList(
                    [A.complex(use="usual", family=A.column("last_name"))]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
def test_automapper_nested_array_filter_simple_with_array(
        spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    environ["LOGLEVEL"] = "DEBUG"

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
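    # nested_array_filter keeps only the array1 entries whose inner array2
    # contains an element with reference == "bar"; it compiles to Spark's
    # filter() over exists(), as asserted below.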
    mapper = AutoMapper(
        view="members",
        source_view="patients").columns(age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"], lambda x: x["reference"] == lit("bar").cast(
                    "string")),
        ).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select(
        "age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
def test_auto_mapper_fhir_reference(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi"),
            (2, "Vidal"),
        ],
        ["member_id", "last_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
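    # FhirReference renders as the usual "ResourceType/id" string, e.g.
    # "Organization/Qureshi", as the assertions below confirm.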
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        patient=Patient(
            id_=FhirId(A.column("last_name")),
            managingOrganization=Reference(
                reference=FhirReference("Organization", A.column("last_name"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    assert (
        result_df.where("member_id == 1")
        .selectExpr("patient.managingOrganization.reference")
        .collect()[0][0]
        == "Organization/Qureshi"
    )
    assert (
        result_df.where("member_id == 2")
        .selectExpr("patient.managingOrganization.reference")
        .collect()[0][0]
        == "Organization/Vidal"
    )
def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd")
    )

    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["birthDate"]) == str(
        col("b.date_of_birth").alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("birthDate").collect()[0][
        0
    ] == date(1970, 1, 1)
    assert result_df.where("member_id == 2").select("birthDate").collect()[0][
        0
    ] == date(1970, 2, 2)

    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_full_no_views(spark_session: SparkSession) -> None:
    # Arrange
    source_df = spark_session.createDataFrame([
        (1, 'Qureshi', 'Imran'),
        (2, 'Vidal', 'Michael'),
    ], ['member_id', 'last_name', 'first_name'])

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(keys=["member_id"], drop_key_columns=False).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]))

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_fhir_plan_definition(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        PlanDefinition(
            id_=FhirId(A.column("member_id")), status=PublicationStatusCodeValues.Active
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    result_df.printSchema()
    result_df.show()
def test_automapper_if_not(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=[
                            "member_id"
                        ]).columns(age=A.if_not(column=A.column("my_age"),
                                                check=A.text("55"),
                                                value=A.number(A.text("100"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").eqNullSafe(lit("55").cast(StringType())),
            lit(None)).otherwise(
                lit("100").cast(StringType()).cast(LongType())).alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 1
    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_lpad(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1234"),
            (2, "1234567"),
            (3, "123456789"),
        ],
        ["member_id", "empi"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
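    # A.lpad left-pads empi with "0" to a width of 9; values already 9
    # characters long are returned unchanged.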
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0"))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert str(sql_expressions["my_column"]) == str(
        lpad(col=col("b.empi"), len=9, pad="0").alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "000001234"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "001234567"
    )

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "123456789"
    )