Example #1
def test_framework_drop_duplicates_transformer(
        spark_session: SparkSession) -> None:
    # directory containing the test CSV file
    data_dir: Path = Path(__file__).parent

    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)

    # ensure we have all the rows even the ones we want to drop
    result_df: DataFrame = spark_session.table(view)
    assert 3 == result_df.count()

    # drop duplicate rows, keeping one row per NPI
    FrameworkDropDuplicatesTransformer(columns=["NPI"],
                                       view=view).transform(df)

    # assert only one row per distinct NPI remains
    result_df = spark_session.table(view)
    assert 2 == result_df.count()
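
For reference, a minimal plain-PySpark sketch of the same dedup step (an assumption: FrameworkDropDuplicatesTransformer is treated here as if it behaves like DataFrame.dropDuplicates on the listed columns):

    # sketch: dropDuplicates keeps one row per distinct NPI, which is the
    # count the transformer test asserts above
    deduped_df: DataFrame = spark_session.table(view).dropDuplicates(["NPI"])
    deduped_df.createOrReplaceTempView(view)
    assert 2 == deduped_df.count()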
Example #2
def test_framework_drop_rows_with_null_transformer(
        spark_session: SparkSession) -> None:
    # directory containing the test CSV file
    data_dir: Path = Path(__file__).parent

    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)

    # ensure we have all the rows even the ones we want to drop
    result_df: DataFrame = spark_session.table(view)
    assert 7 == result_df.count()

    # drop the rows with null NPI or null Last Name
    FrameworkDropRowsWithNullTransformer(columns_to_check=["NPI", "Last Name"],
                                         view=view).transform(df)

    # assert we get only the row with both NPI and Last Name populated
    result_df = spark_session.table(view)
    assert 1 == result_df.count()

    # ensure that no rows are dropped when there are no null values
    FrameworkDropRowsWithNullTransformer(columns_to_check=["NPI", "Last Name"],
                                         view=view).transform(result_df)
    assert 1 == result_df.count()
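
The null-drop step presumably corresponds to DataFrame.na.drop with a subset (a sketch of the assumed equivalent, not the transformer's actual implementation):

    # sketch: how="any" removes rows where any of the listed columns is
    # null, matching the count asserted above
    cleaned_df: DataFrame = spark_session.table(view).na.drop(
        how="any", subset=["NPI", "Last Name"]
    )
    assert 1 == cleaned_df.count()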
Example #3
def test_auto_mapper_datetime_column_default(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        timestamp=A.unix_timestamp(A.column("ts")),
        literal_val=A.unix_timestamp("1609390500"),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["literal_val"]) == str(
        to_timestamp(
            from_unixtime("1609390500", "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"
        ).alias("literal_val")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 2

    assert result_df.where("member_id == 1").select("timestamp").collect()[0][
        0
    ] == datetime(1970, 1, 1, 5, 15, 22)
    assert result_df.where("member_id == 2").select("timestamp").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)

    assert result_df.where("member_id == 1").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)
    assert result_df.where("member_id == 2").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)

    assert dict(result_df.dtypes)["timestamp"] == "timestamp"
    assert dict(result_df.dtypes)["literal_val"] == "timestamp"
Example #4
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        coalesce(
            col("b.last_name"),
            col("b.date_of_birth"),
            lit("last_resort").cast(StringType()),
        ).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "1970-02-02"
    )
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "last_resort"
    )
Example #5
def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "54"),
            (2, 'Vidal', 'Michael', ""),
            (3, 'Vidal3', 'Michael', None),
        ], ['member_id', 'last_name', 'first_name', "my_age"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType())
        ).otherwise(col("b.my_age")).alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age"
                                                    ).collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age"
                                                    ).collect()[0][0] == "100"
    assert result_df.where("member_id == 3").select("age"
                                                    ).collect()[0][0] == "100"

    assert dict(result_df.dtypes)["age"] == "string"
Example #6
def test_auto_mapper_regex_replace_unicode(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (
                1,
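                # the string below contains a non-ASCII separator between
                # "Hosp" and "Good" (a plain space is in the regex allow-list
                # and would not be replaced); it may not survive copy/paste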
                "MedStar NRN PMR at Good Samaritan Hosp Good Health Center",
                "Imran",
                "1970-01-01",
            ),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    not_normal_characters: str = r"[^\w\r\n\t _.,!\"'/$-]"

    # source_df.select(regexp_extract('last_name', not_normal_characters, 1).alias('d')).show()

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.column("last_name").regex_replace(not_normal_characters, "."))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["my_column"]) == str(
        regexp_replace(col("b.last_name"), not_normal_characters, ".").alias(
            "my_column"
        )
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "MedStar NRN PMR at Good Samaritan Hosp.Good Health Center"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0] == "Vidal"
    )
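
A standalone check of the same pattern (a sketch; "\u00ae", the registered sign, is an arbitrary stand-in for any disallowed character):

    # sketch: anything outside \w, \r, \n, \t, space and _.,!"'/$- is
    # replaced, so the registered-sign character becomes "."
    from pyspark.sql.functions import lit, regexp_replace

    demo_df = spark_session.range(1).select(
        regexp_replace(lit("Hosp\u00aeGood"), not_normal_characters, ".").alias("x")
    )
    assert demo_df.collect()[0][0] == "Hosp.Good"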
Example #7
    def test_create_temporary_view(self, target_df: DataFrame,
                                   spark_session: SparkSession) -> None:
        # arrange
        spark_client = SparkClient()

        # act
        spark_client.create_temporary_view(target_df, "temp_view")
        result_df = spark_session.table("temp_view")

        # assert
        assert_dataframe_equality(target_df, result_df)
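
SparkClient.create_temporary_view is presumably a thin wrapper over Spark's own temp-view registration (a sketch of the assumed behavior, not the library's code):

    # sketch: assumes the client delegates to createOrReplaceTempView
    target_df.createOrReplaceTempView("temp_view")
    assert_dataframe_equality(target_df, spark_session.table("temp_view"))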
Example #8
def test_auto_mapper_fhir_group_resource(spark_session: SparkSession) -> None:
    spark_session.createDataFrame(
        [(1, "practitioner", "affiliated practitioner", 2)],
        ["practitioner_id", "type", "name", "affiliated_id"],
    ).createOrReplaceTempView("groups")

    source_df: DataFrame = spark_session.table("groups")

    df = source_df.select("practitioner_id")
    df.createOrReplaceTempView("view_group")

    mapper = AutoMapper(
        view="view_group", source_view="groups",
        keys=["practitioner_id"]).complex(
            Group(
                id_=FhirId(A.column("practitioner_id")),
                meta=Meta(source="http://medstarhealth.org/provider"),
                identifier=FhirList([
                    Identifier(
                        value=A.column("practitioner_id"),
                        type_=CodeableConcept(coding=FhirList([
                            Coding(
                                system=IdentifierTypeCodesCode.codeset,
                                code=IdentifierTypeCodesCode(
                                    A.text("PractitionerAffiliation")),
                            )
                        ])),
                        system="http://medstarhealth.org",
                    )
                ]),
                type_=GroupTypeCodeValues.Practitioner,
                actual=True,
                name=A.text("Medstar Affiliated Practitioner"),
                member=FhirList([
                    GroupMember(
                        entity=Reference(
                            reference=FhirReference(
                                "Practitioner",
                                A.column("affiliated_id"),
                            )
                        ),
                        # inactive=False,
                    ),
                ]),
            ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    result_df.printSchema()
    result_df.show(truncate=False)
Example #9
def test_auto_mapper_decimal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "123467.678"),
            (3, "Paul", "Kyle", "13"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.decimal(A.column("my_age"), 10, 2))

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("decimal(10,2)").alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == Decimal(
        "54.45"
    )
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == Decimal(
        "123467.68"
    )
    assert result_df.where("member_id == 3").select("age").collect()[0][0] == Decimal(
        "13.00"
    )

    assert dict(result_df.dtypes)["age"] == "decimal(10,2)"
Example #10
def test_auto_mapper_cast(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn("an_array", array())
    source_df.createOrReplaceTempView("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.column("my_age").cast(AutoMapperNumberDataType),
            my_array=A.column("an_array").cast(
                AutoMapperList[AutoMapperNumberDataType]
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(sql_expressions["age"]) == str(col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
Example #11
def test_auto_mapper_multiple_columns_simpler_syntax(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
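    # in the simpler syntax, bare strings become literal values while
    # "[bracketed]" names reference source columns (see the asserts below)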
    mapper = (AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst1="src1").columns(dst2=["address1"]).columns(
        dst3=["address1", "address2"]).columns(
            dst4=[dict(use="usual", family="[last_name]")]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")

    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][1]
            == "address2")

    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [0] == "usual")
    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [1] == "Qureshi")
Example #12
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn("my_age", col("my_age").cast("int"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.hash(A.column("my_age"), A.column("last_name")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("age").collect()[0][0] ==
            "-543157534")
    assert (result_df.where("member_id == 2").select("age").collect()[0][0] ==
            "2048196121")
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            "-80001407")
    assert result_df.where("member_id == 4").select(
        "age").collect()[0][0] == "42"

    assert dict(result_df.dtypes)["age"] == "string"
Example #13
def test_automapper_if_list(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Qureshi", "Imran", "59"),
            (3, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(age=A.if_(
                            column=A.column("my_age"),
                            check=["54", "59"],
                            value=A.number(A.column("my_age")),
                            else_=A.number(A.text("100")),
                        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").isin(["54", "59"]),
             col("b.my_age").cast("long")).otherwise(
                 lit("100").cast(StringType()).cast(LongType())).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 59
    assert result_df.where("member_id == 3").select(
        "age").collect()[0][0] == 100
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
Example #14
def test_automapper_null_if_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.column("my_age").to_null_if_empty())

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").eqNullSafe(""),
             lit(None)).otherwise(col("b.my_age")).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] is None
    assert result_df.where("member_id == 3").select(
        "age").collect()[0][0] is None

    assert dict(result_df.dtypes)["age"] == "string"