def test_framework_drop_duplicates_transformer( spark_session: SparkSession) -> None:
    """
    FrameworkDropDuplicatesTransformer should remove rows that duplicate the
    configured column(s) in the view, keeping one row per distinct key.
    Here: 3 loaded rows contain one duplicate NPI, so 2 rows remain.
    """
    # create a dataframe with the test data
    data_dir: Path = Path(__file__).parent.joinpath("./")
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)
    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)
    # ensure we have all the rows, including the duplicate we want dropped
    result_df: DataFrame = spark_session.table(view)
    assert 3 == result_df.count()
    # de-duplicate the view on the NPI column
    FrameworkDropDuplicatesTransformer(columns=["NPI"], view=view).transform(df)
    # assert the duplicate-NPI row was removed (3 -> 2)
    result_df = spark_session.table(view)
    assert 2 == result_df.count()
def test_framework_drop_rows_with_null_transformer( spark_session: SparkSession) -> None:
    """
    FrameworkDropRowsWithNullTransformer should remove every row that has a
    null in any of columns_to_check, and leave the view unchanged when no
    nulls remain.
    """
    # create a dataframe with the test data
    data_dir: Path = Path(__file__).parent.joinpath("./")
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)
    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)
    # ensure we have all the rows even the ones we want to drop
    result_df: DataFrame = spark_session.table(view)
    assert 7 == result_df.count()
    # drop the rows with null NPI or null Last Name
    FrameworkDropRowsWithNullTransformer(
        columns_to_check=["NPI", "Last Name"], view=view
    ).transform(df)
    # assert only the single row with both NPI and Last Name populated remains
    result_df = spark_session.table(view)
    assert 1 == result_df.count()
    # ensure that no rows are dropped when there are no null values
    # NOTE(review): `result_df` is passed here where `df` was passed above —
    # presumably the transformer operates on the named view and the dataframe
    # argument is just passed through; confirm against the transformer source.
    FrameworkDropRowsWithNullTransformer(
        columns_to_check=["NPI", "Last Name"], view=view
    ).transform(result_df)
    assert 1 == result_df.count()
def test_auto_mapper_datetime_column_default(spark_session: SparkSession) -> None:
    """A.unix_timestamp should convert epoch-seconds strings (column or literal)
    into timestamp-typed columns."""
    # Arrange: register a source view with one epoch-seconds string per row.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act: map both a column-based and a literal unix timestamp.
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        timestamp=A.unix_timestamp(A.column("ts")),
        literal_val=A.unix_timestamp("1609390500"),
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # The literal maps to a from_unixtime/to_timestamp round trip.
    expected_literal = to_timestamp(
        from_unixtime("1609390500", "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"
    ).alias("literal_val")
    assert str(sql_expressions["literal_val"]) == str(expected_literal)

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert result_df.count() == 2

    def cell(member_id: int, column: str):
        # First value of `column` for the given member.
        return (
            result_df.where(f"member_id == {member_id}").select(column).collect()[0][0]
        )

    assert cell(1, "timestamp") == datetime(1970, 1, 1, 5, 15, 22)
    assert cell(2, "timestamp") == datetime(2020, 12, 31, 4, 55, 0)
    assert cell(1, "literal_val") == datetime(2020, 12, 31, 4, 55, 0)
    assert cell(2, "literal_val") == datetime(2020, 12, 31, 4, 55, 0)

    dtypes = dict(result_df.dtypes)
    assert dtypes["timestamp"] == "timestamp"
    assert dtypes["literal_val"] == "timestamp"
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    """A.coalesce should pick the first non-null of the given columns/literals."""
    # Arrange: one row per coalesce branch we expect to win.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    expected = coalesce(
        col("b.last_name"),
        col("b.date_of_birth"),
        lit("last_resort").cast(StringType()),
    ).alias("my_column")
    assert_compare_expressions(sql_expressions["my_column"], expected)

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def my_column_for(member_id: int):
        return (
            result_df.where(f"member_id == {member_id}")
            .select("my_column")
            .collect()[0][0]
        )

    assert my_column_for(1) == "Qureshi"
    assert my_column_for(2) == "1970-02-02"
    assert my_column_for(3) == "last_resort"
def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    """
    A.if_not_null_or_empty should keep the column's value when it is neither
    null nor empty, and otherwise substitute the given default ("100").
    """
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "54"),
            (2, 'Vidal', 'Michael', ""),
            (3, 'Vidal3', 'Michael', None),
        ],
        ['member_id', 'last_name', 'first_name', "my_age"]
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    source_df.show()
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    # The mapping should compile to a null-or-empty CASE expression
    # (compared via exact string form of the Column expression).
    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType())
        ).otherwise(col("b.my_age")).alias("age")
    )
    result_df: DataFrame = mapper.transform(df=df)
    # Assert: "" and null both fall back to "100"; "54" passes through.
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select("age").collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == "100"
    assert result_df.where("member_id == 3").select("age").collect()[0][0] == "100"
    assert dict(result_df.dtypes)["age"] == "string"
def test_auto_mapper_regex_replace_unicode(spark_session: SparkSession) -> None:
    """regex_replace should substitute characters outside the allowed set
    (e.g. unusual unicode whitespace) with the replacement string."""
    # Arrange
    spark_session.createDataFrame(
        [
            (
                1,
                "MedStar NRN PMR at Good Samaritan Hosp Good Health Center",
                "Imran",
                "1970-01-01",
            ),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Anything outside this character class gets replaced.
    not_normal_characters: str = r"[^\w\r\n\t _.,!\"'/$-]"
    # source_df.select(regexp_extract('last_name', not_normal_characters, 1).alias('d')).show()

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.column("last_name").regex_replace(not_normal_characters, "."))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    expected = regexp_replace(col("b.last_name"), not_normal_characters, ".").alias(
        "my_column"
    )
    assert str(sql_expressions["my_column"]) == str(expected)

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    def my_column_for(member_id: int):
        return (
            result_df.where(f"member_id == {member_id}")
            .select("my_column")
            .collect()[0][0]
        )

    # noinspection SpellCheckingInspection
    assert my_column_for(1) == "MedStar NRN PMR at Good Samaritan Hosp.Good Health Center"
    # noinspection SpellCheckingInspection
    assert my_column_for(2) == "Vidal"
def test_create_temporary_view(
    self, target_df: DataFrame, spark_session: SparkSession
) -> None:
    """create_temporary_view should register the dataframe under the given
    name so it can be read back through the session unchanged."""
    # arrange
    client = SparkClient()

    # act: register the view, then read it back by name
    client.create_temporary_view(target_df, "temp_view")
    round_tripped = spark_session.table("temp_view")

    # assert: the round-tripped view matches the original dataframe
    assert_dataframe_equality(target_df, round_tripped)
def test_auto_mapper_fhir_group_resource(spark_session: SparkSession) -> None:
    """
    Smoke test: build a FHIR Group resource (practitioner-affiliation group)
    via AutoMapper.complex and verify the mapping compiles and transforms
    without error. There are no value assertions — printSchema/show are the
    only outputs.
    """
    spark_session.createDataFrame(
        [(1, "practitioner", "affiliated practitioner", 2)],
        ["practitioner_id", "type", "name", "affiliated_id"],
    ).createOrReplaceTempView("groups")
    source_df: DataFrame = spark_session.table("groups")
    df = source_df.select("practitioner_id")
    df.createOrReplaceTempView("view_group")
    mapper = AutoMapper(
        view="view_group", source_view="groups", keys=["practitioner_id"]
    ).complex(
        Group(
            # resource id derived from the practitioner id
            id_=FhirId(A.column("practitioner_id")),
            meta=Meta(source="http://medstarhealth.org/provider"),
            identifier=FhirList([
                Identifier(
                    value=A.column("practitioner_id"),
                    type_=CodeableConcept(coding=FhirList([
                        Coding(
                            system=IdentifierTypeCodesCode.codeset,
                            code=IdentifierTypeCodesCode(
                                A.text("PractitionerAffiliation")),
                        )
                    ])),
                    system="http://medstarhealth.org",
                )
            ]),
            type_=GroupTypeCodeValues.Practitioner,
            actual=True,
            name=A.text("Medstar Affiliated Practitioner"),
            # each member references the affiliated practitioner resource
            member=FhirList([
                GroupMember(entity=Reference(reference=FhirReference(
                    "Practitioner",
                    A.column("affiliated_id"),
                )),
                    # inactive=False,
                ),
            ]),
        )
    )
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=df)
    result_df.printSchema()
    result_df.show(truncate=False)
def test_auto_mapper_decimal(spark_session: SparkSession) -> None:
    """A.decimal should cast string values to decimal(precision, scale),
    rounding or zero-padding to the requested scale."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "123467.678"),
            (3, "Paul", "Kyle", "13"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.decimal(A.column("my_age"), 10, 2))

    print(mapper.to_debug_string())
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("decimal(10,2)").alias("age")
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def age_of(member_id: int):
        return (
            result_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
        )

    assert age_of(1) == Decimal("54.45")
    assert age_of(2) == Decimal("123467.68")  # rounded to 2 decimal places
    assert age_of(3) == Decimal("13.00")  # zero-padded to scale 2
    assert dict(result_df.dtypes)["age"] == "decimal(10,2)"
def test_auto_mapper_cast(spark_session: SparkSession) -> None:
    """.cast(...) on AutoMapper columns should translate AutoMapper data types
    into the corresponding Spark casts (string, long, array)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    # Add an empty array column and re-register the view so the mapper sees it.
    src_df = src_df.withColumn("an_array", array())
    src_df.createOrReplaceTempView("patients")
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.column("my_age").cast(AutoMapperNumberDataType),
            my_array=A.column("an_array").cast(
                AutoMapperList[AutoMapperNumberDataType]
            ),
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert: expressions compared via their exact string form.
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age")
    )

    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_multiple_columns_simpler_syntax(
    spark_session: SparkSession,
) -> None:
    """
    Exercises the shorthand .columns(...) syntax: based on the assertions
    below, a plain string maps to a literal value, a list maps to an array
    column, and a dict maps to a struct column.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act: chain .columns() calls, one per shorthand form
    mapper = (AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst1="src1").columns(dst2=["address1"]).columns(
        dst3=["address1", "address2"]).columns(
            dst4=[dict(use="usual", family="[last_name]")]))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=df)
    # Assert
    result_df.printSchema()
    result_df.show()
    # member_id + dst1..dst4
    assert len(result_df.columns) == 5
    # "src1" does not match a source column, so it comes through as a literal
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][1]
            == "address2")
    # "[last_name]" presumably denotes a source-column reference inside the
    # dict form — the struct's family field holds the row's last_name
    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [0] == "usual")
    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [1] == "Qureshi")
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    """A.hash should compile to Spark's hash() over the inputs, cast to string."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    src_df = src_df.withColumn("my_age", col("my_age").cast("int"))
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.hash(A.column("my_age"), A.column("last_name")))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # NOTE: `hash` here is presumably pyspark.sql.functions.hash (imported at
    # module level, shadowing the builtin) — it is called on Column objects.
    assert_compare_expressions(
        sql_expressions["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def age_of(member_id: int):
        return (
            result_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
        )

    assert age_of(1) == "-543157534"
    assert age_of(2) == "2048196121"
    assert age_of(3) == "-80001407"
    assert age_of(4) == "42"  # all-null inputs
    assert dict(result_df.dtypes)["age"] == "string"
def test_automapper_if_list(spark_session: SparkSession) -> None:
    """
    A.if_ with a list `check` should compile to an isin() CASE expression:
    when my_age is in ["54", "59"] the mapped value is the age itself (as a
    number), otherwise the numeric default 100.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Qureshi", "Imran", "59"),
            (3, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.if_(
        column=A.column("my_age"),
        check=["54", "59"],
        value=A.number(A.column("my_age")),
        else_=A.number(A.text("100")),
    ))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    # list `check` compiles to isin(); A.number to a cast to long
    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").isin(["54", "59"]),
             col("b.my_age").cast("long")).otherwise(
                 lit("100").cast(StringType()).cast(LongType())).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=df)
    # Assert: matching rows keep their age; null falls through to 100
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 59
    assert result_df.where("member_id == 3").select(
        "age").collect()[0][0] == 100
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_null_if_empty(spark_session: SparkSession) -> None:
    """to_null_if_empty should map empty strings to null and pass every other
    value (including null) through unchanged."""
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    src_df.show()
    members_df = src_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.column("my_age").to_null_if_empty())
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=src_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Expect a null-safe equality test against "" wrapped in a CASE expression.
    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").eqNullSafe(""), lit(None))
        .otherwise(col("b.my_age"))
        .alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def age_of(member_id: int):
        return (
            result_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
        )

    assert age_of(1) == "54"
    assert age_of(2) is None
    assert age_of(3) is None
    assert dict(result_df.dtypes)["age"] == "string"