def test_create_instance(mock_datetime, patient_mapping, fhir_concept_map_identifier):
    """create_instance builds a complete FHIR Patient dict from one row.

    Checks that meta (lastUpdated + source/resource tags), the mapped
    attributes, the static input and the resourceType are all filled.
    """
    # Freeze datetime.now() so that meta.lastUpdated is predictable
    # (mockdatetime presumably renders as "now" -- see expectation below).
    mock_datetime.now.return_value = mockdatetime()
    resource_mapping = patient_mapping

    attr_identifier = Attribute("identifier[0].value", columns=[SqlColumn("a", "b")])
    attr_birthDate = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    attr_maritalStatus = Attribute("maritalStatus.coding[0].code", columns=[SqlColumn("a", "d")])
    # This attribute has no source column, only a static input.
    attr_generalPractitioner = Attribute("generalPractitioner[0].type", static_inputs=["Practitioner"])
    attributes = [attr_identifier, attr_birthDate, attr_maritalStatus, attr_generalPractitioner]

    # Note: no row value for attr_generalPractitioner.
    row = {
        attr_maritalStatus: "D",
        attr_birthDate: "2000-10-10",
        attr_identifier: "A",
    }

    actual = transform.create_instance(row, resource_mapping, attributes)

    assert actual == {
        "meta": {
            "lastUpdated": "now",
            "tag": [
                {
                    "system": ARKHN_CODE_SYSTEMS.source,
                    "code": patient_mapping["source"]["id"]
                },
                {
                    "system": ARKHN_CODE_SYSTEMS.resource,
                    "code": patient_mapping["id"]
                },
            ],
        },
        # The id is produced by create_instance itself: compare it to
        # itself to exclude it from the check.
        "id": actual["id"],
        "identifier": [{
            "value": "A"
        }],
        "resourceType": "Patient",
        "birthDate": "2000-10-10",
        "maritalStatus": {
            "coding": [{
                "code": "D"
            }]
        },
        "generalPractitioner": [{
            "type": "Practitioner"
        }],
    }
def test_build_squash_rules(): cols = [ "ADMISSIONS.LANGUAGE", "PATIENTS.DOD", "PATIENTS.SUBJECT_ID", ] # NOTE: I use a list instead of a set to keep the order of elements joins = {SqlJoin(SqlColumn("PATIENTS", "SUBJECT_ID"), SqlColumn("ADMISSIONS", "SUBJECT_ID"))} table = "PATIENTS" actual = mapping.build_squash_rules(cols, joins, table) assert actual == ["PATIENTS", [["ADMISSIONS", []]]]
def test_apply_joins():
    """Each SqlJoin must turn into one outer join on the base query."""
    extractor = Extractor(None)
    joins = [
        SqlJoin(SqlColumn("patients", "subject_id"), SqlColumn("admissions", "subject_id")),
        SqlJoin(SqlColumn("admissions", "row_id"), SqlColumn("prescriptions", "row_id")),
    ]
    base_query = mock.MagicMock()

    extractor.apply_joins(base_query, joins)

    expected_tables = [tables["admissions"], tables["prescriptions"]]
    expected_conditions = [
        mock_get_column("", SqlColumn("patients", "subject_id"))
        == mock_get_column("", SqlColumn("admissions", "subject_id")),
        mock_get_column("", SqlColumn("admissions", "row_id"))
        == mock_get_column("", SqlColumn("prescriptions", "row_id")),
    ]

    recorded_calls = base_query.join.call_args_list
    for join_call, joined_table, condition in zip(recorded_calls, expected_tables,
                                                  expected_conditions):
        positional, keywords = join_call
        assert positional[0] == joined_table
        # compare() does a structural comparison of sqlalchemy expressions.
        assert positional[1].compare(condition)
        assert keywords == {"isouter": True}
def get_column(self, column: SqlColumn) -> AlchemyColumn:
    """Return the sqlalchemy column matching the custom SqlColumn.

    The column is labelled explicitly with its dataframe name so that
    identically-named columns from different tables don't collide, and
    sqlAlchemy's automatic labelling is bypassed.
    """
    sqlalchemy_table = self.get_table(column)
    alchemy_column = sqlalchemy_table.c[column.column]
    return alchemy_column.label(column.dataframe_column_name())
def test_fetch_values_from_dataframe():
    """The value fetched for an attribute is the one stored under its key."""
    attr_identifier = Attribute("identifier[0].value", columns=[SqlColumn("a", "b")])
    attr_birthDate = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    attr_maritalStatus = Attribute("maritalStatus.coding[0].code", columns=[SqlColumn("a", "d")])

    row = {
        attr_identifier: "A",
        attr_birthDate: "2000-10-10",
        attr_maritalStatus: "D",
    }

    fetched = transform.fetch_values_from_dataframe(row, attr_birthDate)
    assert fetched == "2000-10-10"
def test_apply_filters():
    """apply_filters adds the PK restriction plus one filter per mapping entry."""
    extractor = Extractor(None)
    resource_mapping = {
        "filters": [
            {
                "relation": "LIKE",
                "value": "'2150-08-29'",
                "sqlColumn": {
                    "owner": None,
                    "table": "admissions",
                    "column": "admittime"
                },
            },
            {
                "relation": "<=",
                "value": "1000",
                "sqlColumn": {
                    "owner": None,
                    "table": "patients",
                    "column": "row_id"
                },
            },
        ]
    }
    pk_column = SqlColumn("patients", "subject_id")
    pk_values = [123, 456]
    base_query = mock.MagicMock()

    extractor.apply_filters(base_query, resource_mapping, pk_column, pk_values)

    # Expected expressions in application order: the primary-key IN
    # restriction first, then the two mapping filters.
    binary_expressions = [
        extractor.get_column(SqlColumn("patients", "subject_id")).in_(pk_values),
        extractor.get_column(SqlColumn("admissions", "admittime")).like("'2150-08-29'"),
        extractor.get_column(SqlColumn("patients", "row_id")) <= "1000",
    ]
    for call, binary_expression in zip(base_query.filter.call_args_list, binary_expressions):
        args, kwargs = call
        # compare() does a structural comparison of sqlalchemy expressions.
        assert args[0].compare(binary_expression)
def test_squash_rows():
    """squash_rows collapses joined-table values into tuples per primary row.

    Rows sharing the same PATIENTS values are merged, and the ADMISSIONS
    columns of the merged rows are grouped into tuples.
    """
    attr_name = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    attr_id = Attribute("id", columns=[SqlColumn("PATIENTS", "ID")])
    attr_language = Attribute("language", columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    attr_admid = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    # Column index levels: (attribute, (dataframe column name, table)).
    df_columns = pd.MultiIndex.from_tuples([
        (attr_name, ("PATIENTS_NAME", "PATIENTS")),
        (attr_id, ("PATIENTS_ID", "PATIENTS")),
        (attr_language, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (attr_admid, ("ADMISSIONS_ID", "ADMISSIONS")),
    ])
    # The first two rows share the same PATIENTS values (bob / id1) and
    # should be squashed together.
    df = pd.DataFrame(
        {
            df_columns[0]: ["bob", "bob", "alice", "bob"],
            df_columns[1]: ["id1", "id1", "id2", "id3"],
            df_columns[2]: ["lang1", "lang2", "lang3", "lang4"],
            df_columns[3]: ["id1", "id2", "id3", "id4"],
        },
    )
    squash_rules = ["PATIENTS", [["ADMISSIONS", []]]]

    actual = transform.squash_rows(df, squash_rules)
    # Sort to be sure actual and expected are in the same order
    actual = actual.sort_values(by=df_columns[1]).reset_index(drop=True)

    expected = pd.DataFrame(
        {
            df_columns[0]: ["bob", "alice", "bob"],
            df_columns[1]: ["id1", "id2", "id3"],
            df_columns[2]: [("lang1", "lang2"), ("lang3", ), ("lang4", )],
            df_columns[3]: [("id1", "id2"), ("id3", ), ("id4", )],
        },
    )
    # Sort to be sure actual and expected are in the same order
    expected = expected.sort_values(by=df_columns[1]).reset_index(drop=True)

    assert actual.equals(expected)
def test_get_primary_key():
    """get_primary_key builds the PK SqlColumn and rejects incomplete mappings."""
    analyzer = Analyzer()

    # A non-empty owner is carried over to the SqlColumn.
    analyzer.get_primary_key({
        "primaryKeyOwner": "owner",
        "primaryKeyTable": "table",
        "primaryKeyColumn": "col",
    })
    assert analyzer.analysis.primary_key_column == SqlColumn(
        "table", "col", "owner")

    # An empty owner yields a column without owner.
    analyzer.get_primary_key({
        "primaryKeyOwner": "",
        "primaryKeyTable": "table",
        "primaryKeyColumn": "col",
    })
    assert analyzer.analysis.primary_key_column == SqlColumn("table", "col")

    # A missing primary-key table is a mapping error.
    incomplete_mapping = {
        "primaryKeyOwner": "",
        "primaryKeyTable": "",
        "primaryKeyColumn": "col",
        "definitionId": "fhirtype",
    }
    with pytest.raises(
            ValueError,
            match=
            "You need to provide a primary key table and column in the mapping"
    ):
        analyzer.get_primary_key(incomplete_mapping)
def test_handle_array_attributes():
    """Array attributes fan out; scalar attributes are repeated per item."""
    attr1 = Attribute("attr1", columns=[SqlColumn("a", "b")])
    attr2 = Attribute("attr2", columns=[SqlColumn("a", "c")])
    attributes_in_array = {
        "path1": attr1,
        "path2": attr2,
    }

    # attr1 carries three values, attr2 a single one: the scalar is
    # duplicated alongside each array element.
    row = {
        attr1: ("A1", "A2", "A3"),
        attr2: "B",
    }
    value = transform.handle_array_attributes(attributes_in_array, row)
    expected = [{"path1": item, "path2": "B"} for item in ("A1", "A2", "A3")]
    assert value == expected

    # Two arrays of different lengths cannot be zipped together.
    row = {
        attr1: ("A1", "A2", "A3"),
        attr2: ("B1", "B2"),
    }
    with raises(AssertionError, match="mismatch in array lengths"):
        transform.handle_array_attributes(attributes_in_array, row)
def apply_filters(self, query: Query, resource_mapping, pk_column: SqlColumn,
                  pk_values) -> Query:
    """Augment the sql alchemy query with filters from the analysis.

    Args:
        query: the base sqlalchemy query to restrict.
        resource_mapping: mapping dict; its optional "filters" entry is a
            list of {"relation", "value", "sqlColumn"} dicts.
        pk_column: the resource's primary-key column.
        pk_values: if not None, restrict the query to these PK values.

    Returns:
        The query with all filter clauses applied.
    """
    if pk_values is not None:
        query = query.filter(self.get_column(pk_column).in_(pk_values))

    # .get(): a mapping without a "filters" entry (or with an empty/None
    # one) simply adds no extra clause instead of raising a KeyError.
    for filter_ in resource_mapping.get("filters") or []:
        col = self.get_column(
            SqlColumn(
                filter_["sqlColumn"]["table"],
                filter_["sqlColumn"]["column"],
                filter_["sqlColumn"]["owner"],
            ))
        # Map the mapping's relation (e.g. "LIKE", "<=") to the
        # corresponding sqlalchemy column method.
        rel_method = SQL_RELATIONS_TO_METHOD[filter_["relation"]]
        query = query.filter(getattr(col, rel_method)(filter_["value"]))
    return query
def test_merge_dataframe(_):
    """merge_dataframe applies merging scripts and flattens the column index.

    attr_id has two source columns plus a static input merged by the
    "merge" script; the other attributes keep their single column as-is.
    """
    attr_name = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    attr_id = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"), SqlColumn("PATIENTS", "ID2")],
        static_inputs=["unknown"],
        merging_script=MergingScript("merge"),
    )
    attr_language = Attribute("language", columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    attr_admid = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    # Column index levels: (attribute, (dataframe column name, table));
    # the last entry is the primary-key column labelled "pk".
    df_columns = pd.MultiIndex.from_tuples([
        (attr_name, ("PATIENTS_NAME", "PATIENTS")),
        (attr_id, ("PATIENTS_ID", "PATIENTS")),
        (attr_id, ("PATIENTS_ID2", "PATIENTS")),
        (attr_language, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (attr_admid, ("ADMISSIONS_ID", "ADMISSIONS")),
        ("pk", ("PATIENTS_ID", "PATIENTS")),
    ])
    df = pd.DataFrame(
        {
            df_columns[0]: ["bob", "bob", "alice", "bob"],
            df_columns[1]: ["id1", "id1", "id2", "id3"],
            df_columns[2]: ["id21", "id21", "id22", "id23"],
            df_columns[3]: ["lang1", "lang2", "lang3", "lang4"],
            df_columns[4]: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
            df_columns[5]: ["id1", "id2", "id3", "id4"],
        },
    )
    attributes = [attr_name, attr_id, attr_language, attr_admid]
    primary_key_column = SqlColumn("PATIENTS", "ID")

    actual = transform.merge_dataframe(df, attributes, primary_key_column)

    expected = pd.DataFrame(
        {
            attr_name: ["bob", "bob", "alice", "bob"],
            # The mocked "merge" script appears to concatenate its column
            # inputs with a "merge" suffix -- verify against the fixture.
            attr_id: ["id1id21merge", "id1id21merge", "id2id22merge", "id3id23merge"],
            attr_language: ["lang1", "lang2", "lang3", "lang4"],
            attr_admid: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
        },
    )
    assert actual.equals(expected)
def test_analyze_mapping(patient_mapping, fhir_concept_map_gender, fhir_concept_map_identifier):
    """analyze_mapping extracts attributes, columns and joins from a mapping."""
    analyzer = Analyzer()
    analyzer.analyze_mapping(patient_mapping)

    # One Attribute per mapped FHIR path, in mapping order.
    assert analyzer.analysis.attributes == [
        Attribute(
            "identifier[0].value",
            columns=[SqlColumn("patients", "row_id")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "deceasedBoolean",
            columns=[SqlColumn("patients", "expire_flag")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "generalPractitioner[0].identifier.value",
            columns=[SqlColumn("icustays", "hadm_id")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "birthDate",
            columns=[SqlColumn("patients", "dob")],
            static_inputs=[],
            merging_script=None,
        ),
        Attribute(
            "deceasedDateTime",
            columns=[SqlColumn("patients", "dod")],
            static_inputs=[],
            merging_script=None,
        ),
        # gender combines a source column and a static fallback through a
        # merging script.
        Attribute(
            "gender",
            columns=[SqlColumn("patients", "gender")],
            static_inputs=["unknown"],
            merging_script=MergingScript("select_first_not_empty"),
        ),
        Attribute(
            "maritalStatus.coding[0].code",
            columns=[SqlColumn("admissions", "marital_status")],
            static_inputs=[],
            merging_script=None,
        ),
        # Static-only attribute: no source column.
        Attribute(
            "generalPractitioner[0].type",
            columns=[],
            static_inputs=["Practitioner"],
            merging_script=None,
        ),
    ]
    # The set of all source columns referenced by the mapping.
    assert analyzer.analysis.columns == {
        SqlColumn("patients", "row_id"),
        SqlColumn("patients", "gender"),
        SqlColumn("patients", "dob"),
        SqlColumn("patients", "dod"),
        SqlColumn("patients", "expire_flag"),
        SqlColumn("admissions", "marital_status"),
        SqlColumn("icustays", "hadm_id"),
    }
    # The joins needed to reach the non-primary tables.
    assert analyzer.analysis.joins == {
        SqlJoin(SqlColumn("patients", "subject_id"), SqlColumn("admissions", "subject_id")),
        SqlJoin(SqlColumn("patients", "subject_id"), SqlColumn("icustays", "subject_id")),
    }
def pk_col_name(primary_key_column: SqlColumn):
    """Return the MultiIndex label used for the primary-key column.

    The first level is the literal tag "pk"; the second mirrors the
    (dataframe column name, table) pair used for attribute columns.
    """
    dataframe_name = primary_key_column.dataframe_column_name()
    return "pk", (dataframe_name, primary_key_column.table)
def test_clean_data(_, fhir_concept_map_code, fhir_concept_map_gender):
    """clean_dataframe applies cleaning scripts and concept maps per column.

    Also checks that the output gets the (attribute, (column, table))
    MultiIndex and the trailing "pk" column.
    """
    df = pd.DataFrame(
        {
            "PATIENTS_NAME": ["alice", "bob", "charlie"],
            "PATIENTS_ID": ["id1", "id2", "id3"],
            "PATIENTS_ID2": ["id21", "id22", "id23"],
            "ADMISSIONS_LANGUAGE": ["M", "F", "F"],
            "ADMISSIONS_ID": ["ABC", "DEF", "GHI"],
        },
    )
    # Cleaning script only.
    attr_name = Attribute("name", columns=[
        SqlColumn(
            "PATIENTS",
            "NAME",
            cleaning_script=CleaningScript("clean1"),
        )
    ])
    # Two raw columns, no cleaning.
    attr_id = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"), SqlColumn("PATIENTS", "ID2")],
        static_inputs=["null"],
    )
    # Concept map only (M/F -> male/female).
    attr_language = Attribute(
        "language",
        columns=[
            SqlColumn("ADMISSIONS", "LANGUAGE", concept_map=ConceptMap("id_cm_gender"))
        ],
        static_inputs=["val"],
    )
    # Cleaning script followed by concept map.
    attr_admid = Attribute(
        "code",
        columns=[
            SqlColumn(
                "ADMISSIONS",
                "ID",
                cleaning_script=CleaningScript("clean2"),
                concept_map=ConceptMap("id_cm_code"),
            )
        ],
    )
    attributes = [attr_name, attr_id, attr_language, attr_admid]
    primary_key_column = SqlColumn("PATIENTS", "ID")

    cleaned_df = transform.clean_dataframe(df, attributes, primary_key_column)

    df_columns = pd.MultiIndex.from_tuples([
        (attr_name, ("PATIENTS_NAME", "PATIENTS")),
        (attr_id, ("PATIENTS_ID", "PATIENTS")),
        (attr_id, ("PATIENTS_ID2", "PATIENTS")),
        (attr_language, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (attr_admid, ("ADMISSIONS_ID", "ADMISSIONS")),
        ("pk", ("PATIENTS_ID", "PATIENTS")),
    ])
    expected = pd.DataFrame(
        {
            df_columns[0]: ["alicecleaned", "bobcleaned", "charliecleaned"],
            df_columns[1]: ["id1", "id2", "id3"],
            df_columns[2]: ["id21", "id22", "id23"],
            df_columns[3]: ["male", "female", "female"],
            df_columns[4]: ["abc", "def", "ghi"],
            df_columns[5]: ["id1", "id2", "id3"],
        },
    )
    assert cleaned_df.equals(expected)