Beispiel #1
0
def test_create_instance(mock_datetime, patient_mapping,
                         fhir_concept_map_identifier):
    """create_instance builds a complete FHIR Patient from one row of values."""
    mock_datetime.now.return_value = mockdatetime()

    resource_mapping = patient_mapping

    identifier_attr = Attribute("identifier[0].value",
                                columns=[SqlColumn("a", "b")])
    birth_date_attr = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    marital_status_attr = Attribute("maritalStatus.coding[0].code",
                                    columns=[SqlColumn("a", "d")])
    practitioner_attr = Attribute("generalPractitioner[0].type",
                                  static_inputs=["Practitioner"])

    attributes = [
        identifier_attr,
        birth_date_attr,
        marital_status_attr,
        practitioner_attr,
    ]

    row = {
        marital_status_attr: "D",
        birth_date_attr: "2000-10-10",
        identifier_attr: "A",
    }

    actual = transform.create_instance(row, resource_mapping, attributes)

    expected = {
        "meta": {
            "lastUpdated": "now",
            "tag": [
                {
                    "system": ARKHN_CODE_SYSTEMS.source,
                    "code": patient_mapping["source"]["id"],
                },
                {
                    "system": ARKHN_CODE_SYSTEMS.resource,
                    "code": patient_mapping["id"],
                },
            ],
        },
        # The id is generated, so reuse whatever was produced.
        "id": actual["id"],
        "identifier": [{"value": "A"}],
        "resourceType": "Patient",
        "birthDate": "2000-10-10",
        "maritalStatus": {"coding": [{"code": "D"}]},
        "generalPractitioner": [{"type": "Practitioner"}],
    }
    assert actual == expected
Beispiel #2
0
def test_build_squash_rules():
    """build_squash_rules nests joined tables under the root table."""
    # NOTE: a list (not a set) keeps the column order deterministic.
    columns = [
        "ADMISSIONS.LANGUAGE",
        "PATIENTS.DOD",
        "PATIENTS.SUBJECT_ID",
    ]
    joins = {
        SqlJoin(SqlColumn("PATIENTS", "SUBJECT_ID"),
                SqlColumn("ADMISSIONS", "SUBJECT_ID"))
    }

    squash_rules = mapping.build_squash_rules(columns, joins, "PATIENTS")

    assert squash_rules == ["PATIENTS", [["ADMISSIONS", []]]]
Beispiel #3
0
def test_apply_joins():
    """apply_joins issues one outer join per SqlJoin, in order."""
    extractor = Extractor(None)
    joins = [
        SqlJoin(SqlColumn("patients", "subject_id"),
                SqlColumn("admissions", "subject_id")),
        SqlJoin(SqlColumn("admissions", "row_id"),
                SqlColumn("prescriptions", "row_id")),
    ]

    query = mock.MagicMock()

    extractor.apply_joins(query, joins)

    expected_tables = [tables["admissions"], tables["prescriptions"]]
    expected_conditions = [
        mock_get_column("", SqlColumn("patients", "subject_id"))
        == mock_get_column("", SqlColumn("admissions", "subject_id")),
        mock_get_column("", SqlColumn("admissions", "row_id"))
        == mock_get_column("", SqlColumn("prescriptions", "row_id")),
    ]

    join_calls = query.join.call_args_list
    for join_call, table, condition in zip(join_calls, expected_tables,
                                           expected_conditions):
        args, kwargs = join_call
        assert args[0] == table
        assert args[1].compare(condition)
        assert kwargs == {"isouter": True}
Beispiel #4
0
 def get_column(self, column: SqlColumn) -> AlchemyColumn:
     """Return the sqlalchemy column matching the given SqlColumn (custom type).

     The column is relabelled explicitly so that sqlAlchemy's automatic
     labelling cannot produce name collisions.
     """
     alchemy_table = self.get_table(column)
     alchemy_column = alchemy_table.c[column.column]
     return alchemy_column.label(column.dataframe_column_name())
Beispiel #5
0
def test_fetch_values_from_dataframe():
    """fetch_values_from_dataframe returns the value bound to one attribute."""
    identifier_attr = Attribute("identifier[0].value",
                                columns=[SqlColumn("a", "b")])
    birth_date_attr = Attribute("birthDate", columns=[SqlColumn("a", "c")])
    marital_status_attr = Attribute("maritalStatus.coding[0].code",
                                    columns=[SqlColumn("a", "d")])

    row = {
        marital_status_attr: "D",
        birth_date_attr: "2000-10-10",
        identifier_attr: "A",
    }

    # Only the value of the requested attribute is returned.
    value = transform.fetch_values_from_dataframe(row, birth_date_attr)

    assert value == "2000-10-10"
Beispiel #6
0
def test_apply_filters():
    """apply_filters adds the pk restriction first, then the mapping filters."""
    extractor = Extractor(None)
    resource_mapping = {
        "filters": [
            {
                "relation": "LIKE",
                "value": "'2150-08-29'",
                "sqlColumn": {
                    "owner": None,
                    "table": "admissions",
                    "column": "admittime",
                },
            },
            {
                "relation": "<=",
                "value": "1000",
                "sqlColumn": {
                    "owner": None,
                    "table": "patients",
                    "column": "row_id",
                },
            },
        ]
    }
    primary_key = SqlColumn("patients", "subject_id")
    primary_key_values = [123, 456]

    query = mock.MagicMock()

    extractor.apply_filters(query, resource_mapping, primary_key,
                            primary_key_values)

    expected_expressions = [
        extractor.get_column(SqlColumn("patients",
                                       "subject_id")).in_(primary_key_values),
        extractor.get_column(SqlColumn("admissions",
                                       "admittime")).like("'2150-08-29'"),
        extractor.get_column(SqlColumn("patients", "row_id")) <= "1000",
    ]

    for filter_call, expression in zip(query.filter.call_args_list,
                                       expected_expressions):
        args, _ = filter_call
        assert args[0].compare(expression)
Beispiel #7
0
def test_squash_rows():
    """squash_rows groups joined-table values into tuples per root-table row."""
    name_attr = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    id_attr = Attribute("id", columns=[SqlColumn("PATIENTS", "ID")])
    language_attr = Attribute("language",
                              columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    admid_attr = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    columns = pd.MultiIndex.from_tuples([
        (name_attr, ("PATIENTS_NAME", "PATIENTS")),
        (id_attr, ("PATIENTS_ID", "PATIENTS")),
        (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (admid_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
    ])

    df = pd.DataFrame({
        columns[0]: ["bob", "bob", "alice", "bob"],
        columns[1]: ["id1", "id1", "id2", "id3"],
        columns[2]: ["lang1", "lang2", "lang3", "lang4"],
        columns[3]: ["id1", "id2", "id3", "id4"],
    })
    squash_rules = ["PATIENTS", [["ADMISSIONS", []]]]

    actual = transform.squash_rows(df, squash_rules)

    expected = pd.DataFrame({
        columns[0]: ["bob", "alice", "bob"],
        columns[1]: ["id1", "id2", "id3"],
        columns[2]: [("lang1", "lang2"), ("lang3", ), ("lang4", )],
        columns[3]: [("id1", "id2"), ("id3", ), ("id4", )],
    })

    # Sort both frames so row order cannot make the comparison fail.
    actual = actual.sort_values(by=columns[1]).reset_index(drop=True)
    expected = expected.sort_values(by=columns[1]).reset_index(drop=True)

    assert actual.equals(expected)
Beispiel #8
0
def test_get_primary_key():
    """get_primary_key stores the pk column on the analysis, owner optional."""
    analyzer = Analyzer()

    # Mapping with an owner: the owner ends up in the SqlColumn.
    mapping_with_owner = {
        "primaryKeyOwner": "owner",
        "primaryKeyTable": "table",
        "primaryKeyColumn": "col",
    }
    analyzer.get_primary_key(mapping_with_owner)
    assert analyzer.analysis.primary_key_column == SqlColumn(
        "table", "col", "owner")

    # Mapping without an owner.
    mapping_without_owner = {
        "primaryKeyOwner": "",
        "primaryKeyTable": "table",
        "primaryKeyColumn": "col",
    }
    analyzer.get_primary_key(mapping_without_owner)
    assert analyzer.analysis.primary_key_column == SqlColumn("table", "col")

    # Missing primary key table: the call must raise.
    incomplete_mapping = {
        "primaryKeyOwner": "",
        "primaryKeyTable": "",
        "primaryKeyColumn": "col",
        "definitionId": "fhirtype",
    }
    with pytest.raises(
            ValueError,
            match=
            "You need to provide a primary key table and column in the mapping"
    ):
        analyzer.get_primary_key(incomplete_mapping)
Beispiel #9
0
def test_handle_array_attributes():
    """Tuple values are spread over the array; scalars are repeated."""
    first_attr = Attribute("attr1", columns=[SqlColumn("a", "b")])
    second_attr = Attribute("attr2", columns=[SqlColumn("a", "c")])
    attributes_in_array = {
        "path1": first_attr,
        "path2": second_attr,
    }

    row = {
        first_attr: ("A1", "A2", "A3"),
        second_attr: "B",
    }

    value = transform.handle_array_attributes(attributes_in_array, row)

    assert value == [
        {"path1": "A1", "path2": "B"},
        {"path1": "A2", "path2": "B"},
        {"path1": "A3", "path2": "B"},
    ]

    # Tuples of different lengths cannot be combined into one array.
    row = {
        first_attr: ("A1", "A2", "A3"),
        second_attr: ("B1", "B2"),
    }
    with raises(AssertionError, match="mismatch in array lengths"):
        transform.handle_array_attributes(attributes_in_array, row)
Beispiel #10
0
    def apply_filters(self, query: Query, resource_mapping,
                      pk_column: SqlColumn, pk_values) -> Query:
        """ Augment the sql alchemy query with filters from the analysis.

        Args:
            query: the base sqlalchemy query to augment.
            resource_mapping: mapping dict; its "filters" entry lists the
                filters to apply (keys: "relation", "value", "sqlColumn").
            pk_column: the resource's primary key column.
            pk_values: if not None, restrict the query to rows whose
                primary key is in this collection.

        Returns:
            The query with all filters applied.
        """
        if pk_values is not None:
            query = query.filter(self.get_column(pk_column).in_(pk_values))

        # Named sql_filter to avoid shadowing the builtin `filter`; `or []`
        # keeps the original behavior when "filters" is empty or None.
        for sql_filter in resource_mapping["filters"] or []:
            col = self.get_column(
                SqlColumn(
                    sql_filter["sqlColumn"]["table"],
                    sql_filter["sqlColumn"]["column"],
                    sql_filter["sqlColumn"]["owner"],
                ))
            # Map the mapping's relation (e.g. "LIKE", "<=") to the
            # corresponding sqlalchemy column method.
            rel_method = SQL_RELATIONS_TO_METHOD[sql_filter["relation"]]
            query = query.filter(getattr(col, rel_method)(sql_filter["value"]))

        return query
Beispiel #11
0
def test_merge_dataframe(_):
    """merge_dataframe collapses multi-column attributes via merging scripts."""
    name_attr = Attribute("name", columns=[SqlColumn("PATIENTS", "NAME")])
    id_attr = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"),
                 SqlColumn("PATIENTS", "ID2")],
        static_inputs=["unknown"],
        merging_script=MergingScript("merge"),
    )
    language_attr = Attribute("language",
                              columns=[SqlColumn("ADMISSIONS", "LANGUAGE")])
    admid_attr = Attribute("admid", columns=[SqlColumn("ADMISSIONS", "ID")])

    columns = pd.MultiIndex.from_tuples([
        (name_attr, ("PATIENTS_NAME", "PATIENTS")),
        (id_attr, ("PATIENTS_ID", "PATIENTS")),
        (id_attr, ("PATIENTS_ID2", "PATIENTS")),
        (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (admid_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
        ("pk", ("PATIENTS_ID", "PATIENTS")),
    ])

    df = pd.DataFrame({
        columns[0]: ["bob", "bob", "alice", "bob"],
        columns[1]: ["id1", "id1", "id2", "id3"],
        columns[2]: ["id21", "id21", "id22", "id23"],
        columns[3]: ["lang1", "lang2", "lang3", "lang4"],
        columns[4]: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
        columns[5]: ["id1", "id2", "id3", "id4"],
    })
    attributes = [name_attr, id_attr, language_attr, admid_attr]
    primary_key = SqlColumn("PATIENTS", "ID")

    actual = transform.merge_dataframe(df, attributes, primary_key)

    # Each attribute becomes a single column; the two id columns are
    # combined by the "merge" merging script.
    expected = pd.DataFrame({
        name_attr: ["bob", "bob", "alice", "bob"],
        id_attr:
        ["id1id21merge", "id1id21merge", "id2id22merge", "id3id23merge"],
        language_attr: ["lang1", "lang2", "lang3", "lang4"],
        admid_attr: ["hadmid1", "hadmid2", "hadmid3", "hadmid4"],
    })

    assert actual.equals(expected)
Beispiel #12
0
def test_analyze_mapping(patient_mapping, fhir_concept_map_gender,
                         fhir_concept_map_identifier):
    """analyze_mapping extracts attributes, columns and joins from a mapping."""
    analyzer = Analyzer()

    analyzer.analyze_mapping(patient_mapping)

    # (path, columns, static_inputs, merging_script) for each attribute,
    # in the order the analyzer produces them.
    attribute_specs = [
        ("identifier[0].value", [SqlColumn("patients", "row_id")], [], None),
        ("deceasedBoolean", [SqlColumn("patients", "expire_flag")], [], None),
        ("generalPractitioner[0].identifier.value",
         [SqlColumn("icustays", "hadm_id")], [], None),
        ("birthDate", [SqlColumn("patients", "dob")], [], None),
        ("deceasedDateTime", [SqlColumn("patients", "dod")], [], None),
        ("gender", [SqlColumn("patients", "gender")], ["unknown"],
         MergingScript("select_first_not_empty")),
        ("maritalStatus.coding[0].code",
         [SqlColumn("admissions", "marital_status")], [], None),
        ("generalPractitioner[0].type", [], ["Practitioner"], None),
    ]
    assert analyzer.analysis.attributes == [
        Attribute(path,
                  columns=cols,
                  static_inputs=inputs,
                  merging_script=script)
        for path, cols, inputs, script in attribute_specs
    ]

    assert analyzer.analysis.columns == {
        SqlColumn("patients", "row_id"),
        SqlColumn("patients", "gender"),
        SqlColumn("patients", "dob"),
        SqlColumn("patients", "dod"),
        SqlColumn("patients", "expire_flag"),
        SqlColumn("admissions", "marital_status"),
        SqlColumn("icustays", "hadm_id"),
    }
    assert analyzer.analysis.joins == {
        SqlJoin(SqlColumn("patients", "subject_id"),
                SqlColumn("admissions", "subject_id")),
        SqlJoin(SqlColumn("patients", "subject_id"),
                SqlColumn("icustays", "subject_id")),
    }
Beispiel #13
0
def pk_col_name(primary_key_column: SqlColumn):
    """Return the dataframe MultiIndex entry used for the primary key column."""
    label = primary_key_column.dataframe_column_name()
    return "pk", (label, primary_key_column.table)
Beispiel #14
0
def test_clean_data(_, fhir_concept_map_code, fhir_concept_map_gender):
    """clean_dataframe applies cleaning scripts and concept maps per column."""
    df = pd.DataFrame({
        "PATIENTS_NAME": ["alice", "bob", "charlie"],
        "PATIENTS_ID": ["id1", "id2", "id3"],
        "PATIENTS_ID2": ["id21", "id22", "id23"],
        "ADMISSIONS_LANGUAGE": ["M", "F", "F"],
        "ADMISSIONS_ID": ["ABC", "DEF", "GHI"],
    })
    # name goes through a cleaning script only.
    name_attr = Attribute(
        "name",
        columns=[
            SqlColumn("PATIENTS",
                      "NAME",
                      cleaning_script=CleaningScript("clean1"))
        ],
    )
    # id has two source columns and a static input, no cleaning.
    id_attr = Attribute(
        "id",
        columns=[SqlColumn("PATIENTS", "ID"),
                 SqlColumn("PATIENTS", "ID2")],
        static_inputs=["null"],
    )
    # language is translated through a concept map.
    language_attr = Attribute(
        "language",
        columns=[
            SqlColumn("ADMISSIONS",
                      "LANGUAGE",
                      concept_map=ConceptMap("id_cm_gender"))
        ],
        static_inputs=["val"],
    )
    # code is cleaned and then concept-mapped.
    admid_attr = Attribute(
        "code",
        columns=[
            SqlColumn(
                "ADMISSIONS",
                "ID",
                cleaning_script=CleaningScript("clean2"),
                concept_map=ConceptMap("id_cm_code"),
            )
        ],
    )
    attributes = [name_attr, id_attr, language_attr, admid_attr]
    primary_key = SqlColumn("PATIENTS", "ID")

    cleaned_df = transform.clean_dataframe(df, attributes, primary_key)

    expected_columns = pd.MultiIndex.from_tuples([
        (name_attr, ("PATIENTS_NAME", "PATIENTS")),
        (id_attr, ("PATIENTS_ID", "PATIENTS")),
        (id_attr, ("PATIENTS_ID2", "PATIENTS")),
        (language_attr, ("ADMISSIONS_LANGUAGE", "ADMISSIONS")),
        (admid_attr, ("ADMISSIONS_ID", "ADMISSIONS")),
        ("pk", ("PATIENTS_ID", "PATIENTS")),
    ])
    expected = pd.DataFrame({
        expected_columns[0]: ["alicecleaned", "bobcleaned", "charliecleaned"],
        expected_columns[1]: ["id1", "id2", "id3"],
        expected_columns[2]: ["id21", "id22", "id23"],
        expected_columns[3]: ["male", "female", "female"],
        expected_columns[4]: ["abc", "def", "ghi"],
        expected_columns[5]: ["id1", "id2", "id3"],
    })

    assert cleaned_df.equals(expected)