def test_make_df_from_expectations_with_categories_expression_validation():
    """Dummy-data generation must reject an unknown expectation category.

    The ``categorised_as`` mapping defines only "A", "B" and "" (the default),
    while the expectation ratios also mention "C"; generating the dataframe is
    expected to raise ``ValueError``.
    """
    category_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    # "C" deliberately has no counterpart in ``category_rules``.
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            category_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_make_df_from_expectations_with_categories_in_codelist_validation():
    """Expect ``ValueError`` for a categorised codelist with these expectations.

    The codelist's only entry is ("X", "Y"), whereas the expectation ratios
    name "A" and "B" -- presumably that mismatch is what triggers the error
    (NOTE(review): confirm against the validation code).
    """
    codes = codelist([("X", "Y")], system="ctv3")
    codes.has_categories = True
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            codes,
            returning="category",
            return_expectations=expectations,
            find_last_match_in_period=True,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
# Example #3
# 0
def test_make_df_no_categories_validation_when_no_categories_in_definition():
    """Category expectations on a plain ``patients.sex()`` variable are accepted.

    There is no assertion: the test passes as long as generating the dummy
    dataframe does not raise.
    """
    sex_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.49, "F": 0.51}},
    }
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
    )
    # Completing without an exception is the whole check.
    study.make_df_from_expectations(10000)
def test_make_df_from_expectations_with_number_of_episodes():
    """Dummy data for a ``number_of_episodes`` variable has a single column.

    Builds a study with one ``episode_count`` variable and checks that the
    generated dataframe's columns are exactly ``["episode_count"]``.
    """
    study = StudyDefinition(
        population=patients.all(),
        episode_count=patients.with_these_clinical_events(
            codelist(["A", "B", "C"], system="ctv3"),
            ignore_days_where_these_codes_occur=codelist(["D", "E"], system="ctv3"),
            returning="number_of_episodes",
            episode_defined_as="series of events each <= 14 days apart",
            return_expectations={
                "int": {"distribution": "normal", "mean": 4, "stddev": 2},
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "incidence": 0.2,
            },
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: ``Index == list`` is element-wise, so a
    # column-count mismatch would raise instead of failing the assertion.
    assert list(result.columns) == ["episode_count"]
# Example #5
# 0
def test_make_df_from_expectations_doesnt_alter_date_defaults():
    """Regression test: overrides on one variable must not change the defaults.

    Two variables override part of the default expectations (incidence, date
    range); a third relies purely on the defaults. The third must still
    produce dates before 2015 and no missing values (default incidence 1.0).
    """
    defaults = {
        "rate": "exponential_increase",
        "incidence": 1.0,
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.5, "F": 0.5}},
    }
    later_dates_only = {"date": {"earliest": "2015-01-01", "latest": "today"}}
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        with_different_incidence=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations={"incidence": 0.2},
            include_day=True,
        ),
        with_different_date=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations=later_dates_only,
            include_day=True,
        ),
        with_defaults=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            include_day=True,
        ),
    )
    result = study.make_df_from_expectations(10000)

    # The default-only column must still honour the original defaults.
    earliest_default = result.with_defaults.min()
    assert earliest_default < "2015-01-01"
    missing_defaults = result[pd.isnull(result.with_defaults)]
    assert len(missing_defaults) == 0
def test_make_df_from_expectations_with_distribution_and_date():
    """A float variable and its ``date_of`` companion go missing together.

    Checks that exactly the ``bmi`` and ``bmi_date_measured`` columns are
    produced, and that rows where ``bmi`` is 0.0 are precisely the rows where
    the measurement date is null.
    """
    bmi_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
            return_expectations=bmi_expectations,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )
    result = study.make_df_from_expectations(10000)
    assert sorted(result.columns) == ["bmi", "bmi_date_measured"]

    # The test treats a 0.0 bmi as the "missing" marker; it has to line up
    # row-for-row with a null measurement date.
    bmi_missing = result["bmi"] == 0.0
    date_missing = pd.isnull(result["bmi_date_measured"])
    assert (bmi_missing == date_missing).all()
def test_make_df_from_expectations_with_categories_expression():
    """Category ratios shape the dummy data of a ``categorised_as`` variable.

    With ratios of 0.3 for "A" and 0.7 for "B", "A" must occur less often
    than "B" in the generated column.
    """
    category_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            category_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )
    result = study.make_df_from_expectations(10000)
    counts = result.category.value_counts()
    assert counts["A"] < counts["B"]
def test_make_df_from_expectations_partial_default_overrides():
    """A partial ``date`` override still bounds the generated dates.

    Only ``latest`` is overridden on the variable; the test checks that the
    largest generated year (dates are formatted as "YYYY") equals 2000.
    """
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY",
            return_expectations={"date": {"latest": "2000-01-01"}},
        ),
    )

    result = study.make_df_from_expectations(10000)
    latest_year = result.asthma_condition.astype("float").max()
    assert latest_year == 2000
def test_make_df_from_expectations_with_categories():
    """Category ratios shape dummy data drawn from a categorised codelist.

    Checks that the only generated column is ``ethnicity`` and that category
    "A" (ratio 0.3) is rarer than category "B" (ratio 0.7).
    """
    categorised_codelist = codelist([("1", "A"), ("2", "B")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {"ratios": {"A": 0.3, "B": 0.7}},
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_last_match_in_period=True,
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: ``Index == list`` is element-wise, so a
    # column-count mismatch would raise instead of failing the assertion.
    assert list(result.columns) == ["ethnicity"]

    category_counts = result.reset_index().groupby("ethnicity").count()
    # ``.iloc[0]`` picks the first count column explicitly; plain ``[0]`` on a
    # label-indexed Series relies on deprecated positional fallback.
    count_a = category_counts.loc["A", :].iloc[0]
    count_b = category_counts.loc["B", :].iloc[0]
    assert count_a < count_b
def test_make_df_from_expectations_with_using_dates_as_categories():
    """Date-like strings can be used as category labels.

    With incidence 1 and three date-string categories, the set of generated
    values must be exactly those three strings.
    """
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    eligibility_rules = {
        "2020-04-14": "age >= 80",
        "2020-06-16": "age >= 70 AND age < 80",
        "2020-08-18": "DEFAULT",
    }
    eligibility_expectations = {
        "category": {
            "ratios": {
                "2020-04-14": 0.25,
                "2020-06-16": 0.25,
                "2020-08-18": 0.5,
            }
        },
        "incidence": 1,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        eligible_date=patients.categorised_as(
            eligibility_rules,
            age=patients.age_as_of("2020-01-01"),
            return_expectations=eligibility_expectations,
        ),
    )
    result = study.make_df_from_expectations(100)
    generated = set(result.eligible_date)
    assert generated == {"2020-04-14", "2020-06-16", "2020-08-18"}
# Example #11
# 0
def test_make_df_from_expectations_with_care_home_status():
    """Care-home status works both as a boolean and as a categorised variable.

    The assertion only covers the categorised form: "PN" (ratio 0.1) must be
    rarer than "U" (ratio 0.7).
    """
    flag_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.3,
        "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
        "bool": True,
    }
    type_rules = {
        "PN": "IsPotentialCareHome AND LocationRequiresNursing='Y'",
        "PC": "IsPotentialCareHome",
        "U": "DEFAULT",
    }
    type_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"PN": 0.1, "PC": 0.2, "U": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        is_in_care_home=patients.care_home_status_as_of(
            "2020-01-01",
            return_expectations=flag_expectations,
        ),
        care_home_type=patients.care_home_status_as_of(
            "2020-01-01",
            categorised_as=type_rules,
            return_expectations=type_expectations,
        ),
    )
    result = study.make_df_from_expectations(10000)
    counts = result.care_home_type.value_counts()
    assert counts["PN"] < counts["U"]
def test_make_df_from_expectations_with_mean_recorded_value():
    """Non-zero dummy values follow the requested normal distribution.

    The expectation asks for mean 35; the integer-truncated mean of the
    non-zero ``drug_x`` values must be within 5 of that.
    """
    expectations = {
        "rate": "exponential_increase",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
    }
    study = StudyDefinition(
        population=patients.all(),
        drug_x=patients.mean_recorded_value(
            codelist(["X"], system="ctv3"),
            on_most_recent_day_of_measurement=True,
            return_expectations=expectations,
        ),
    )
    result = study.make_df_from_expectations(10000)
    # Rows equal to 0.0 are left out of the mean.
    observed = result[result["drug_x"] != 0.0]
    observed_mean = int(observed["drug_x"].mean())
    assert abs(35 - observed_mean) < 5
def test_column_refs_in_date_expressions_do_not_trigger_errors():
    """Date expressions that reference another column must not break dummy data.

    Further down the road we want to actually interpret these expressions and
    generate appropriate dates, but for now we just need to not blow up when
    we encounter them.
    """

    def date_expectations():
        # Fresh dict per variable, mirroring two separate literals.
        return {
            "rate": "exponential_increase",
            "incidence": 0.2,
            "date": {"earliest": "1990-01-01", "latest": "today"},
        }

    study = StudyDefinition(
        population=patients.all(),
        copd_exacerbation=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            between=["2001-12-01", "2002-06-01"],
            returning="date",
            return_expectations=date_expectations(),
            find_last_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
        drug_after_exacerbation=patients.with_these_medications(
            codelist(["Y"], system="snomed"),
            # Both bounds refer to the ``copd_exacerbation`` column.
            between=["copd_exacerbation", "copd_exacerbation + 3 months"],
            returning="date",
            return_expectations=date_expectations(),
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    # No assertion: finishing without an exception is the check.
    study.make_df_from_expectations(10000)
# Example #14
# 0
def test_make_df_from_binary_default_outcome():
    """The share of non-null ``died`` values matches the requested incidence.

    With incidence 0.1, the number of non-null rows must equal
    ``0.1 * population_size``.
    """
    population_size = 10000
    death_expectations = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.1,
    }
    study = StudyDefinition(
        population=patients.all(),
        died=patients.died_from_any_cause(return_expectations=death_expectations),
    )
    result = study.make_df_from_expectations(population_size)
    with_outcome = result[~pd.isnull(result.died)]
    assert len(with_outcome) == 0.1 * population_size
# Example #15
# 0
def test_make_df_from_expectations_returning_date_using_defaults():
    """A date variable with no expectations of its own uses the study defaults.

    The defaults allow dates back to 1900, so the earliest generated
    ``asthma_condition`` date must fall before 1960.
    """
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "rate": "exponential_increase",
            "incidence": 0.2,
        },
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Take the column minimum directly instead of ``DataFrame.min()[0]``,
    # which relies on deprecated positional indexing of a labelled Series.
    earliest = result["asthma_condition"].dropna().min()
    assert earliest < "1960-01-01"
# Example #16
# 0
def test_make_df_from_expectations_with_date_filter():
    """A ``between`` filter caps the generated dates.

    The expectations allow dates up to "today", but the variable is limited
    to 2001-12-01..2002-06-01; the latest generated date must not exceed the
    upper bound.
    """
    study = StudyDefinition(
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            between=["2001-12-01", "2002-06-01"],
            returning="date",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "date": {"earliest": "1900-01-01", "latest": "today"},
            },
            find_first_match_in_period=True,
            date_format="YYYY-MM-DD",
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: ``Index == list`` is element-wise, so a
    # column-count mismatch would raise instead of failing the assertion.
    assert list(result.columns) == ["asthma_condition"]
    # Take the column maximum directly instead of ``DataFrame.max()[0]``,
    # which relies on deprecated positional indexing of a labelled Series.
    latest = result["asthma_condition"].dropna().max()
    assert latest <= "2002-06-01"
# Example #17
# 0
def test_make_df_from_expectations_with_satisfying():
    """Variables defined inline in ``satisfying`` do not become columns.

    ``condition_a`` and ``condition_b`` exist only inside the expression; the
    generated dataframe must contain just ``has_condition``.
    """
    study = StudyDefinition(
        population=patients.all(),
        has_condition=patients.satisfying(
            "condition_a OR condition_b",
            condition_a=patients.with_these_clinical_events(
                codelist(["A", "B", "C"], system="ctv3")
            ),
            condition_b=patients.with_these_clinical_events(
                codelist(["X", "Y", "Z"], system="ctv3")
            ),
            return_expectations={
                "date": {"earliest": "2001-01-01", "latest": "2020-03-01"},
                "incidence": 0.95,
            },
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: ``Index == list`` is element-wise, so extra
    # columns would raise a length error instead of failing the assertion.
    assert list(result.columns) == ["has_condition"]
def test_make_df_from_expectations_with_deregistration_date():
    """Deregistration dates stay inside the expected range.

    Dates are formatted "YYYY-MM"; the non-null values must lie between
    "1980-01" and "2018-02" inclusive.
    """
    dereg_expectations = {
        "incidence": 0.1,
        "date": {"earliest": "1980-01-01", "latest": "2018-02-01"},
    }
    study = StudyDefinition(
        population=patients.all(),
        dereg_date=patients.date_deregistered_from_all_supported_practices(
            on_or_before="2018-02-01",
            date_format="YYYY-MM",
            return_expectations=dereg_expectations,
        ),
    )
    result = study.make_df_from_expectations(1000)
    recorded = result.dereg_date.dropna()
    latest = recorded.max()
    assert latest <= "2018-02"
    earliest = recorded.min()
    assert earliest >= "1980-01"
def test_make_df_from_expectations_with_covid_therapeutics():
    """Covid-therapeutics variables honour their rate and incidence.

    The "universal" category variable must have no nulls; the date variable
    with incidence 0.2 must be null for 800 of the 1000 patients.
    """
    status_filter = ["Approved", "Treatment Complete"]
    therapeutic_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {
            "ratios": {
                "Approved": 0.25,
                "Treatment Complete": 0.25,
                "Treatment Not Started": 0.25,
                "Treatment Stopped": 0.25,
            }
        },
    }
    start_date_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        therapeutic_approved=patients.with_covid_therapeutics(
            # Separate list objects per variable, mirroring two literals.
            with_these_statuses=list(status_filter),
            returning="therapeutic",
            return_expectations=therapeutic_expectations,
        ),
        start_date=patients.with_covid_therapeutics(
            with_these_statuses=list(status_filter),
            returning="date",
            return_expectations=start_date_expectations,
        ),
    )
    result = study.make_df_from_expectations(1000)
    without_therapeutic = result[pd.isnull(result.therapeutic_approved)]
    assert len(without_therapeutic) == 0
    without_start_date = result[pd.isnull(result.start_date)]
    assert len(without_start_date) == 800
def test_make_df_from_expectations_doesnt_alter_defaults():
    """An incidence override on one variable must not affect another.

    ``sex_altered`` lowers the incidence to 0.1; ``sex_default`` keeps the
    default incidence of 1.0 and therefore must have no null values.
    """

    def even_split():
        # Fresh dict on every call, mirroring three separate literals.
        return {"ratios": {"M": 0.5, "F": 0.5}}

    defaults = {
        "rate": "exponential_increase",
        "incidence": 1.0,
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": even_split(),
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        sex_altered=patients.sex(
            return_expectations={"incidence": 0.1, "category": even_split()}
        ),
        sex_default=patients.sex(
            return_expectations={"category": even_split()}
        ),
    )
    result = study.make_df_from_expectations(10000)
    missing_default_sex = result[pd.isnull(result.sex_default)]
    assert len(missing_default_sex) == 0
def _aggregate_of_default_expectations(incidence):
    """Return the study-level default expectations used by the aggregate test."""
    return {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": incidence,
    }


def _aggregate_of_event_date():
    """Build a fresh date-returning clinical-events variable for code "X"."""
    return patients.with_these_clinical_events(
        codelist(["X"], system="ctv3"),
        returning="date",
        date_format="YYYY-MM-DD",
    )


def _aggregate_of_match_count():
    """Build a fresh match-count clinical-events variable for code "X"."""
    return patients.with_these_clinical_events(
        codelist(["X"], system="ctv3"),
        returning="number_of_matches_in_period",
        return_expectations={
            "int": {"distribution": "normal", "mean": 25, "stddev": 5},
            "incidence": 0.5,
        },
    )


def _aggregate_of_assert_rows_match_inputs(result):
    """Check each row's min/max columns against date_1/date_2 and int_1/int_2.

    Only ``str`` dates and ``int`` counts take part in the expected value;
    when neither input qualifies, the aggregate is expected to be NaN.
    """
    for _, row in result.iterrows():
        dates = [d for d in (row["date_1"], row["date_2"]) if isinstance(d, str)]
        expected_date_min = min(dates) if dates else float("nan")
        expected_date_max = max(dates) if dates else float("nan")
        assert_nan_equal(row["date_min"], expected_date_min)
        assert_nan_equal(row["date_max"], expected_date_max)

        ints = [i for i in (row["int_1"], row["int_2"]) if isinstance(i, int)]
        expected_int_min = min(ints) if ints else float("nan")
        expected_int_max = max(ints) if ints else float("nan")
        assert_nan_equal(row["int_min"], expected_int_min)
        assert_nan_equal(row["int_max"], expected_int_max)


def test_make_df_from_expectations_with_aggregate_of():
    """Exercise ``minimum_of`` / ``maximum_of`` in dummy-data generation.

    Three scenarios are covered: aggregates over variables defined in their
    own right, over variables defined only inside the aggregate call, and
    over a mix of both.
    """
    # aggregate of variables defined in their own right
    study = StudyDefinition(
        default_expectations=_aggregate_of_default_expectations(0.2),
        population=patients.all(),
        date_1=_aggregate_of_event_date(),
        date_2=_aggregate_of_event_date(),
        date_min=patients.minimum_of(
            "date_1",
            "date_2",
        ),
        date_max=patients.maximum_of(
            "date_1",
            "date_2",
        ),
        date_min_fixed=patients.minimum_of(
            "date_1",
            "1980-10-20",
        ),
        int_1=_aggregate_of_match_count(),
        int_2=_aggregate_of_match_count(),
        int_min=patients.minimum_of("int_1", "int_2"),
        int_max=patients.maximum_of("int_1", "int_2"),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    _aggregate_of_assert_rows_match_inputs(result)

    # aggregate of variables defined only within aggregate function
    study = StudyDefinition(
        default_expectations=_aggregate_of_default_expectations(1),
        # We use an expression here (never mind that it's a trivial and
        # pointless one) as that triggers a bug which we want to ensure we've
        # fixed
        population=patients.satisfying(
            "foo OR bar",
            foo=patients.all(),
            bar=patients.all(),
        ),
        # NOTE(review): ``date_min`` is built with ``maximum_of``; this looks
        # like a copy/paste slip but is kept because the assertions below only
        # check for non-null values -- confirm the intent.
        date_min=patients.maximum_of(
            date_1=_aggregate_of_event_date(),
            date_2=_aggregate_of_event_date(),
        ),
        date_max=patients.maximum_of(
            date_3=_aggregate_of_event_date(),
            date_4=_aggregate_of_event_date(),
        ),
        int_min=patients.minimum_of(
            int_1=_aggregate_of_match_count(),
            int_2=_aggregate_of_match_count(),
        ),
        int_max=patients.maximum_of(
            int_3=_aggregate_of_match_count(),
            int_4=_aggregate_of_match_count(),
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    for _, row in result.iterrows():
        assert pd.notna(row["date_min"])
        assert pd.notna(row["date_max"])
        assert pd.notna(row["int_min"])
        assert pd.notna(row["int_max"])

    # aggregate of variables defined both inside and outside aggregation
    study = StudyDefinition(
        default_expectations=_aggregate_of_default_expectations(0.2),
        population=patients.all(),
        date_1=_aggregate_of_event_date(),
        date_2=_aggregate_of_event_date(),
        date_min=patients.minimum_of(
            "date_1",
            "date_2",
            date_3=_aggregate_of_event_date(),
        ),
        date_max=patients.maximum_of(
            "date_1",
            "date_2",
            date_4=_aggregate_of_event_date(),
        ),
        int_1=_aggregate_of_match_count(),
        int_2=_aggregate_of_match_count(),
        int_min=patients.minimum_of(
            "int_1",
            "int_2",
            int_3=_aggregate_of_match_count(),
        ),
        int_max=patients.maximum_of(
            "int_1",
            "int_2",
            int_4=_aggregate_of_match_count(),
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # NOTE(review): the expected values are derived from date_1/date_2 and
    # int_1/int_2 only; the inline date_3/date_4/int_3/int_4 inputs are not
    # part of the expectation -- confirm that matches how dummy data treats
    # inline aggregate inputs.
    _aggregate_of_assert_rows_match_inputs(result)