def test_recursive_definitions_produce_errors():
    """Two columns defined in terms of each other must be rejected.

    ``this`` is expressed via ``that`` and ``that`` via ``this``; building
    the study definition is expected to raise ``ValueError``.
    """
    with pytest.raises(ValueError):
        # Everything is built inside the context so that an error raised at
        # any stage of constructing the definition is captured.
        columns = {
            "population": patients.all(),
            "this": patients.satisfying("that = 1"),
            "that": patients.satisfying("this = 1"),
        }
        StudyDefinition(**columns)
def test_errors_are_triggered_without_database_url(monkeypatch):
    """Undefined columns in an expression raise even with no DATABASE_URL.

    The population expression refers to ``no_such_column`` and
    ``missing_column``, neither of which is defined on the study, and the
    test expects a ``KeyError`` while ``DATABASE_URL`` is absent from the
    environment.
    """
    # Remove the variable if it is set; do not fail if it already is not.
    monkeypatch.delenv("DATABASE_URL", raising=False)
    with pytest.raises(KeyError):
        columns = {
            "population": patients.satisfying(
                "no_such_column AND missing_column"
            ),
            "sex": patients.sex(),
            "age": patients.age_as_of("2020-01-01"),
        }
        StudyDefinition(**columns)
def test_syntax_errors_in_expressions_are_raised():
    """A malformed ``satisfying`` expression must raise ``ValueError``.

    The expression deliberately contains a doubled ``AND`` operator.
    """
    with pytest.raises(ValueError):
        everyone = patients.all()
        malformed_status = patients.satisfying(
            "age > 70 AND AND sex = 'M'",
            sex=patients.sex(),
            age=patients.age_as_of("2010-01-01"),
        )
        StudyDefinition(population=everyone, status=malformed_status)
def test_column_name_clashes_produce_errors():
    """Reusing a column name inside ``satisfying`` must raise ``ValueError``.

    ``age`` is defined once at the top level of the study and again, with a
    different reference date, as an inline variable of ``status``.
    """
    with pytest.raises(ValueError):
        everyone = patients.all()
        top_level_age = patients.age_as_of("2020-01-01")
        clashing_status = patients.satisfying(
            "age > 70 AND sex = 'M'",
            sex=patients.sex(),
            # Same name as the top-level column above: this is the clash.
            age=patients.age_as_of("2010-01-01"),
        )
        StudyDefinition(
            population=everyone,
            age=top_level_age,
            status=clashing_status,
        )
def test_make_df_from_expectations_with_satisfying():
    """Dummy data for a ``satisfying`` column exposes only that column.

    ``has_condition`` is defined from two inline variables (``condition_a``
    and ``condition_b``); the generated frame is expected to contain
    ``has_condition`` as its sole column.
    """
    study = StudyDefinition(
        population=patients.all(),
        has_condition=patients.satisfying(
            "condition_a OR condition_b",
            condition_a=patients.with_these_clinical_events(
                codelist(["A", "B", "C"], system="ctv3")
            ),
            condition_b=patients.with_these_clinical_events(
                codelist(["X", "Y", "Z"], system="ctv3")
            ),
            return_expectations={
                "date": {"earliest": "2001-01-01", "latest": "2020-03-01"},
                "incidence": 0.95,
            },
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list. Comparing the pandas Index directly is an
    # elementwise operation: it only yields a usable truth value for a single
    # column and raises ValueError (length mismatch) instead of a readable
    # assertion failure when the column count is wrong.
    assert list(result.columns) == ["has_condition"]
asthma_codes, between=["2017-02-28", "2020-02-29"], return_expectations={"incidence": 0.5}, ), asthma_ever=patients.with_these_clinical_events( asthma_ever_codes, on_or_before="2020-02-29", return_expectations={"incidence": 0.8}, ), age_cat=patients.satisfying( "age >=18 AND age <= 110", return_expectations={"incidence": 0.9}, age=patients.age_as_of( "2020-02-29", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, }, ), ), has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29", return_expectations={"incidence": 0.9}), copd=patients.with_these_clinical_events( copd_codes, on_or_before="2020-02-29", return_expectations={"incidence": 0.05}, ), ### OTHER RESPIRATORY other_respiratory=patients.with_these_clinical_events(
default_expectations={ "date": { "earliest": start_date, "latest": end_date }, "rate": "exponential_increase", "incidence": 0.1, }, population=patients.satisfying( """ registered AND (NOT died) AND (sex = 'F' OR sex='M') AND (age != 'missing') """, registered=patients.registered_as_of( "index_date", return_expectations={"incidence": 0.9}, ), died=patients.died_from_any_cause( on_or_before="index_date", returning="binary_flag", return_expectations={"incidence": 0.1}), ), age=patients.age_as_of( "index_date", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, },
# STUDY DEFINITION # Defines both the study population and points to the important covariates and outcomes study = StudyDefinition( default_expectations={ "date": { "earliest": "1970-01-01", "latest": "today" }, "rate": "uniform", "incidence": 0.2, }, # STUDY POPULATION - registered 1 year prior to November 16th 2020 population=patients.satisfying( "one_practice AND sgss_pos_inrange", one_practice=patients.registered_with_one_practice_between( "2019-11-16", "2020-11-16"), ), dereg_date=patients.date_deregistered_from_all_supported_practices( on_or_after="2020-11-16", date_format="YYYY-MM", ), # OUTCOMES - ONS death dates died_ons_covid_flag_any=patients.with_these_codes_on_death_certificate( covid_codelist, on_or_after="2020-02-01", match_only_underlying_cause=False, return_expectations={ "date": { "earliest": "2020-02-01"
# Configure the expectations framework default_expectations={ "date": { "earliest": "2020-01-01", "latest": "today" }, "rate": "universal", }, # define the study index date index_date=index_date, # This line defines the study population population=patients.satisfying( "(NOT died) AND (registered)", died=patients.died_from_any_cause(on_or_before=index_date, returning="binary_flag"), registered=patients.registered_as_of(index_date), ), age=patients.age_as_of(index_date, return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }), age_group=patients.categorised_as( { "0": "DEFAULT", "16 - under 40": """ age >= 16 AND age < 40""", "40 - under 50": """ age >= 40 AND age < 50""", "50 - under 65": """ age >= 50 AND age < 65""",
"rate": "universal", }, # define the study index date index_date=index_date, # This line defines the study population population=patients.satisfying( """ age >= 16 AND (NOT died) AND (registered) AND (carer) """, died=patients.died_from_any_cause(on_or_before=index_date, returning="binary_flag"), pop_age=patients.age_as_of(index_date), registered=patients.registered_as_of(index_date), carer=patients.with_these_clinical_events( carer_code, between=["index_date", "index_date + 1 month"], returning="binary_flag", return_expectations={"incidence": 0.6}, )), age=patients.age_as_of(index_date, return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }),
default_expectations={ "date": {"earliest": "1970-01-01", "latest": "today"}, "rate": "uniform", "incidence": 0.05, }, ## STUDY POPULATION (required) population=patients.all(), copd=patients.with_these_clinical_events( copd_codes, on_or_before="2020-02-29", return_expectations={"incidence": 0.4}, ), age_cat=patients.satisfying( "age >=35 AND age <= 110", return_expectations={"incidence": 0.9}, age=patients.age_as_of( "2020-02-29", return_expectations={ "rate": "universal", "int": {"distribution": "population_ages"}, }, ), ), ever_smoked=patients.with_these_clinical_events( filter_codes_by_category(clear_smoking_codes, include=["S", "E"]), on_or_before="2020-02-29", return_expectations={"incidence": 0.9}, ), has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29", return_expectations={"incidence": 0.9}, ), recent_asthma=patients.with_these_clinical_events( asthma_codes,
"latest": "today" }, "rate": "uniform", "incidence": 0.5, }, # STUDY POPULATION population=patients.satisfying( """ has_follow_up AND (age >=18 AND age <= 110) AND (rheumatoid OR osteoarthritis) AND imd >0 AND NOT ( (has_asthma AND saba_single) OR aspirin_ten_years OR stroke OR mi OR gi_bleed_ulcer ) """, has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29"), has_asthma=patients.with_these_clinical_events( current_asthma_codes, between=["2017-02-28", "2020-02-29"], ), ), # The rest of the lines define the covariates with from the protocol with associated GitHub issues # OUTCOMES died_ons_covid_flag_any=patients.with_these_codes_on_death_certificate( covid_identification, on_or_after="2020-03-01", match_only_underlying_cause=False,
population=patients.satisfying( """ ( has_asthma OR (asthma_ever AND any_asthma_med) ) AND (age >=18 AND age <= 110) AND has_follow_up AND NOT copd AND NOT other_respiratory AND NOT nebules AND NOT ( (lama_single OR laba_lama) AND NOT ( high_dose_ics OR high_dose_ics_single_ing OR high_dose_ics_multiple_ingredient OR low_med_dose_ics_single_ingredient OR low_med_dose_ics_multiple_ingredient OR low_med_dose_ics OR ics_single OR laba_ics OR laba_lama_ics ) ) """, has_asthma=patients.with_these_clinical_events( asthma_codes, between=["2017-02-28", "2020-02-29"], ), has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29"), nebules=patients.with_these_medications( nebulised_med_codes, between=["2019-02-28", "2020-02-29"], ), any_asthma_med=patients.satisfying(""" ltra_single OR laba_lama_ics OR laba_lama OR laba_ics OR lama_single OR laba_single OR sama_single OR saba_single OR ics_single OR low_med_dose_ics OR low_med_dose_ics_multiple_ingredient OR low_med_dose_ics_single_ingredient OR high_dose_ics_multiple_ingredient OR high_dose_ics_single_ing OR high_dose_ics """), ),
"earliest": "1980-01-01", "latest": "today" }, "rate": "uniform", "incidence": 0.05, }, index_date="2019-02-01", population=patients.satisfying( """ has_follow_up AND (age >=18 AND age <= 110) """, has_follow_up=patients.registered_with_one_practice_between( "index_date - 1 year", "index_date"), age=patients.age_as_of( "index_date", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, }, ), ), covid_hospitalisation=patients.categorised_as( { "COVID-19 positive": "covid_positive AND NOT covid_hospitalised", "COVID-19 hospitalised": "covid_hospitalised", "General population": "DEFAULT", }, return_expectations={
# Configure the expectations framework default_expectations={ "date": { "earliest": "1970-01-01", "latest": latest_date }, "rate": "uniform", "incidence": 0.2, }, # This line defines the study population population=patients.satisfying(""" registered = 1 AND (covid_vacc_date OR (age >=70 AND age <= 110) OR (care_home_type)) AND NOT has_died """), has_follow_up=patients.registered_with_one_practice_between( start_date="2019-12-01", end_date=campaign_start, return_expectations={"incidence": 0.90}, ), registered=patients.registered_as_of( campaign_start, # day before vaccination campaign starts - discuss with team if this should be "today" return_expectations={"incidence": 0.98}, ), has_died=patients.died_from_any_cause(
"date": {"earliest": "1970-01-01", "latest": "today"}, "rate": "uniform", "incidence": 0.2, }, ## STUDY POPULATION (required) population=patients.satisfying( """ copd AND (age >=35 AND age <= 110) AND ever_smoked AND has_follow_up AND NOT recent_asthma AND NOT other_respiratory AND NOT nebules AND NOT ltra_single """, has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29" ), recent_asthma=patients.with_these_clinical_events( asthma_codes, between=["2017-02-28", "2020-02-29"], ), #### NEBULES nebules=patients.with_these_medications( nebulised_med_codes, between=["2019-02-28", "2020-02-29"], ), ), ## OUTCOMES (at least one outcome or covariate is required) icu_date_admitted=patients.admitted_to_icu( on_or_after="2020-03-01", include_day=True, returning="date_admitted",
default_expectations={ "date": {"earliest": "2020-01-01", "latest": "today"}, "rate": "universal", }, # define the study index date index_date = index_date, # This line defines the study population population = patients.satisfying("(riskgroup) AND (NOT died) AND (registered)", died = patients.died_from_any_cause( on_or_before=index_date, returning="binary_flag" ), registered = patients.registered_as_of(index_date), riskgroup = patients.with_these_clinical_events(all_riskgroup_codes, between = ["index_date", "index_date + 1 month"], returning="binary_flag", return_expectations= { "incidence": 0.6 },) ), age=patients.age_as_of(index_date, return_expectations={ "rate" : "universal", "int" : {"distribution" : "population_ages"} }), age_group = patients.categorised_as({ "0": "DEFAULT", "16 - under 40": """ age >= 16 AND age < 40""",
"date": {"earliest": "1970-01-01", "latest": "today"}, "rate": "uniform", "incidence": 0.2, }, # set an index date (as starting point) index_date="2020-02-01", # This line defines the study population that the below varaibles will be defined for # currently registered patients restricts to those alive # the age restriction is applied as current TPP linkage only includes linkages to old age care population=patients.satisfying( """ (age >= 65 AND age < 120) AND is_registered_with_tpp """, is_registered_with_tpp=patients.registered_as_of( "index_date" ), ), # TPP ADDRESS LINKAGE # tpp defined care home as of date tpp_care_home_type=patients.care_home_status_as_of( "index_date", categorised_as={ "PC": """ IsPotentialCareHome AND LocationDoesNotRequireNursing='Y' AND LocationRequiresNursing='N' """,
study = StudyDefinition( # Configure the expectations framework default_expectations={ "date": { "earliest": "1900-01-01", "latest": "today" }, "rate": "uniform", "incidence": 0.1, }, # This line defines the study population population=patients.satisfying( """ has_follow_up AND (age >=18 AND age <= 110) AND (sex = "M" OR sex = "F") """, has_follow_up=patients.registered_with_one_practice_between( "2019-02-28", "2020-02-29")), # Outcomes icu_date_admitted=patients.admitted_to_icu( on_or_after="2020-03-01", include_day=True, returning="date_admitted", find_first_match_in_period=True, return_expectations={ "date": { "earliest": "2020-03-01" }, "incidence": 0.1 },
returning="binary_flag", on_or_after=from_date, match_only_underlying_cause=True, return_expectations={"date": { "earliest": "2020-03-01" }}, ), died_ons=patients.died_from_any_cause( returning="binary_flag", on_or_after=from_date, return_expectations={"date": { "earliest": from_date }}, ), died_ons_noncovid=patients.satisfying( """(NOT died_ons_covid) AND died_ons""", return_expectations={"incidence": 0.15}, ), death_category=patients.categorised_as( { "alive": "NOT died_ons", "covid-death": "died_ons_covid", "non-covid-death": "died_ons_noncovid", "unknown": "DEFAULT", }, return_expectations={ "category": { "ratios": { "alive": 0.8, "covid-death": 0.1, "non-covid-death": 0.1 }
"date": { "earliest": index_date, "latest": end_date }, "rate": "uniform", }, index_date=index_date, # This line defines the study population population=patients.satisfying( """ (sex = 'F' OR sex = 'M') AND (age >= 18 AND age < 120) AND (NOT died) AND (registered) """, registered=patients.registered_as_of(index_date), died=patients.died_from_any_cause( on_or_before=index_date, returning="binary_flag", ), ), age=patients.age_as_of( index_date, return_expectations={ "int": { "distribution": "population_ages" }, "incidence": 1 }, ),
"13": 0.1, "21": 0.1, "22": 0.1, "23": 0.1, "24": 0.1, "25": 0.1, "2A": 0.1, "2B": 0.1 } } }, ), cvd_elective=patients.satisfying( """ cvd_admission_method = "11" OR cvd_admission_method = "12" OR cvd_admission_method = "13" """, return_expectations={"incidence": 0.05}, ), cvd_emergency=patients.satisfying( """ cvd_admission_method = "21" OR cvd_admission_method = "22" OR cvd_admission_method = "23" OR cvd_admission_method = "24" OR cvd_admission_method = "25" OR cvd_admission_method = "2A" OR cvd_admission_method = "2B" OR cvd_admission_method = "2C" OR cvd_admission_method = "2D" OR cvd_admission_method = "28"
"earliest": "2020-01-01", "latest": "today" }, "rate": "universal", }, # define the study index date index_date=index_date, # This line defines the study population population=patients.satisfying( "(NOT died) AND (registered) AND (pregnant) AND age >= 16", died=patients.died_from_any_cause(on_or_before=index_date, returning="binary_flag"), registered=patients.registered_as_of(index_date), pregnant=patients.with_these_clinical_events( pregnant_code, between=["index_date", "index_date + 1 month"], returning="binary_flag", return_expectations={"incidence": 0.6}, ), ), age=patients.age_as_of(index_date, return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }), clinical_riskgroup=patients.with_these_clinical_events( clinical_riskgroup_codes, between=["index_date", "index_date + 1 month"],
def _aggregate_of_default_expectations(incidence):
    """Return the default expectations shared by the aggregate tests."""
    return {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": incidence,
    }


def _aggregate_of_event_date():
    """Return a fresh date-returning clinical-event column definition."""
    return patients.with_these_clinical_events(
        codelist(["X"], system="ctv3"),
        returning="date",
        date_format="YYYY-MM-DD",
    )


def _aggregate_of_event_count():
    """Return a fresh match-count clinical-event column definition."""
    return patients.with_these_clinical_events(
        codelist(["X"], system="ctv3"),
        returning="number_of_matches_in_period",
        return_expectations={
            "int": {"distribution": "normal", "mean": 25, "stddev": 5},
            "incidence": 0.5,
        },
    )


def _assert_aggregates_match_named_sources(result):
    """Check every row's min/max columns against date_1/date_2, int_1/int_2.

    Only ``str`` values count as dates and only ``int`` values count as
    counts; when no usable source value remains, NaN is the expected
    aggregate.
    """
    nan = float("nan")
    for _, row in result.iterrows():
        print(row)

        dates = [
            d for d in (row["date_1"], row["date_2"]) if isinstance(d, str)
        ]
        assert_nan_equal(row["date_min"], min(dates) if dates else nan)
        assert_nan_equal(row["date_max"], max(dates) if dates else nan)

        ints = [i for i in (row["int_1"], row["int_2"]) if isinstance(i, int)]
        assert_nan_equal(row["int_min"], min(ints) if ints else nan)
        assert_nan_equal(row["int_max"], max(ints) if ints else nan)


def test_make_df_from_expectations_with_aggregate_of():
    """Exercise ``minimum_of`` / ``maximum_of`` in generated dummy data.

    Three study shapes are covered: aggregates over columns defined in their
    own right, aggregates over columns defined only inline, and aggregates
    mixing both.
    """
    population_size = 10000

    # aggregate of variables defined in their own right
    study = StudyDefinition(
        default_expectations=_aggregate_of_default_expectations(0.2),
        population=patients.all(),
        date_1=_aggregate_of_event_date(),
        date_2=_aggregate_of_event_date(),
        date_min=patients.minimum_of(
            "date_1",
            "date_2",
        ),
        date_max=patients.maximum_of(
            "date_1",
            "date_2",
        ),
        date_min_fixed=patients.minimum_of(
            "date_1",
            "1980-10-20",
        ),
        int_1=_aggregate_of_event_count(),
        int_2=_aggregate_of_event_count(),
        int_min=patients.minimum_of("int_1", "int_2"),
        int_max=patients.maximum_of("int_1", "int_2"),
    )
    result = study.make_df_from_expectations(population_size)
    _assert_aggregates_match_named_sources(result)

    # aggregate of variables defined only within aggregate function
    study = StudyDefinition(
        # Incidence of 1 so that every aggregate is expected to be populated.
        default_expectations=_aggregate_of_default_expectations(1),
        # We use an expression here (never mind that it's a trivial and
        # pointless one) as that triggers a bug which we want to ensure we've
        # fixed
        population=patients.satisfying(
            "foo OR bar", foo=patients.all(), bar=patients.all()
        ),
        # ``date_min`` is a minimum, mirroring ``int_min`` below (it was
        # previously built with ``maximum_of``).
        date_min=patients.minimum_of(
            date_1=_aggregate_of_event_date(),
            date_2=_aggregate_of_event_date(),
        ),
        date_max=patients.maximum_of(
            date_3=_aggregate_of_event_date(),
            date_4=_aggregate_of_event_date(),
        ),
        int_min=patients.minimum_of(
            int_1=_aggregate_of_event_count(),
            int_2=_aggregate_of_event_count(),
        ),
        int_max=patients.maximum_of(
            int_3=_aggregate_of_event_count(),
            int_4=_aggregate_of_event_count(),
        ),
    )
    result = study.make_df_from_expectations(population_size)
    for _, row in result.iterrows():
        print(row)
        assert pd.notna(row["date_min"])
        assert pd.notna(row["date_max"])
        assert pd.notna(row["int_min"])
        assert pd.notna(row["int_max"])

    # aggregate of variables defined both inside and outside aggregation
    # NOTE(review): the expected values below are computed from the named
    # columns only (date_1/date_2, int_1/int_2), not the inline *_3 / *_4
    # sources - confirm that matches how inline sources are generated.
    study = StudyDefinition(
        default_expectations=_aggregate_of_default_expectations(0.2),
        population=patients.all(),
        date_1=_aggregate_of_event_date(),
        date_2=_aggregate_of_event_date(),
        date_min=patients.minimum_of(
            "date_1",
            "date_2",
            date_3=_aggregate_of_event_date(),
        ),
        date_max=patients.maximum_of(
            "date_1",
            "date_2",
            date_4=_aggregate_of_event_date(),
        ),
        int_1=_aggregate_of_event_count(),
        int_2=_aggregate_of_event_count(),
        int_min=patients.minimum_of(
            "int_1",
            "int_2",
            int_3=_aggregate_of_event_count(),
        ),
        int_max=patients.maximum_of(
            "int_1",
            "int_2",
            int_4=_aggregate_of_event_count(),
        ),
    )
    result = study.make_df_from_expectations(population_size)
    _assert_aggregates_match_named_sources(result)