def test_make_df_from_expectations_with_categories_expression():
    """Check that expectation ratios shape a ``categorised_as`` column.

    The study declares a ``category`` column whose expectations give "A" a
    ratio of 0.3 and "B" a ratio of 0.7.  The frame produced by
    ``make_df_from_expectations`` must therefore contain fewer "A" values
    than "B" values.
    """
    # Category label -> expression deciding membership; "" is the default.
    sex_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }

    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            sex_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )

    frame = study.make_df_from_expectations(10000)
    counts = frame.category.value_counts()

    # "A" was given the smaller ratio, so it must be the rarer label.
    assert counts["A"] < counts["B"]
def test_make_df_from_expectations_with_categories_expression_validation():
    """Check that inconsistent category expectations raise ``ValueError``.

    The category rules define only the labels "A", "B" and "" (default),
    while the expectation ratios additionally mention "C".  Generating a
    frame from these expectations must fail with ``ValueError``.
    """
    # Category label -> expression deciding membership; "" is the default.
    sex_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        # NOTE(review): "C" has no matching rule above; presumably this
        # mismatch is what triggers the error -- confirm against the library.
        "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }

    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            sex_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )

    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_make_df_from_expectations_with_using_dates_as_categories():
    """Check that date-like strings can be used as category labels.

    ``eligible_date`` is a ``categorised_as`` column whose three labels are
    ISO-formatted date strings.  After generating 100 rows from the
    expectations, the distinct values in that column must be exactly those
    three strings.
    """
    # Date label -> expression deciding membership.
    eligibility_rules = {
        "2020-04-14": "age >= 80",
        "2020-06-16": "age >= 70 AND age < 80",
        "2020-08-18": "DEFAULT",
    }
    eligibility_expectations = {
        "category": {
            "ratios": {
                "2020-04-14": 0.25,
                "2020-06-16": 0.25,
                "2020-08-18": 0.5,
            }
        },
        "incidence": 1,
    }

    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "rate": "exponential_increase",
            "incidence": 0.2,
        },
        population=patients.all(),
        eligible_date=patients.categorised_as(
            eligibility_rules,
            age=patients.age_as_of("2020-01-01"),
            return_expectations=eligibility_expectations,
        ),
    )

    frame = study.make_df_from_expectations(100)

    # Every label should appear, and nothing other than the labels.
    assert set(frame.eligible_date) == {"2020-08-18", "2020-06-16", "2020-04-14"}
smoking_status=patients.categorised_as( { "S": "most_recent_smoking_code = 'S'", "E": """ most_recent_smoking_code = 'E' OR ( most_recent_smoking_code = 'N' AND ever_smoked ) """, "N": "most_recent_smoking_code = 'N' AND NOT ever_smoked", "M": "DEFAULT", }, return_expectations={ "category": { "ratios": { "S": 0.6, "E": 0.1, "N": 0.2, "M": 0.1 } } }, most_recent_smoking_code=patients.with_these_clinical_events( clear_smoking_codes, find_last_match_in_period=True, on_or_before="2019-02-01", returning="category", ), ever_smoked=patients.with_these_clinical_events( filter_codes_by_category(clear_smoking_codes, include=["S", "E"]), on_or_before="2019-02-01", ), ),
age_band=patients.categorised_as( { "0-4": "age >= 0 AND age < 5", "5-9": "age >= 5 AND age < 10", "10-14": "age >= 10 AND age < 15", "15-19": "age >= 15 AND age < 20", "20-24": "age >= 20 AND age < 25", "25-29": "age >= 25 AND age < 30", "30-34": "age >= 30 AND age < 35", "35-39": "age >= 35 AND age < 40", "40-44": "age >= 40 AND age < 45", "45-49": "age >= 45 AND age < 50", "50-54": "age >= 50 AND age < 55", "55-59": "age >= 55 AND age < 60", "60-64": "age >= 60 AND age < 65", "65-69": "age >= 65 AND age < 70", "70-74": "age >= 70 AND age < 75", "75-79": "age >= 75 AND age < 80", "80-84": "age >= 80 AND age < 85", "85-89": "age >= 85 AND age < 90", "90plus": "age >= 90", "missing": "DEFAULT", }, return_expectations={ "rate": "universal", "category": { "ratios": { "0-4": 0.05, "5-9": 0.05, "10-14": 0.05, "15-19": 0.05, "20-24": 0.05, "25-29": 0.05, "30-34": 0.05, "35-39": 0.05, "40-44": 0.05, "45-49": 0.1, "50-54": 0.05, "55-59": 0.05, "60-64": 0.05, "65-69": 0.05, "70-74": 0.05, "75-79": 0.05, "80-84": 0.05, "85-89": 0.05, "90plus": 0.05, "missing": 0, } }, }),
age_group=patients.categorised_as( { "0": "DEFAULT", "0 - under 16": """ age < 16""", "16 - under 40": """ age >= 16 AND age < 40""", "40 - under 50": """ age >= 40 AND age < 50""", "50 - under 55": """ age >= 50 AND age < 55""", "55 - under 60": """ age >= 55 AND age < 60""", "60 - under 65": """ age >= 60 AND age < 65""", "65 - under 70": """ age >= 65 AND age < 70""", "70 - under 75": """ age >= 70 AND age < 75""", "75 - under 80": """ age >= 75 AND age < 80""", "80 - under 85": """ age >= 80 AND age < 85""", "85 plus": """ age >= 85""", }, return_expectations={ "rate": "universal", "category": { "ratios": { "0 - under 16": 0.05, "16 - under 40": 0.1, "40 - under 50": 0.1, "50 - under 55": 0.1, "55 - under 60": 0.1, "60 - under 65": 0.1, "65 - under 70": 0.1, "70 - under 75": 0.1, "75 - under 80": 0.1, "80 - under 85": 0.1, "85 plus": 0.05, } }, }, ),
imd=patients.categorised_as( { "0": "DEFAULT", "1": """index_of_multiple_deprivation >=1 AND index_of_multiple_deprivation < 32844*1/5""", "2": """index_of_multiple_deprivation >= 32844*1/5 AND index_of_multiple_deprivation < 32844*2/5""", "3": """index_of_multiple_deprivation >= 32844*2/5 AND index_of_multiple_deprivation < 32844*3/5""", "4": """index_of_multiple_deprivation >= 32844*3/5 AND index_of_multiple_deprivation < 32844*4/5""", "5": """index_of_multiple_deprivation >= 32844*4/5 """, }, index_of_multiple_deprivation=patients.address_as_of( "index_date", returning="index_of_multiple_deprivation", round_to_nearest=100, ), return_expectations={ "rate": "universal", "category": { "ratios": { "0": 0.01, "1": 0.20, "2": 0.20, "3": 0.20, "4": 0.20, "5": 0.19, } }, }),
}, ), death_category=patients.categorised_as( { "covid-death": "died_covid", "non-covid-death": "(NOT died_covid) AND died_any", "alive": "DEFAULT" }, died_covid=patients.with_these_codes_on_death_certificate( codes_ICD10_covid, returning="binary_flag", match_only_underlying_cause=False, between=[index_date, end_date], ), died_any=patients.died_from_any_cause( between=[index_date, end_date], returning="binary_flag", ), return_expectations={ "category": { "ratios": { "alive": 0.8, "covid-death": 0.1, "non-covid-death": 0.1 } }, "incidence": 1 }, ), )
"PS": 0.05, "": 0.85, }, }, }, ), # simple care home flag care_home=patients.categorised_as( { 1: """care_home_type""", 0: "DEFAULT", }, return_expectations={ "rate": "universal", "category": { "ratios": { 1: 0.15, 0: 0.85, }, }, }, ), age=patients.age_as_of( "2021-03-31", # PHE defined date for calulating eligibilty across all vaccination campaigns return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, },
covid_hospitalisation=patients.categorised_as( { "COVID-19 positive": "covid_positive AND NOT covid_hospitalised", "COVID-19 hospitalised": "covid_hospitalised", "General population": "DEFAULT", }, return_expectations={ "incidence": 1, "category": { "ratios": { "COVID-19 positive": 0.1, "COVID-19 hospitalised": 0.1, "General population": 0.8, } }, }, covid_positive=patients.with_test_result_in_sgss( pathogen="SARS-CoV-2", test_result="positive", between=["2020-01-01", "last_day_of_month(index_date)"], date_format="YYYY-MM-DD", return_expectations={"date": { "earliest": "index_date" }}, ), covid_hospitalised=patients.admitted_to_hospital( with_these_diagnoses=covid_codelist, between=["2020-01-01", "last_day_of_month(index_date)"], return_expectations={"incidence": 0.20}, ), ),
covariate_definitions = study.covariate_definitions fixtures_path = Path(__file__).parent / "fixtures" / "dummy-data" # Create a second test study to which we can add columns without needing to rebuild all # the test fixtures study_2 = StudyDefinition( **column_definitions, category_date=patients.categorised_as( { "2020-10-15": "age > 50", "2021-11-16": "DEFAULT", }, return_expectations={ "category": { "ratios": { "2020-10-15": 0.5, "2021-11-16": 0.5, } }, }, ), ) covariate_definitions_2 = study_2.covariate_definitions @pytest.mark.parametrize("file_format", SUPPORTED_FILE_FORMATS) def test_validate_dummy_data_valid(file_format, tmpdir): rows = zip( ["patient_id", "11", "22"], ["sex", "F", "M"],
return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }), age_group=patients.categorised_as( { "0": "DEFAULT", "16 - under 40": """ age >= 16 AND age < 40""", "40 - under 50": """ age >= 40 AND age < 50""", "50 - under 65": """ age >= 50 AND age < 65""", "65 plus": """ age >= 65""", }, return_expectations={ "rate": "universal", "category": { "ratios": { "16 - under 40": 0.25, "40 - under 50": 0.25, "50 - under 65": 0.25, "65 plus": 0.25, } }, }, ), stp=patients.registered_practice_as_of( "index_date", returning="stp_code", return_expectations={ "category": { "ratios": {
smoking_status=patients.categorised_as( { "S": "most_recent_smoking_code = 'S'", "E": """ most_recent_smoking_code = 'E' OR ( most_recent_smoking_code = 'N' AND ever_smoked ) """, "N": "most_recent_smoking_code = 'N' AND NOT ever_smoked", "M": "DEFAULT", }, return_expectations={ "category": { "ratios": { "S": 0.6, "E": 0.1, "N": 0.2, "M": 0.1 } } }, most_recent_smoking_code=patients.with_these_clinical_events( clear_smoking_codes, find_last_match_in_period=True, on_or_before="sgss_pos_inrange", returning="category", ), ever_smoked=patients.with_these_clinical_events( filter_codes_by_category(clear_smoking_codes, include=["S", "E"]), on_or_before="sgss_pos_inrange", ), ),
smoking_status=patients.categorised_as( { "S": "most_recent_smoking_code = 'S'", "E": """ most_recent_smoking_code = 'E' OR ( most_recent_smoking_code = 'N' AND ever_smoked ) """, "N": "most_recent_smoking_code = 'N' AND NOT ever_smoked", "M": "DEFAULT", }, return_expectations={ "category": { "ratios": { "S": 0.6, "E": 0.1, "N": 0.2, "M": 0.1 } } }, most_recent_smoking_code=patients.with_these_clinical_events( clear_smoking_codes, find_last_match_in_period=True, on_or_before="2020-02-29", returning="category", ), ever_smoked=patients.with_these_clinical_events( filter_codes_by_category(clear_smoking_codes, include=["S", "E"]), on_or_before="2020-02-29", ), ),
# age age=patients.age_as_of( "index_date", return_expectations={ "rate": "universal", "int": {"distribution": "population_ages"}, }, ), # age band ageband_narrow = patients.categorised_as( { "0": "DEFAULT", "65-74": """ age >= 65 AND age < 75""", "75-79": """ age >= 75 AND age < 80""", "80-84": """ age >= 80 AND age < 85""", "85-89": """ age >= 85 AND age < 120""", }, return_expectations={ "rate":"universal", "category": {"ratios": {"65-74": 0.4, "75-79": 0.2, "80-84":0.2, "85-89":0.2 }} }, ), # SELECTED DEMOGRAPHIC CHARACTERISTICS TO DESCRIBE # sex sex=patients.sex( return_expectations={ "rate": "universal", "category": {"ratios": {"M": 0.49, "F": 0.51}}, } ),
age=patients.age_as_of(index_date, return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }), age_group=patients.categorised_as( { "0": "DEFAULT", "16 - under 50": " age >= 16 AND age < 50", "50 - under 65": " age >= 50 AND age < 65", "65 plus": " age >= 65", }, return_expectations={ "rate": "universal", "category": { "ratios": { "16 - under 50": 0.5, "50 - under 65": 0.25, "65 plus": 0.25, } }, }, ), stp=patients.registered_practice_as_of( "index_date", returning="stp_code", return_expectations={ "category": { "ratios": { "STP1": 0.5,
"earliest": from_date }}, ), died_ons_noncovid=patients.satisfying( """(NOT died_ons_covid) AND died_ons""", return_expectations={"incidence": 0.15}, ), death_category=patients.categorised_as( { "alive": "NOT died_ons", "covid-death": "died_ons_covid", "non-covid-death": "died_ons_noncovid", "unknown": "DEFAULT", }, return_expectations={ "category": { "ratios": { "alive": 0.8, "covid-death": 0.1, "non-covid-death": 0.1 } } }, ), date_died_ons=patients.died_from_any_cause( returning="date_of_death", on_or_after=from_date, date_format="YYYY-MM-DD", return_expectations={"date": { "earliest": from_date }},
"rate": "universal", "int": { "distribution": "population_ages" } }), age_group=patients.categorised_as( { "0": "DEFAULT", "16 - under 40": """ age >= 16 AND age < 40""", "40 - under 50": """ age >= 40 AND age < 50""", "50 - under 65": """ age >= 50 AND age < 65""", "65 - under 75": """ age >= 65 AND age < 75""", "75 plus": """ age >= 75""", }, return_expectations={ "rate": "universal", "category": { "ratios": { "16 - under 40": 0.25, "40 - under 50": 0.15, "50 - under 65": 0.10, "65 - under 75": 0.25, "75 plus": 0.25, } }, }, ), sex=patients.sex( return_expectations={ "rate": "universal", "category": { "ratios": {
"4": 0.2, "5": 0.2 } }, "incidence": 0.4, }, ), ethnicity=patients.categorised_as( { "0": "DEFAULT", "1": "eth='1' OR (NOT eth AND ethnicity_sus='1')", "2": "eth='2' OR (NOT eth AND ethnicity_sus='2')", "3": "eth='3' OR (NOT eth AND ethnicity_sus='3')", "4": "eth='4' OR (NOT eth AND ethnicity_sus='4')", "5": "eth='5' OR (NOT eth AND ethnicity_sus='5')", }, return_expectations={ "category": { "ratios": { "1": 0.2, "2": 0.2, "3": 0.2, "4": 0.2, "5": 0.2 } }, "incidence": 0.4, }, ), )