def test_column_name_clashes_produce_errors():
    """Defining the same column name twice with different arguments must fail.

    The study declares a top-level ``age`` column and, inside the ``status``
    expression, a second ``age`` computed as of a different date.  Building
    the ``StudyDefinition`` is expected to raise ``ValueError``.
    """
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            age=patients.age_as_of("2020-01-01"),
            status=patients.satisfying(
                "age > 70 AND sex = 'M'",
                sex=patients.sex(),
                # Same name as the top-level ``age`` column, different date.
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_make_df_from_expectations_with_using_dates_as_categories():
    """Category labels that look like dates are generated as given.

    ``eligible_date`` is a categorised column whose labels are ISO date
    strings.  After generating 100 rows from the expectations, the set of
    values in that column must be exactly the three labels.
    """
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "rate": "exponential_increase",
            "incidence": 0.2,
        },
        population=patients.all(),
        eligible_date=patients.categorised_as(
            {
                "2020-04-14": "age >= 80",
                "2020-06-16": "age >= 70 AND age < 80",
                "2020-08-18": "DEFAULT",
            },
            age=patients.age_as_of("2020-01-01"),
            return_expectations={
                "category": {
                    "ratios": {
                        "2020-04-14": 0.25,
                        "2020-06-16": 0.25,
                        "2020-08-18": 0.5,
                    }
                },
                "incidence": 1,
            },
        ),
    )

    dummy_df = study.make_df_from_expectations(100)

    assert set(dummy_df.eligible_date) == {
        "2020-04-14",
        "2020-06-16",
        "2020-08-18",
    }
def test_unrecognised_database_url_raises_error(monkeypatch):
    """A ``DATABASE_URL`` with an unknown scheme makes construction fail.

    The environment variable is set to ``unknown-db://localhost`` and building
    an otherwise ordinary ``StudyDefinition`` is expected to raise
    ``ValueError``.
    """
    monkeypatch.setenv("DATABASE_URL", "unknown-db://localhost")

    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
def test_errors_are_triggered_without_database_url(monkeypatch):
    """Invalid definitions are still rejected when ``DATABASE_URL`` is unset.

    The population expression refers to two names that are never defined as
    columns; constructing the study is expected to raise ``KeyError`` even
    though no database is configured.
    """
    # ``raising=False``: do not fail if the variable was not set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)

    with pytest.raises(KeyError):
        StudyDefinition(
            population=patients.satisfying("no_such_column AND missing_column"),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Exporting real data with no ``DATABASE_URL`` raises ``RuntimeError``.

    The study itself is built successfully without a database; only the
    ``to_file`` call (made without any dummy-data options) is expected to
    fail.
    """
    # ``raising=False``: do not fail if the variable was not set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)

    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
        age=patients.age_as_of("2020-01-01"),
    )

    with pytest.raises(RuntimeError):
        study.to_file(tmp_path / "dummy_data.csv")
def test_syntax_errors_in_expressions_are_raised():
    """A malformed ``satisfying`` expression is reported as ``ValueError``.

    The expression below contains a doubled ``AND``; constructing the study
    is expected to raise.
    """
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            status=patients.satisfying(
                # Deliberately invalid: note the repeated "AND".
                "age > 70 AND AND sex = 'M'",
                sex=patients.sex(),
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_age_dtype_generation():
    """Check the pandas CSV arguments generated for a study with only ``age``.

    ``study.pandas_csv_args`` is passed through ``_converters_to_names`` so it
    can be compared against a plain dict: ``age`` must be typed ``Int64`` and
    there must be no date columns or converters.
    """
    study = StudyDefinition(
        # This line defines the study population
        population=patients.all(),
        age=patients.age_as_of("2020-02-01"),
    )

    expected_csv_args = {
        "dtype": {"age": "Int64"},
        "parse_dates": [],
        "date_col_for": {},
        "converters": {},
    }

    assert _converters_to_names(study.pandas_csv_args) == expected_csv_args
def test_create_dummy_data_works_without_database_url(tmp_path, monkeypatch):
    """Dummy data can be generated from expectations with no database set.

    With ``DATABASE_URL`` removed, ``to_file`` is asked for an expectations
    population of 10.  The resulting CSV must contain 10 rows and include the
    ``sex`` and ``age`` columns.
    """
    # ``raising=False``: do not fail if the variable was not set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)

    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
                "int": {"distribution": "population_ages"},
            },
        ),
    )

    csv_path = tmp_path / "dummy_data.csv"
    study.to_file(csv_path, expectations_population=10)

    with open(csv_path) as handle:
        rows = list(csv.DictReader(handle))

    assert len(rows) == 10
    header = rows[0].keys()
    assert "sex" in header
    assert "age" in header
(sex = 'F' OR sex = 'M') AND (age >= 18 AND age < 120) AND (NOT died) AND (registered) """, registered=patients.registered_as_of(index_date), died=patients.died_from_any_cause( on_or_before=index_date, returning="binary_flag", ), ), age=patients.age_as_of( index_date, return_expectations={ "int": { "distribution": "population_ages" }, "incidence": 1 }, ), sex=patients.sex(return_expectations={ "category": { "ratios": { "M": 0.49, "F": 0.51 } }, "incidence": 1 }), date_death=patients.died_from_any_cause( between=[index_date, end_date],
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Data generated from expectations passes dummy-data validation.

    A study covering binary, date (day/month/year precision) and categorical
    columns is written to ``dummy-data.<file_format>`` with an expectations
    population of 100, and the file is then checked with
    ``validate_dummy_data`` against the study's covariate definitions.
    """
    event_codes = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")

    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            event_codes,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_day=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_month=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY-MM",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_year=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        incomplete_categories=patients.with_these_clinical_events(
            event_codes,
            returning="category",
            return_expectations={
                "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
                # Half the values here should be null
                "incidence": 0.5,
            },
        ),
    )

    generated_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(generated_path, expectations_population=100)

    # We reuse validate_dummy_data to check that the data generated by the
    # expectations framework is valid.
    validate_dummy_data(study.covariate_definitions, generated_path)
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Exporting from a dummy-data file reproduces that file byte for byte.

    Dummy data is first generated from expectations, then passed back to
    ``to_file`` via ``dummy_data_file``.  The two files on disk must have
    identical contents.
    """
    event_codes = codelist(["12345"], system="snomed")

    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            event_codes,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_day=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_month=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY-MM",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
        event_date_year=patients.with_these_clinical_events(
            event_codes,
            returning="date",
            date_format="YYYY",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
    )

    # Generate dummy data using the expectations framework
    source_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(source_path, expectations_population=10)

    # Use this dummy data
    exported_path = tmp_path / f"output.{file_format}"
    study.to_file(exported_path, dummy_data_file=source_path)

    # Check results: compare raw bytes so the check works for any file format.
    with open(source_path, "rb") as source, open(exported_path, "rb") as exported:
        source_bytes = source.read()
        exported_bytes = exported.read()

    assert source_bytes == exported_bytes
"40-44": 0.05, "45-49": 0.1, "50-54": 0.05, "55-59": 0.05, "60-64": 0.05, "65-69": 0.05, "70-74": 0.05, "75-79": 0.05, "80-84": 0.05, "85-89": 0.05, "90plus": 0.03, "missing": 0.02, } }, }, age=patients.age_as_of("index_date", ), ), # patients admitted to hospital with primary diagnoses included in cvd codelist # filters out maternity-related admissions and transfers from other providers cvd_emergency_elective=patients.admitted_to_hospital( with_these_primary_diagnoses=cvd_codelist, with_admission_method=[ "11", "12", "13", "21", "22", "23", "24", "25", "2A", "2B", "2C", "2D", "28" ], between=["index_date", "index_date + 6 days"], return_expectations={"incidence": 0.1}, ), cvd_admission_method=patients.admitted_to_hospital( with_these_primary_diagnoses=cvd_codelist,
"rate": "uniform", "incidence": 0.5, }, population=patients.registered_with_one_practice_between( "2019-02-01", "2020-02-01"), # Set index date to start date index_date="2020-02-01", ## DEMOGRAPHIC INFORMATION ### Age age=patients.age_as_of( "2020-03-31", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, "incidence": 0.001 }, ), ### Sex sex=patients.sex(return_expectations={ "rate": "universal", "category": { "ratios": { "M": 0.49, "F": 0.51 } }, }),
}, return_expectations={ "rate": "universal", "category": { "ratios": { 1: 0.15, 0: 0.85, }, }, }, ), age=patients.age_as_of( "2021-03-31", # PHE defined date for calulating eligibilty across all vaccination campaigns return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, }, ), ageband=patients.categorised_as( { "0": "DEFAULT", # consider doing an under 16 age band as well to differentiate between workers and children eligble for another reason "0-19": """ age >= 0 AND age < 20""", "20-29": """ age >= 20 AND age < 30""", "30-39": """ age >= 30 AND age < 40""", "40-49": """ age >= 40 AND age < 50""", "50-59": """ age >= 50 AND age < 60""", "60-69": """ age >= 60 AND age < 70""", "70-79": """ age >= 70 AND age < 80""",
date_format="YYYY-MM-DD", return_expectations={ "date": { "earliest": "2020-11-16" }, "incidence": 0.2 }, ), ### DEMOGRAPHIC COVARIATES # AGE age=patients.age_as_of( "sgss_pos_inrange", return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" }, }, ), # SEX sex=patients.sex(return_expectations={ "rate": "universal", "category": { "ratios": { "M": 0.49, "F": 0.51 } }, }),
from cohortextractor import (
    StudyDefinition,
    patients,
    codelist,
    codelist_from_csv,
)

# Study definition with a single covariate (``age``) on top of the population.
study = StudyDefinition(
    # Expectations applied to any column that does not override them.
    default_expectations={
        "date": {
            "earliest": "1900-01-01",
            "latest": "today",
        },
        "rate": "uniform",
        "incidence": 0.5,
    },
    # Population selected via a registration window of 2019-02-01 to 2020-02-01.
    population=patients.registered_with_one_practice_between(
        "2019-02-01",
        "2020-02-01",
    ),
    # Age as of 2019-09-01; dummy values follow the "population_ages"
    # distribution.
    age=patients.age_as_of(
        "2019-09-01",
        return_expectations={
            "rate": "universal",
            "int": {
                "distribution": "population_ages",
            },
        },
    ),
)
population=patients.satisfying( "(NOT died) AND (registered) AND (pregnant) AND age >= 16", died=patients.died_from_any_cause(on_or_before=index_date, returning="binary_flag"), registered=patients.registered_as_of(index_date), pregnant=patients.with_these_clinical_events( pregnant_code, between=["index_date", "index_date + 1 month"], returning="binary_flag", return_expectations={"incidence": 0.6}, ), ), age=patients.age_as_of(index_date, return_expectations={ "rate": "universal", "int": { "distribution": "population_ages" } }), clinical_riskgroup=patients.with_these_clinical_events( clinical_riskgroup_codes, between=["index_date", "index_date + 1 month"], returning="binary_flag", #return_expectations= { "incidence": 0.6 },), return_expectations={ "category": { "ratios": { "0": 0.5, "1": 0.5 } },
"rate": "uniform", "incidence": 1 }, index_date=index_date, # This line defines the study population population=patients.satisfying( """ (age >= 18 AND age < 120) AND (NOT died) AND (registered) """, died=patients.died_from_any_cause(on_or_before=index_date, returning="binary_flag"), registered=patients.registered_as_of(index_date), age=patients.age_as_of(index_date), ), ### geographic/administrative groups practice=patients.registered_practice_as_of( index_date, returning="pseudo_id", return_expectations={ "int": { "distribution": "normal", "mean": 100, "stddev": 20 } }, ), stp=patients.registered_practice_as_of(