def test_script():
    """Write a ten-row dummy CSV and print whether DB driver modules are loaded.

    The imports are deliberately local to the function body. After generating
    dummy data, the function prints "yes"/"no" for each of ``pyodbc`` and
    ``ctds`` depending on whether the module appears in ``sys.modules``.

    NOTE(review): "TMP_PATH" looks like a placeholder the caller substitutes
    with a real directory before running this code -- confirm against caller.
    """
    import sys

    from cohortextractor import StudyDefinition, patients

    sex_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.49, "F": 0.51}},
    }
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
    )
    study.to_file("TMP_PATH/dummy.csv", expectations_population=10)

    # Report, for each driver, whether generating dummy data pulled it in.
    pyodbc, ctds = (
        "yes" if driver in sys.modules else "no" for driver in ("pyodbc", "ctds")
    )
    print(f"pyodbc: {pyodbc}, ctds: {ctds}")
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Expect ``to_file`` to raise RuntimeError when DATABASE_URL is unset.

    No ``expectations_population`` or ``dummy_data_file`` is supplied, so the
    call is a plain export request.
    """
    # raising=False: it is fine if the variable was never set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)

    covariates = {
        "population": patients.all(),
        "sex": patients.sex(),
        "age": patients.age_as_of("2020-01-01"),
    }
    study = StudyDefinition(**covariates)

    output_path = tmp_path / "dummy_data.csv"
    with pytest.raises(RuntimeError):
        study.to_file(output_path)
def test_create_dummy_data_works_without_database_url(tmp_path, monkeypatch):
    """Check that dummy data can be generated when DATABASE_URL is unset.

    A study with ``sex`` and ``age`` columns (both carrying return
    expectations) is written to CSV with ``expectations_population=10``; the
    file must then contain ten rows and include both columns.
    """
    # raising=False: it is fine if the variable was never set to begin with.
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
                "int": {"distribution": "population_ages"},
            },
        ),
    )
    filename = tmp_path / "dummy_data.csv"
    study.to_file(filename, expectations_population=10)

    # The csv module documentation asks for files to be opened with
    # newline="" so the reader handles line endings itself.
    with open(filename, newline="") as f:
        results = list(csv.DictReader(f))

    assert len(results) == 10
    columns = results[0].keys()
    assert "sex" in columns
    assert "age" in columns
def test_booleans_correctly_handled_in_dummy_data(tmp_path, file_format):
    """Check that a binary-flag column yields both values in every file format.

    A study with a single ``has_event`` binary flag (incidence 0.5) is written
    with ``expectations_population=100`` and read back with the pandas reader
    for ``file_format``. The values expected per format are:

    * csv / csv.gz (read with ``dtype=str``): ``"0"`` and ``"1"``
    * feather: ``True`` and ``False``
    * dta / dta.gz: ``0`` and ``1``

    Each of the two values must occur more than 10 times.

    Raises:
        AssertionError: if ``file_format`` is not one of the formats above, or
            if either value is under-represented.
    """
    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={"rate": "uniform", "incidence": 0.5},
        ),
    )
    filename = tmp_path / f"dummy-data.{file_format}"
    study.to_file(filename, expectations_population=100)

    if file_format in ("csv", "csv.gz"):
        df = pandas.read_csv(filename, dtype=str)
        bools = ("0", "1")
    elif file_format == "feather":
        df = pandas.read_feather(filename)
        bools = (True, False)
    elif file_format in ("dta", "dta.gz"):
        df = pandas.read_stata(filename)
        bools = (0, 1)
    else:
        # Raise explicitly rather than `assert False`: assert statements are
        # removed under `python -O`, which would let an unknown format fall
        # through to a confusing NameError below.
        raise AssertionError(f"Unhandled format: {file_format}")

    # Check we've got at least some of each value
    counts = df.has_event.value_counts()
    assert counts[bools[0]] > 10
    assert counts[bools[1]] > 10
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Generate dummy data from expectations and validate the resulting file.

    The study covers a categorical column (``sex``), an integer column
    (``age``), a binary flag, dates at day/month/year precision, and a
    category column with 0.5 incidence. The 100-row output written in
    ``file_format`` is passed to ``validate_dummy_data`` together with the
    study's covariate definitions.
    """
    cl = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")

    def half_incidence():
        # A fresh dict per call, so no two columns share one expectations object.
        return {"rate": "uniform", "incidence": 0.5}

    def event_date(date_format):
        return patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format=date_format,
            return_expectations=half_incidence(),
        )

    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=event_date("YYYY-MM-DD"),
        event_date_month=event_date("YYYY-MM"),
        event_date_year=event_date("YYYY"),
        incomplete_categories=patients.with_these_clinical_events(
            cl,
            returning="category",
            return_expectations={
                "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
                # Half the values here should be null
                "incidence": 0.5,
            },
        ),
    )

    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=100)

    # validate_dummy_data is reused here as the check that whatever the
    # expectations framework generated is valid for this study.
    validate_dummy_data(study.covariate_definitions, dummy_data_file)
def test_to_file_with_dummy_data_file_incorrect_extension(tmp_path):
    """Expect DummyDataValidationError for a mismatched dummy data file name.

    The requested output is ``output.csv.gz`` while the supplied dummy data
    path is ``dummy-data-csv``, which has no file extension at all.
    """
    study = StudyDefinition(population=patients.all())
    output_path = tmp_path / "output.csv.gz"
    badly_named_dummy = tmp_path / "dummy-data-csv"

    with pytest.raises(DummyDataValidationError):
        study.to_file(output_path, dummy_data_file=badly_named_dummy)
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Check that ``to_file`` with ``dummy_data_file`` reproduces that file.

    Dummy data is first generated from expectations (10 rows) in
    ``file_format``. It is then fed back through ``to_file`` as the
    ``dummy_data_file``, and the second output must match the first
    byte-for-byte.
    """
    cl = codelist(["12345"], system="snomed")

    def clinical_event(**options):
        # Each call builds its own expectations dict, as in a hand-written study.
        return patients.with_these_clinical_events(
            cl,
            return_expectations={"rate": "uniform", "incidence": 0.5},
            **options,
        )

    study = StudyDefinition(
        default_expectations={
            "date": {"earliest": "2020-01-01", "latest": "today"},
        },
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=clinical_event(returning="binary_flag"),
        event_date_day=clinical_event(returning="date", date_format="YYYY-MM-DD"),
        event_date_month=clinical_event(returning="date", date_format="YYYY-MM"),
        event_date_year=clinical_event(returning="date", date_format="YYYY"),
    )

    # Step 1: produce a dummy data file from the expectations.
    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=10)

    # Step 2: ask for output again, this time sourced from that file.
    output_file = tmp_path / f"output.{file_format}"
    study.to_file(output_file, dummy_data_file=dummy_data_file)

    # Step 3: the two files must have identical raw contents.
    source_bytes = dummy_data_file.read_bytes()
    output_bytes = output_file.read_bytes()
    assert source_bytes == output_bytes