def test_make_df_from_expectations_with_categories_expression_validation():
    """Expectation ratios naming a category absent from the definition must fail.

    The categorised_as definition only produces "A", "B" and "", but the
    expectations declare a ratio for "C", so building the dummy dataframe
    should raise a ValueError.
    """
    bad_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        # "C" is not one of the categories the definition can produce
        "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations=bad_expectations,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_script():
    """Build a trivial study, write dummy output to /dev/null and report
    which database driver modules (pyodbc / ctds) ended up imported.

    Intended to be run as a standalone script; the result is communicated
    via the printed line rather than an assertion.
    """
    import sys

    from cohortextractor import StudyDefinition, patients

    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
    )
    study.to_csv("/dev/null", expectations_population=10)
    # Report whether each driver module was pulled in as a side effect
    loaded = {name: "yes" if name in sys.modules else "no" for name in ("pyodbc", "ctds")}
    print(f"pyodbc: {loaded['pyodbc']}, ctds: {loaded['ctds']}")
def test_make_df_from_expectations_with_categories_expression():
    """Dummy data for a categorised_as column should follow the expectation ratios.

    With "A" weighted at 0.3 and "B" at 0.7 over 10,000 patients, the
    generated counts should come out with strictly fewer "A"s than "B"s.
    """
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"},
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )
    df = study.make_df_from_expectations(10000)
    counts = df.category.value_counts()
    assert counts["A"] < counts["B"]
def test_unrecognised_database_url_raises_error(monkeypatch):
    """An unsupported DATABASE_URL scheme should be rejected at definition time."""
    monkeypatch.setenv("DATABASE_URL", "unknown-db://localhost")
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
def test_errors_are_triggered_without_database_url(monkeypatch):
    """Referencing undefined columns must still fail when no DATABASE_URL is set."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    with pytest.raises(KeyError):
        StudyDefinition(
            # Neither column referenced here is defined anywhere
            population=patients.satisfying("no_such_column AND missing_column"),
            sex=patients.sex(),
            age=patients.age_as_of("2020-01-01"),
        )
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Extracting real data needs a DATABASE_URL; without one, to_file fails."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
        age=patients.age_as_of("2020-01-01"),
    )
    with pytest.raises(RuntimeError):
        study.to_file(tmp_path / "dummy_data.csv")
def test_sex_dtype_generation():
    """The sex column should map to a pandas categorical dtype with no converters."""
    study = StudyDefinition(population=patients.all(), sex=patients.sex())
    expected = {
        "dtype": {"sex": "category"},
        "converters": {},
        "date_col_for": {},
        "parse_dates": [],
    }
    assert _converters_to_names(study.pandas_csv_args) == expected
def test_syntax_errors_in_expressions_are_raised():
    """A malformed satisfying() expression ("AND AND") should raise ValueError."""
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            status=patients.satisfying(
                # Deliberate double AND — not a valid expression
                "age > 70 AND AND sex = 'M'",
                sex=patients.sex(),
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_column_name_clashes_produce_errors():
    """Defining `age` both top-level and inside satisfying() must be rejected."""
    with pytest.raises(ValueError):
        StudyDefinition(
            population=patients.all(),
            age=patients.age_as_of("2020-01-01"),
            status=patients.satisfying(
                "age > 70 AND sex = 'M'",
                sex=patients.sex(),
                # Clashes with the top-level `age` above
                age=patients.age_as_of("2010-01-01"),
            ),
        )
def test_make_df_from_expectations_doesnt_alter_defaults():
    """Per-variable expectation overrides must not leak back into the defaults.

    `sex_altered` overrides incidence down to 0.1; `sex_default` relies on
    the default incidence of 1.0, so it should contain no null values even
    after the altered variable has been generated.
    """

    def even_split():
        # Fresh dict per call so no expectation mappings are shared/aliased
        return {"ratios": {"M": 0.5, "F": 0.5}}

    study = StudyDefinition(
        default_expectations={
            "rate": "exponential_increase",
            "incidence": 1.0,
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "category": even_split(),
        },
        population=patients.all(),
        sex_altered=patients.sex(
            return_expectations={"incidence": 0.1, "category": even_split()}
        ),
        sex_default=patients.sex(return_expectations={"category": even_split()}),
    )
    # Ensure generation succeeds and the default-incidence column is fully populated
    result = study.make_df_from_expectations(10000)
    assert len(result[pd.isnull(result.sex_default)]) == 0
def test_make_df_no_categories_validation_when_no_categories_in_definition():
    """Category ratio validation is skipped when the variable defines no categories."""
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
    )
    # Should complete without raising, despite ratio keys not being
    # declared as categories on the variable itself
    study.make_df_from_expectations(10000)
def test_create_dummy_data_works_without_database_url(tmp_path, monkeypatch):
    """Dummy data generation is purely local, so no DATABASE_URL is needed."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "today"},
                "category": {"ratios": {"M": 0.49, "F": 0.51}},
            }
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "rate": "universal",
                "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
                "int": {"distribution": "population_ages"},
            },
        ),
    )
    output_path = tmp_path / "dummy_data.csv"
    study.to_file(output_path, expectations_population=10)
    with open(output_path) as f:
        rows = list(csv.DictReader(f))
    # One row per requested dummy patient, with both columns present
    assert len(rows) == 10
    assert "sex" in rows[0]
    assert "age" in rows[0]
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Output produced from a dummy_data_file should be byte-identical to it."""
    cl = codelist(["12345"], system="snomed")

    def uniform_half():
        # Fresh expectations dict per variable to avoid sharing
        return {"rate": "uniform", "incidence": 0.5}

    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl, returning="binary_flag", return_expectations=uniform_half(),
        ),
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations=uniform_half(),
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations=uniform_half(),
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations=uniform_half(),
        ),
    )
    # Generate dummy data via the expectations framework...
    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=10)
    # ...then feed the same file back in as the dummy data source
    output_file = tmp_path / f"output.{file_format}"
    study.to_file(output_file, dummy_data_file=dummy_data_file)
    # The output must reproduce the dummy data exactly
    assert output_file.read_bytes() == dummy_data_file.read_bytes()
), ), age=patients.age_as_of( index_date, return_expectations={ "int": { "distribution": "population_ages" }, "incidence": 1 }, ), sex=patients.sex(return_expectations={ "category": { "ratios": { "M": 0.49, "F": 0.51 } }, "incidence": 1 }), date_death=patients.died_from_any_cause( between=[index_date, end_date], returning="date_of_death", date_format="YYYY-MM-DD", return_expectations={ "incidence": 0.2, }, ), death_category=patients.categorised_as( { "covid-death": "died_covid",
def test_stats_logging_generate_cohort(
    mock_load,
    _mock_list,
    _mock_check,
    tmp_path,
    logger,
    output_format,
    write_to_file_log,
):
    """Check the stats/timing log lines emitted by a single-index-date cohort run.

    Verifies both the one-off study-definition stats and the ordered sequence
    of start/stop timing log entries produced during query execution and
    file writing.
    """
    mock_load.return_value = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "1900-01-01",
                "latest": "today"
            },
        },
        population=patients.all(),
        sex=patients.sex(),
    )
    # The query counter is a global at the module level, so it isn't reset between tests
    # Find the next position (without incrementing it); this is the start of the test's timing logs
    start_counter = timing_log_counter.next

    generate_cohort(
        output_dir=tmp_path,
        expectations_population=None,
        dummy_data_file=None,
        output_format=output_format,
    )

    # initial stats
    expected_initial_study_def_logs = [
        # these 3 are logged from StudyDefinition instantiation
        # patient_id, population, sex - all from patient table, but we make one temp
        # table per variable
        {
            "output_column_count": 3,
            "table_count": 2,
            "table_joins_count": 1
        },
        {
            "variable_count": 2
        },  # population, sex
        {
            "variables_using_codelist_count": 0
        },
        # index_date_count logged from generate_cohort
        {
            "index_date_count": 0
        },
    ]

    expected_timing_log_params = [
        # logging the start of overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            index_date="all",
            timing="start",
            state="started",
            timing_id=start_counter,
        ),
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            timing="start",
            state="started",
            timing_id=start_counter + 1,
        ),
        # logs in tpp_backend during query execution
        *_sql_execute_timing_logs(
            description="Query for sex",
            sql="SELECT * INTO #sex",
            timing_id=start_counter + 2,
        ),
        *_sql_execute_timing_logs(
            description="Query for population",
            sql="SELECT * INTO #population",
            timing_id=start_counter + 3,
        ),
        # logs specifically from study.to_file
        *_sql_execute_timing_logs(
            description="Writing results into #final_output",
            sql="SELECT * INTO #final_output",
            timing_id=start_counter + 4,
        ),
        *_sql_execute_timing_logs(
            description=None,
            sql="CREATE INDEX ix_patient_id ON #final_output",
            timing_id=start_counter + 5,
        ),
        # results are fetched in batches for writing
        dict(
            description=f"{write_to_file_log} {tmp_path}/input.{output_format}",
            timing="start",
            state="started",
            timing_id=start_counter + 6,
        ),
        *_sql_execute_timing_logs(
            description=None,
            sql="SELECT TOP 32000 * FROM #final_output",
            timing_id=start_counter + 7,
        ),
        dict(
            description="Fetch batched results ",
            timing="start",
            state="started",
            timing_id=start_counter + 8,
        ),
        dict(
            description="Fetch batched results ",
            timing="stop",
            state="ok",
            timing_id=start_counter + 8,
        ),
        dict(
            description=f"{write_to_file_log} {tmp_path}/input.{output_format}",
            timing="stop",
            state="ok",
            timing_id=start_counter + 6,
        ),
        *_sql_execute_timing_logs(
            description="Deleting '#final_output'",
            sql="DROP TABLE #final_output",
            timing_id=start_counter + 9,
        ),
        # logging the overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            timing="stop",
            state="ok",
            timing_id=start_counter + 1,
        ),
        dict(
            description="generate_cohort",
            study_definition="study_definition",
            index_date="all",
            timing="stop",
            state="ok",
            timing_id=start_counter,
        ),
    ]
    assert_stats_logs(logger, expected_initial_study_def_logs, expected_timing_log_params)
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Data generated by the expectations framework should pass dummy-data validation."""
    cl = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")

    def uniform_half():
        # Fresh expectations dict per variable to avoid sharing
        return {"rate": "uniform", "incidence": 0.5}

    study = StudyDefinition(
        default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}},
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "category": {"ratios": {"F": 0.5, "M": 0.5}},
                "rate": "universal",
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "int": {"distribution": "population_ages"},
                "rate": "universal",
            },
        ),
        has_event=patients.with_these_clinical_events(
            cl, returning="binary_flag", return_expectations=uniform_half(),
        ),
        event_date_day=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM-DD",
            return_expectations=uniform_half(),
        ),
        event_date_month=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY-MM",
            return_expectations=uniform_half(),
        ),
        event_date_year=patients.with_these_clinical_events(
            cl,
            returning="date",
            date_format="YYYY",
            return_expectations=uniform_half(),
        ),
        incomplete_categories=patients.with_these_clinical_events(
            cl,
            returning="category",
            return_expectations={
                "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
                # Half the values here should be null
                "incidence": 0.5,
            },
        ),
    )
    dummy_data_file = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_data_file, expectations_population=100)
    # We reuse validate_dummy_data to check that the data generated by the
    # expectations framework is valid.
    validate_dummy_data(study.covariate_definitions, dummy_data_file)
def test_stats_logging_generate_cohort_with_index_dates(
    mock_load, _mock_list, _mock_check, logger, tmp_path):
    """Check stats/timing logs for a cohort run over a three-month index date range.

    The per-index-date timing block (9 ids per date) repeats for each of the
    three dates, bracketed by a single overall start/stop pair.
    """
    mock_load.return_value = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "1900-01-01",
                "latest": "today"
            },
        },
        population=patients.all(),
        sex=patients.sex(),
    )
    # The query counter is a global at the module level, so it isn't reset between tests
    # Find the next position (without incrementing it); this is the start of the test's timing logs
    start_counter = timing_log_counter.next

    generate_cohort(
        output_dir=tmp_path,
        expectations_population=None,
        dummy_data_file=None,
        index_date_range="2020-01-01 to 2020-03-01 by month",
    )
    # Index dates are processed most-recent first
    expected_index_dates = ["2020-03-01", "2020-02-01", "2020-01-01"]

    # initial stats
    expected_initial_study_def_logs = [
        # these 3 are logged from StudyDefinition instantiation
        {
            "variable_count": 2
        },  # population, sex
        {
            "variables_using_codelist_count": 0
        },
        # index_date_count logged from generate_cohort
        {
            "index_date_count": 3
        },
        {
            "min_index_date": "2020-01-01",
            "max_index_date": "2020-03-01"
        },
        # output_column/table/joins_count is logged in tpp_backend on backend instantiation so it's repeated for each index date
        *[{
            "output_column_count": 3,
            "table_count": 2,
            "table_joins_count": 1
        }] * 4,
        *[{
            "resetting_backend_index_date": ix_date
        } for ix_date in expected_index_dates],
    ]

    expected_timing_log_params = [
        # logging the start of overall timing for the cohort generation
        dict(
            description="generate_cohort",
            study_definition="study_definition_test",
            index_date="all",
            timing="start",
            state="started",
            timing_id=start_counter,
        )
    ]

    # find the value of the next counter, the start of the timing logs for the first index date
    next_counter = start_counter + 1
    for i, index_date in enumerate(expected_index_dates, start=1):
        expected_timing_log_params.extend([
            dict(
                description="generate_cohort",
                study_definition="study_definition_test",
                timing="start",
                state="started",
                timing_id=next_counter,
            ),
            # logs in tpp_backend during query execution
            # NOTE(review): is_truncated appears to be False only for the first
            # index date (i == 1) — presumably full SQL is logged once and
            # truncated on repeats; confirm against _sql_execute_timing_logs
            *_sql_execute_timing_logs(
                description="Query for sex",
                sql="SELECT * INTO #sex",
                is_truncated=i != 1,
                timing_id=next_counter + 1,
            ),
            *_sql_execute_timing_logs(
                description="Query for population",
                sql="SELECT * INTO #population",
                is_truncated=i != 1,
                timing_id=next_counter + 2,
            ),
            # logs specifically from study.to_file
            *_sql_execute_timing_logs(
                description="Writing results into #final_output",
                sql="SELECT * INTO #final_output",
                is_truncated=i != 1,
                timing_id=next_counter + 3,
            ),
            *_sql_execute_timing_logs(
                description=None,
                sql="CREATE INDEX ix_patient_id ON #final_output",
                timing_id=next_counter + 4,
            ),
            # results are fetched in batches for writing
            dict(
                description=
                f"write_rows_to_csv {tmp_path}/input_test_{index_date}.csv",
                timing="start",
                state="started",
                timing_id=next_counter + 5,
            ),
            *_sql_execute_timing_logs(
                description=None,
                sql="SELECT TOP 32000 * FROM #final_output",
                timing_id=next_counter + 6,
            ),
            dict(
                description="Fetch batched results ",
                timing="start",
                state="started",
                timing_id=next_counter + 7,
            ),
            dict(
                description="Fetch batched results ",
                timing="stop",
                state="ok",
                timing_id=next_counter + 7,
            ),
            dict(
                description=
                f"write_rows_to_csv {tmp_path}/input_test_{index_date}.csv",
                timing="stop",
                state="ok",
                timing_id=next_counter + 5,
            ),
            *_sql_execute_timing_logs(
                description="Deleting '#final_output'",
                sql="DROP TABLE #final_output",
                is_truncated=i != 1,
                timing_id=next_counter + 8,
            ),
            # logging the overall timing for the cohort generation
            dict(
                description="generate_cohort",
                study_definition="study_definition_test",
                timing="stop",
                state="ok",
                timing_id=next_counter,
            ),
        ])
        # set next counter to one more than the max for this index date
        next_counter += 8 + 1

    # add the log for the end of overall timing for the cohort generation; this should have the same
    # id as the first timing log
    expected_timing_log_params.append(
        dict(
            description="generate_cohort",
            study_definition="study_definition_test",
            index_date="all",
            timing="stop",
            state="ok",
            timing_id=start_counter,
        ))
    assert_stats_logs(
        logger,
        expected_initial_study_def_logs,
        expected_timing_log_params,
    )
"ratios": { "16 - under 40": 0.25, "40 - under 50": 0.15, "50 - under 65": 0.10, "65 - under 75": 0.25, "75 plus": 0.25, } }, }, ), sex=patients.sex( return_expectations={ "rate": "universal", "category": { "ratios": { "M": 0.39, "F": 0.41, "I": 0.1, "U": 0.1 } }, }), stp=patients.registered_practice_as_of( "index_date", returning="stp_code", return_expectations={ "category": { "ratios": { "STP1": 0.5, "STP2": 0.5 } },
from cohortextractor.cohortextractor import SUPPORTED_FILE_FORMATS from cohortextractor.csv_utils import is_csv_filename, write_rows_to_csv from cohortextractor.pandas_utils import dataframe_from_rows, dataframe_to_file from cohortextractor.validate_dummy_data import ( DummyDataValidationError, validate_dummy_data, ) cl = codelist(["12345"], system="snomed") column_definitions = dict( default_expectations={"date": {"earliest": "2020-01-01", "latest": "today"}}, population=patients.all(), sex=patients.sex( return_expectations={ "category": {"ratios": {"F": 0.5, "M": 0.5}}, "rate": "universal", }, ), age=patients.age_as_of( "2020-01-01", return_expectations={ "int": {"distribution": "population_ages"}, "rate": "universal", }, ), has_event=patients.with_these_clinical_events( cl, returning="binary_flag", return_expectations={"rate": "uniform", "incidence": 0.5}, ), event_date_day=patients.with_these_clinical_events(