def test_make_df_from_expectations_with_categories_expression():
    """Check that category "A" (ratio 0.3) is generated less often than "B" (ratio 0.7)."""
    category_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            category_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )
    dummy_df = study.make_df_from_expectations(10000)
    counts = dummy_df.category.value_counts()
    assert counts["A"] < counts["B"]
def test_make_df_from_expectations_with_using_dates_as_categories():
    """Check that date-like strings used as category labels all show up in the dummy data."""
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    date_rules = {
        "2020-04-14": "age >= 80",
        "2020-06-16": "age >= 70 AND age < 80",
        "2020-08-18": "DEFAULT",
    }
    expectations = {
        "category": {
            "ratios": {
                "2020-04-14": 0.25,
                "2020-06-16": 0.25,
                "2020-08-18": 0.5,
            }
        },
        "incidence": 1,
    }
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        eligible_date=patients.categorised_as(
            date_rules,
            age=patients.age_as_of("2020-01-01"),
            return_expectations=expectations,
        ),
    )
    dummy_df = study.make_df_from_expectations(100)
    # Every label must occur at least once, and nothing else may occur.
    assert set(dummy_df.eligible_date) == {"2020-08-18", "2020-06-16", "2020-04-14"}
Example #3
0
def test_recursive_definitions_produce_errors():
    """Two variables defined in terms of each other must raise ValueError."""
    with pytest.raises(ValueError):
        everyone = patients.all()
        # "this" refers to "that" and "that" refers back to "this".
        this_variable = patients.satisfying("that = 1")
        that_variable = patients.satisfying("this = 1")
        StudyDefinition(
            population=everyone,
            this=this_variable,
            that=that_variable,
        )
def test_bmi_dtype_generation():
    """Check the pandas CSV arguments derived for a BMI column and its date column.

    ``bmi`` is expected to be read as a float, and ``bmi_date_measured``
    (declared with the ``YYYY-MM`` format) as a parsed date that uses the
    ``add_day_to_date`` converter and is linked back to ``bmi``.
    """
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )

    result = _converters_to_names(study.pandas_csv_args)
    assert result == {
        "converters": {
            "bmi_date_measured": "add_day_to_date"
        },
        "dtype": {
            "bmi": "float"
        },
        "date_col_for": {
            "bmi": "bmi_date_measured"
        },
        "parse_dates": ["bmi_date_measured"],
    }
def test_make_df_from_expectations_with_number_of_episodes():
    """Check that a ``number_of_episodes`` variable yields exactly one dummy column."""
    study = StudyDefinition(
        population=patients.all(),
        episode_count=patients.with_these_clinical_events(
            codelist(["A", "B", "C"], system="ctv3"),
            ignore_days_where_these_codes_occur=codelist(["D", "E"],
                                                         system="ctv3"),
            returning="number_of_episodes",
            episode_defined_as="series of events each <= 14 days apart",
            return_expectations={
                "int": {
                    "distribution": "normal",
                    "mean": 4,
                    "stddev": 2
                },
                "date": {
                    "earliest": "1900-01-01",
                    "latest": "today"
                },
                "incidence": 0.2,
            },
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: `Index == list` is elementwise and raises
    # ValueError (rather than failing the assertion) when the lengths differ.
    assert list(result.columns) == ["episode_count"]
def test_make_df_from_expectations_with_mean_recorded_value():
    """Check that the non-zero dummy values for ``drug_x`` average close to the requested mean of 35."""
    expectations = {
        "rate": "exponential_increase",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
    }
    study = StudyDefinition(
        population=patients.all(),
        drug_x=patients.mean_recorded_value(
            codelist(["X"], system="ctv3"),
            on_most_recent_day_of_measurement=True,
            return_expectations=expectations,
        ),
    )
    dummy_df = study.make_df_from_expectations(10000)
    # Rows equal to 0.0 are excluded before averaging.
    measured = dummy_df[dummy_df["drug_x"] != 0.0]
    observed_mean = int(measured["drug_x"].mean())
    assert abs(35 - observed_mean) < 5
def test_make_df_from_expectations_with_categories_expression_validation():
    """Expected ratios naming a category ("C") absent from the definition must raise ValueError."""
    category_rules = {"A": "sex = 'F'", "B": "sex = 'M'", "": "DEFAULT"}
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        # "C" is not one of the categories declared in category_rules.
        "category": {"ratios": {"A": 0.3, "B": 0.6, "C": 0.1}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        category=patients.categorised_as(
            category_rules,
            sex=patients.sex(),
            return_expectations=expectations,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
Example #8
0
    def test_script():
        """Write a 10-patient dummy CSV, then print whether pyodbc / ctds are in ``sys.modules``."""
        import sys
        from cohortextractor import StudyDefinition, patients

        sex_expectations = {
            "rate": "universal",
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "category": {"ratios": {"M": 0.49, "F": 0.51}},
        }
        study = StudyDefinition(
            population=patients.all(),
            sex=patients.sex(return_expectations=sex_expectations),
        )
        study.to_csv("/dev/null", expectations_population=10)

        loaded_modules = sys.modules
        pyodbc = "yes" if "pyodbc" in loaded_modules else "no"
        ctds = "yes" if "ctds" in loaded_modules else "no"
        print(f"pyodbc: {pyodbc}, ctds: {ctds}")
def test_make_df_from_expectations_with_distribution_and_date():
    """Check that dummy BMI values and their measurement dates are missing on the same rows."""
    bmi_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.6,
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        bmi=patients.most_recent_bmi(
            on_or_after="2010-02-01",
            minimum_age_at_measurement=16,
            return_expectations=bmi_expectations,
        ),
        bmi_date_measured=patients.date_of("bmi", date_format="YYYY-MM"),
    )
    dummy_df = study.make_df_from_expectations(10000)
    assert sorted(dummy_df.columns) == ["bmi", "bmi_date_measured"]

    # Check that the null-valued rows are aligned with each other
    bmi_is_zero = dummy_df["bmi"] == 0.0
    date_is_null = pd.isnull(dummy_df["bmi_date_measured"])
    assert (bmi_is_zero == date_is_null).all()
def test_make_df_from_expectations_with_categories_in_codelist_validation():
    """Expected ratios for "A"/"B" against a codelist whose only category is "Y" must raise ValueError."""
    categorised_codelist = codelist([("X", "Y")], system="ctv3")
    categorised_codelist.has_categories = True
    expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"A": 0.3, "B": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations=expectations,
            find_last_match_in_period=True,
        ),
    )
    with pytest.raises(ValueError):
        study.make_df_from_expectations(10000)
def test_stats_logging_with_error(logger):
    """Check that a failing query is still timed and its stop log carries the "error" state."""
    study = StudyDefinition(
        population=patients.all(),
        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
    )

    # insert a deliberate error in the queries
    study.backend.queries[-1] = "SELECT Foo FROM Bar"
    with pytest.raises(Exception) as excinfo:
        study.to_dicts()

    # The error is raised as expected
    assert "Invalid object name 'Bar'" in str(excinfo.value)

    # Timing is logged, with the error state in the end log.
    # Each unpacking below also asserts that exactly one entry matches.
    matching_sql_logs = [
        entry for entry in logger.entries
        if entry.get("sql") == "SELECT Foo FROM Bar"
    ]
    (sql_log,) = matching_sql_logs
    matching_stop_logs = [
        entry for entry in logger.entries
        if entry.get("timing_id") == sql_log["timing_id"]
        and entry.get("timing") == "stop"
    ]
    (end_log,) = matching_stop_logs
    assert end_log["state"] == "error"
def test_clinical_events_numeric_value_dtype_generation():
    """Check the pandas CSV arguments for a numeric clinical-event value and its YYYY-MM date."""
    creatinine_codes = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        creatinine=patients.with_these_clinical_events(
            creatinine_codes,
            find_last_match_in_period=True,
            on_or_before="2020-02-01",
            returning="numeric_value",
        ),
        creatinine_date=patients.date_of("creatinine", date_format="YYYY-MM"),
    )
    expected = {
        "converters": {"creatinine_date": "add_day_to_date"},
        "dtype": {"creatinine": "float"},
        "date_col_for": {"creatinine": "creatinine_date"},
        "parse_dates": ["creatinine_date"],
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected
def test_make_df_from_expectations_with_categories():
    """Check that a categorised codelist yields one column following the expected ratios.

    Category "A" is declared with ratio 0.3 and "B" with 0.7, so "A" must
    appear less often than "B" in the generated data.
    """
    categorised_codelist = codelist([("1", "A"), ("2", "B")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            return_expectations={
                "rate": "exponential_increase",
                "incidence": 0.2,
                "category": {
                    "ratios": {
                        "A": 0.3,
                        "B": 0.7
                    }
                },
                "date": {
                    "earliest": "1900-01-01",
                    "latest": "today"
                },
            },
            find_last_match_in_period=True,
        ),
    )
    population_size = 10000
    result = study.make_df_from_expectations(population_size)
    # Compare as a plain list: `Index == list` is elementwise and raises
    # ValueError (rather than failing the assertion) when the lengths differ.
    assert list(result.columns) == ["ethnicity"]

    category_counts = result.reset_index().groupby("ethnicity").count()
    # Use .iloc for the first count column: `Series[0]` on a label-indexed
    # Series relies on the deprecated positional fallback.
    count_a = category_counts.loc["A", :].iloc[0]
    count_b = category_counts.loc["B", :].iloc[0]
    assert count_a < count_b
def test_categorical_clinical_events_with_date_dtype_generation():
    """Check the pandas CSV arguments for a categorical clinical event and its date column."""
    categorised_codelist = codelist([("X", "Y")], system="ctv3")
    categorised_codelist.has_categories = True
    study = StudyDefinition(
        population=patients.all(),
        ethnicity=patients.with_these_clinical_events(
            categorised_codelist,
            returning="category",
            find_last_match_in_period=True,
        ),
        ethnicity_date=patients.date_of("ethnicity"),
    )

    expected = {
        "converters": {"ethnicity_date": "add_month_and_day_to_date"},
        "date_col_for": {"ethnicity": "ethnicity_date"},
        "dtype": {"ethnicity": "category"},
        "parse_dates": ["ethnicity_date"],
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected
def test_make_df_from_expectations_partial_default_overrides():
    """Overriding only the "latest" date of the defaults should cap generated years at 2000."""
    defaults = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "rate": "exponential_increase",
        "incidence": 0.2,
    }
    # Only "latest" is overridden; "earliest" is not restated here.
    override = {"date": {"latest": "2000-01-01"}}
    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        asthma_condition=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            find_first_match_in_period=True,
            date_format="YYYY",
            return_expectations=override,
        ),
    )

    dummy_df = study.make_df_from_expectations(10000)
    latest_year = dummy_df.asthma_condition.astype("float").max()
    assert latest_year == 2000
Example #16
0
def test_make_df_from_expectations_with_care_home_status():
    """Check that care-home category "PN" (ratio 0.1) is generated less often than "U" (ratio 0.7)."""
    flag_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.3,
        "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
        "bool": True,
    }
    type_rules = {
        "PN": "IsPotentialCareHome AND LocationRequiresNursing='Y'",
        "PC": "IsPotentialCareHome",
        "U": "DEFAULT",
    }
    type_expectations = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "category": {"ratios": {"PN": 0.1, "PC": 0.2, "U": 0.7}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    study = StudyDefinition(
        population=patients.all(),
        is_in_care_home=patients.care_home_status_as_of(
            "2020-01-01",
            return_expectations=flag_expectations,
        ),
        care_home_type=patients.care_home_status_as_of(
            "2020-01-01",
            categorised_as=type_rules,
            return_expectations=type_expectations,
        ),
    )
    dummy_df = study.make_df_from_expectations(10000)
    counts = dummy_df.care_home_type.value_counts()
    assert counts["PN"] < counts["U"]
Example #17
0
def test_make_df_from_expectations_doesnt_alter_date_defaults():
    """Per-variable overrides must not leak into a variable that relies on the defaults."""
    defaults = {
        "rate": "exponential_increase",
        "incidence": 1.0,
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.5, "F": 0.5}},
    }
    incidence_override = {"incidence": 0.2}
    date_override = {"date": {"earliest": "2015-01-01", "latest": "today"}}

    study = StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        with_different_incidence=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations=incidence_override,
            include_day=True,
        ),
        with_different_date=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"),
            returning="date",
            return_expectations=date_override,
            include_day=True,
        ),
        with_defaults=patients.with_these_clinical_events(
            codelist(["X"], system="ctv3"), returning="date", include_day=True
        ),
    )
    dummy_df = study.make_df_from_expectations(10000)

    # Regression test: make sure defaults are respected even when they've been overridden
    earliest_default_date = dummy_df.with_defaults.min()
    assert earliest_default_date < "2015-01-01"
    # The default incidence is 1.0, so no row may be null.
    assert not pd.isnull(dummy_df.with_defaults).any()
Example #18
0
def test_unrecognised_database_url_raises_error(monkeypatch):
    """With DATABASE_URL set to an unknown scheme, building a study must raise ValueError."""
    monkeypatch.setenv("DATABASE_URL", "unknown-db://localhost")
    with pytest.raises(ValueError):
        everyone = patients.all()
        sex = patients.sex()
        age = patients.age_as_of("2020-01-01")
        StudyDefinition(population=everyone, sex=sex, age=age)
Example #19
0
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """With DATABASE_URL unset, ``to_file`` must raise RuntimeError."""
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
        age=patients.age_as_of("2020-01-01"),
    )
    output_path = tmp_path / "dummy_data.csv"
    with pytest.raises(RuntimeError):
        study.to_file(output_path)
Example #20
0
def test_sex_dtype_generation():
    """Check that a ``sex`` column is read as a pandas category with no date handling."""
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(),
    )
    expected = {
        "dtype": {"sex": "category"},
        "converters": {},
        "date_col_for": {},
        "parse_dates": [],
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected
Example #21
0
 def define_study():
     """Build (and discard) a study with one ONS CIS variable.

     ``returning``, ``date_filter_column`` and ``on_or_after`` are taken from
     the enclosing scope, which is not visible in this chunk.
     """
     everyone = patients.all()
     # by default returns last match in period, using visit date
     ons_cis_value = patients.with_an_ons_cis_record(
         returning=returning,
         date_filter_column=date_filter_column,
         on_or_after=on_or_after,
     )
     StudyDefinition(population=everyone, value=ons_cis_value)
Example #22
0
def test_syntax_errors_in_expressions_are_raised():
    """A malformed ``satisfying`` expression (doubled AND) must raise ValueError."""
    malformed_expression = "age > 70 AND AND sex = 'M'"
    with pytest.raises(ValueError):
        everyone = patients.all()
        status = patients.satisfying(
            malformed_expression,
            sex=patients.sex(),
            age=patients.age_as_of("2010-01-01"),
        )
        StudyDefinition(population=everyone, status=status)
Example #23
0
def test_column_name_clashes_produce_errors():
    """Defining ``age`` both at the top level and inside ``satisfying`` must raise ValueError."""
    with pytest.raises(ValueError):
        everyone = patients.all()
        top_level_age = patients.age_as_of("2020-01-01")
        # The nested "age" reuses the name of the top-level variable.
        status = patients.satisfying(
            "age > 70 AND sex = 'M'",
            sex=patients.sex(),
            age=patients.age_as_of("2010-01-01"),
        )
        StudyDefinition(population=everyone, age=top_level_age, status=status)
def test_study_definition_initial_stats_logging(logger):
    """Check the stats log entries emitted when a StudyDefinition is constructed."""
    defaults = {
        "rate": "exponential_increase",
        "incidence": 0.2,
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    StudyDefinition(
        default_expectations=defaults,
        population=patients.all(),
        event_date_1=patients.with_these_clinical_events(
            codelist(["A"], system="ctv3"),
            returning="date",
            date_format="YYYY-MM-DD",
        ),
        event_min_date=patients.minimum_of(
            "event_date_1",
            event_date_2=patients.with_these_clinical_events(
                codelist(["B", "C"], system="ctv3"),
                returning="date",
                date_format="YYYY-MM-DD",
            ),
        ),
    )
    expected_stats = [
        # output columns include patient_id, and the 4 variables defined in the
        # study definition, including event_date_2, which is defined as a parameter to
        # event_min_date
        # tables - Patient, temp event table for each codelist
        {"output_column_count": 5, "table_count": 3, "table_joins_count": 2},
        # variable_count is a count of the top-level variables defined in the study def (i.e. not event_date_2)
        {"variable_count": 4},
        # 2 variables use a codelist (event_date_1, and the nested event_date_2)
        {"variables_using_codelist_count": 2},
        # for each variable using a codelist, we log the size of the codelist
        {"variable_using_codelist": "event_date_1", "codelist_size": 1},
        {"variable_using_codelist": "event_date_2", "codelist_size": 2},
    ]
    assert get_stats_logs(logger.entries) == expected_stats
Example #25
0
def test_apply_date_filters_from_definition():
    """Check ``between`` filtering with both bounds, only a lower bound, and only an upper bound."""
    study = StudyDefinition(population=patients.all())
    series = np.arange(10)

    # (between argument, values expected to survive the filter)
    cases = [
        ([5, 6], [5, 6]),
        ([5, None], [5, 6, 7, 8, 9]),
        ([None, 2], [0, 1, 2]),
    ]
    for bounds, expected in cases:
        filtered = list(
            study.apply_date_filters_from_definition(series, between=bounds)
        )
        assert filtered == expected
Example #26
0
def test_make_df_from_binary_default_outcome():
    """Check that the number of non-null ``died`` rows equals incidence times population size."""
    expectations = {
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "incidence": 0.1,
    }
    study = StudyDefinition(
        population=patients.all(),
        died=patients.died_from_any_cause(return_expectations=expectations),
    )
    population_size = 10000
    dummy_df = study.make_df_from_expectations(population_size)
    rows_with_outcome = dummy_df[~pd.isnull(dummy_df.died)]
    assert len(rows_with_outcome) == 0.1 * population_size
Example #27
0
def test_clinical_events_with_year_date_dtype_generation():
    """Check the pandas CSV arguments for a clinical-event date with no explicit date format."""
    diabetes_codes = codelist(["X"], system="ctv3")
    study = StudyDefinition(
        population=patients.all(),
        diabetes=patients.with_these_clinical_events(
            diabetes_codes, returning="date"
        ),
    )
    expected = {
        "converters": {"diabetes": "add_month_and_day_to_date"},
        "date_col_for": {},
        "dtype": {},
        "parse_dates": ["diabetes"],
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected
Example #28
0
def test_age_dtype_generation():
    """Check that an ``age`` column is read as ``Int64`` with no date handling."""
    study = StudyDefinition(
        # This line defines the study population
        population=patients.all(),
        age=patients.age_as_of("2020-02-01"),
    )
    expected = {
        "dtype": {"age": "Int64"},
        "parse_dates": [],
        "date_col_for": {},
        "converters": {},
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected
Example #29
0
def test_make_df_no_categories_validation_when_no_categories_in_definition():
    """Generating dummy data for ``sex`` with expected category ratios must not raise."""
    sex_expectations = {
        "rate": "universal",
        "date": {"earliest": "1900-01-01", "latest": "today"},
        "category": {"ratios": {"M": 0.49, "F": 0.51}},
    }
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
    )
    # Just ensuring no exception is raised
    study.make_df_from_expectations(10000)
Example #30
0
def test_address_dtype_generation():
    """Check that a rural/urban classification column is read as a pandas category."""
    study = StudyDefinition(
        # This line defines the study population
        population=patients.all(),
        rural_urban=patients.address_as_of(
            "2020-02-01",
            returning="rural_urban_classification",
        ),
    )
    expected = {
        "dtype": {"rural_urban": "category"},
        "parse_dates": [],
        "date_col_for": {},
        "converters": {},
    }
    csv_args = _converters_to_names(study.pandas_csv_args)
    assert csv_args == expected