Beispiel #1
0
    def test_script():
        """Generate a small dummy CSV and report whether pyodbc/ctds got imported.

        Builds a study with a single ``sex`` column driven purely by
        expectations, writes ten dummy rows, then prints ``yes``/``no`` for the
        presence of the ``pyodbc`` and ``ctds`` modules in ``sys.modules``.
        """
        import sys

        from cohortextractor import StudyDefinition, patients

        sex_expectations = {
            "rate": "universal",
            "date": {"earliest": "1900-01-01", "latest": "today"},
            "category": {"ratios": {"M": 0.49, "F": 0.51}},
        }
        study = StudyDefinition(
            population=patients.all(),
            sex=patients.sex(return_expectations=sex_expectations),
        )
        # NOTE(review): "TMP_PATH" looks like a placeholder that the caller
        # substitutes before running this code -- confirm against the caller.
        study.to_file("TMP_PATH/dummy.csv", expectations_population=10)

        imported = sys.modules
        pyodbc = "no"
        if "pyodbc" in imported:
            pyodbc = "yes"
        ctds = "no"
        if "ctds" in imported:
            ctds = "yes"
        print(f"pyodbc: {pyodbc}, ctds: {ctds}")
Beispiel #2
0
def test_export_data_without_database_url_raises_error(tmp_path, monkeypatch):
    """Expect ``RuntimeError`` from ``to_file`` when ``DATABASE_URL`` is unset.

    ``to_file`` is called with neither ``expectations_population`` nor a dummy
    data file, and the ``DATABASE_URL`` environment variable has been removed.
    """
    monkeypatch.delenv("DATABASE_URL", raising=False)

    everyone = patients.all()
    sex = patients.sex()
    age = patients.age_as_of("2020-01-01")
    study = StudyDefinition(population=everyone, sex=sex, age=age)

    output_path = tmp_path / "dummy_data.csv"
    with pytest.raises(RuntimeError):
        study.to_file(output_path)
Beispiel #3
0
def test_create_dummy_data_works_without_database_url(tmp_path, monkeypatch):
    """Check that dummy data can be generated with ``DATABASE_URL`` unset.

    Removes ``DATABASE_URL`` from the environment, writes ten rows of
    expectations-driven dummy data to a CSV file, then reads the file back and
    verifies the row count and the presence of the ``sex`` and ``age`` columns.
    """
    monkeypatch.delenv("DATABASE_URL", raising=False)
    study = StudyDefinition(
        population=patients.all(),
        sex=patients.sex(
            return_expectations={
                "rate": "universal",
                "date": {
                    "earliest": "1900-01-01",
                    "latest": "today",
                },
                "category": {
                    "ratios": {
                        "M": 0.49,
                        "F": 0.51,
                    },
                },
            },
        ),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations={
                "rate": "universal",
                "date": {
                    "earliest": "1900-01-01",
                    "latest": "2020-01-01",
                },
                "int": {
                    "distribution": "population_ages",
                },
            },
        ),
    )
    filename = tmp_path / "dummy_data.csv"
    study.to_file(filename, expectations_population=10)
    # The csv module asks for files opened with newline="" so that it, rather
    # than the text layer, decides how line endings and newlines embedded in
    # quoted fields are interpreted.
    with open(filename, newline="") as f:
        results = list(csv.DictReader(f))
    assert len(results) == 10
    columns = results[0].keys()
    assert "sex" in columns
    assert "age" in columns
Beispiel #4
0
def test_booleans_correctly_handled_in_dummy_data(tmp_path, file_format):
    """Check that a binary-flag column holds both boolean values in dummy data.

    Writes 100 rows of dummy data in ``file_format``, reads the file back with
    the matching pandas reader, and asserts that each of the two expected
    boolean representations for that format occurs more than ten times in the
    ``has_event`` column.

    ``file_format`` is one of ``csv``, ``csv.gz``, ``feather``, ``dta`` or
    ``dta.gz``; any other value fails the test with an ``AssertionError``.
    """
    cl = codelist(["12345"], system="snomed")
    study = StudyDefinition(
        default_expectations={
            "date": {
                "earliest": "2020-01-01",
                "latest": "today",
            },
        },
        population=patients.all(),
        has_event=patients.with_these_clinical_events(
            cl,
            returning="binary_flag",
            return_expectations={
                "rate": "uniform",
                "incidence": 0.5,
            },
        ),
    )

    filename = tmp_path / f"dummy-data.{file_format}"
    study.to_file(filename, expectations_population=100)

    if file_format in ("csv", "csv.gz"):
        # Read everything as strings so the flags are compared as written.
        df = pandas.read_csv(filename, dtype=str)
        bools = ("0", "1")
    elif file_format == "feather":
        df = pandas.read_feather(filename)
        bools = (True, False)
    elif file_format in ("dta", "dta.gz"):
        df = pandas.read_stata(filename)
        bools = (0, 1)
    else:
        # Raise explicitly rather than `assert False`: asserts are stripped
        # under `python -O`, which would let an unknown format fall through to
        # a confusing NameError on `df` below.
        raise AssertionError(f"Unhandled format: {file_format}")

    # Check we've got at least some of each value. `.get(..., 0)` turns a
    # completely missing value into a plain assertion failure instead of a
    # KeyError from the lookup.
    counts = df.has_event.value_counts()
    for value in bools:
        assert counts.get(value, 0) > 10, f"too few rows with has_event == {value!r}"
Beispiel #5
0
def test_to_file_with_expectations_population(tmp_path, file_format):
    """Generate dummy data from expectations and validate the resulting file.

    The study covers category, integer, binary-flag and date columns (at day,
    month and year precision) plus a category column with 50% incidence. After
    writing 100 rows in ``file_format``, the file is checked with
    ``validate_dummy_data`` against the study's covariate definitions.
    """
    codes = codelist([("12345", "foo"), ("67890", "bar")], system="snomed")

    def half_incidence():
        # Build a fresh dict per call so no two columns share one object.
        return {"rate": "uniform", "incidence": 0.5}

    def event_date(date_format):
        # Date-returning clinical-event column at the requested precision.
        return patients.with_these_clinical_events(
            codes,
            returning="date",
            date_format=date_format,
            return_expectations=half_incidence(),
        )

    date_range = {"earliest": "2020-01-01", "latest": "today"}
    sex_expectations = {
        "category": {"ratios": {"F": 0.5, "M": 0.5}},
        "rate": "universal",
    }
    age_expectations = {
        "int": {"distribution": "population_ages"},
        "rate": "universal",
    }
    category_expectations = {
        "category": {"ratios": {"foo": 0.5, "bar": 0.5}},
        # Half the values here should be null
        "incidence": 0.5,
    }

    study = StudyDefinition(
        default_expectations={"date": date_range},
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations=age_expectations,
        ),
        has_event=patients.with_these_clinical_events(
            codes,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=event_date("YYYY-MM-DD"),
        event_date_month=event_date("YYYY-MM"),
        event_date_year=event_date("YYYY"),
        incomplete_categories=patients.with_these_clinical_events(
            codes,
            returning="category",
            return_expectations=category_expectations,
        ),
    )

    dummy_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_path, expectations_population=100)
    # validate_dummy_data is reused here to confirm that what the expectations
    # framework produced is itself valid dummy data.
    validate_dummy_data(study.covariate_definitions, dummy_path)
Beispiel #6
0
def test_to_file_with_dummy_data_file_incorrect_extension(tmp_path):
    """Expect ``DummyDataValidationError`` for a mismatched dummy data file name.

    The output path ends in ``.csv.gz`` while the dummy data file is named
    ``dummy-data-csv``.
    """
    output_path = tmp_path / "output.csv.gz"
    dummy_path = tmp_path / "dummy-data-csv"
    study = StudyDefinition(population=patients.all())
    with pytest.raises(DummyDataValidationError):
        study.to_file(output_path, dummy_data_file=dummy_path)
Beispiel #7
0
def test_to_file_with_dummy_data_file(tmp_path, file_format):
    """Check that supplying a dummy data file yields a byte-identical output.

    First generates ten rows of dummy data from expectations, then calls
    ``to_file`` again with that file as ``dummy_data_file`` and asserts that
    the new output file has exactly the same bytes as the dummy data file.
    """
    codes = codelist(["12345"], system="snomed")

    def half_incidence():
        # Build a fresh dict per call so no two columns share one object.
        return {"rate": "uniform", "incidence": 0.5}

    def event_date(date_format):
        # Date-returning clinical-event column at the requested precision.
        return patients.with_these_clinical_events(
            codes,
            returning="date",
            date_format=date_format,
            return_expectations=half_incidence(),
        )

    date_range = {"earliest": "2020-01-01", "latest": "today"}
    sex_expectations = {
        "category": {"ratios": {"F": 0.5, "M": 0.5}},
        "rate": "universal",
    }
    age_expectations = {
        "int": {"distribution": "population_ages"},
        "rate": "universal",
    }

    study = StudyDefinition(
        default_expectations={"date": date_range},
        population=patients.all(),
        sex=patients.sex(return_expectations=sex_expectations),
        age=patients.age_as_of(
            "2020-01-01",
            return_expectations=age_expectations,
        ),
        has_event=patients.with_these_clinical_events(
            codes,
            returning="binary_flag",
            return_expectations=half_incidence(),
        ),
        event_date_day=event_date("YYYY-MM-DD"),
        event_date_month=event_date("YYYY-MM"),
        event_date_year=event_date("YYYY"),
    )

    # Step 1: produce dummy data through the expectations framework.
    dummy_path = tmp_path / f"dummy-data.{file_format}"
    study.to_file(dummy_path, expectations_population=10)

    # Step 2: feed that file back in as the dummy data source.
    output_path = tmp_path / f"output.{file_format}"
    study.to_file(output_path, dummy_data_file=dummy_path)

    # Step 3: both files must match byte for byte.
    dummy_bytes = dummy_path.read_bytes()
    output_bytes = output_path.read_bytes()
    assert dummy_bytes == output_bytes