def test_data_generator_category_and_date():
    """Generate a category column alongside dates and check both.

    Asserts that the number of rows with a non-null ``date`` equals
    ``population_size * incidence`` and that the category counts are
    ordered like the requested ratios (B > C > A).
    """
    population_size = 10000
    incidence = 0.2
    result = generate(
        population_size,
        rate="exponential_increase",
        incidence=incidence,
        category={"ratios": {"A": 0.1, "B": 0.7, "C": 0.2}},
        date={"earliest": "1900-01-01", "latest": "2020-01-01"},
    )

    # Rows that received a date are the ones counted against the incidence.
    dated_rows = result[result["date"].notnull()]
    assert len(dated_rows) == (population_size * incidence)

    # Only the relative ordering of the category sizes is checked, not the
    # exact proportions.
    sizes = {
        label: len(result[result["category"] == label])
        for label in ("A", "B", "C")
    }
    assert sizes["B"] > sizes["C"] > sizes["A"]
Ejemplo n.º 2
0
def test_data_generator_age():
    """Check that the ``population_ages`` distribution spans young and old.

    With a ``universal`` rate, the generated ``int`` column must contain a
    value below 5 and a value above 95.
    """
    population_size = 10000
    result = generate(
        population_size,
        rate="universal",
        date={"earliest": "1900-01-01", "latest": "2020-01-01"},
        int={"distribution": "population_ages"},
    )
    ages = result["int"]
    youngest, oldest = ages.min(), ages.max()
    assert youngest < 5 and oldest > 95
Ejemplo n.º 3
0
def test_data_generator_bool():
    """Check that the mean of the ``bool`` column equals the incidence.

    Missing values are filled with 0 before averaging, so rows without a
    value count against the mean.
    """
    population_size = 10000
    incidence = 0.5
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
        "bool": True,
    }
    result = generate(population_size, **return_expectations)
    # Compare against ``incidence`` rather than a repeated literal so the
    # assertion cannot drift from the expectation passed to ``generate``.
    assert result["bool"].fillna(0).mean() == incidence
Ejemplo n.º 4
0
def test_data_generator_int():
    """Check that normally distributed ints average near the requested mean.

    The mean of the ``int`` column, truncated to an integer, must be within
    3 of the requested mean of 10.
    """
    population_size = 10000
    incidence = 0.9
    result = generate(
        population_size,
        rate="exponential_increase",
        incidence=incidence,
        date={"earliest": "1900-01-01", "latest": "2020-01-01"},
        int={"distribution": "normal", "mean": 10, "stddev": 1},
    )
    # ``int()`` truncates the mean before the comparison, as a coarse check.
    observed_mean = int(result["int"].mean())
    assert abs(10 - observed_mean) < 3
Ejemplo n.º 5
0
def test_data_generator_universal_category():
    """Check that a universal category column follows the ratio ordering.

    With ratios of 0.1 ``rural`` and 0.9 ``urban``, there must be more
    ``urban`` rows than ``rural`` rows.
    """
    population_size = 10000
    result = generate(
        population_size,
        rate="universal",
        date={"earliest": "1900-01-01", "latest": "2020-01-01"},
        category={"ratios": {"rural": 0.1, "urban": 0.9}},
    )
    # Tally each category once and compare the two tallies.
    tallies = result["category"].value_counts()
    assert tallies["urban"] > tallies["rural"]
def test_data_generator_float():
    """Check that normally distributed floats average near the requested mean.

    Values equal to 0.0 are excluded; the truncated mean of the remaining
    ``float`` values must be within 5 of the requested mean of 35.
    """
    population_size = 10000
    incidence = 0.6
    result = generate(
        population_size,
        rate="exponential_increase",
        incidence=incidence,
        date={"earliest": "1900-01-01", "latest": "2020-01-01"},
        float={"distribution": "normal", "mean": 35, "stddev": 10},
    )
    values = result["float"]
    # Drop exact zeros before averaging.
    nonzero_values = values[values != 0.0]
    assert abs(35 - int(nonzero_values.mean())) < 5
Ejemplo n.º 7
0
def test_data_generator_date_uniform():
    """Check incidence and even spread of dates for the ``uniform`` rate.

    Asserts that the number of rows with a non-null ``date`` equals
    ``population_size * incidence`` and that every distinct date's row count
    is within a relative tolerance of 0.1 of one tenth of that number.
    """
    population_size = 100000
    incidence = 0.5
    expected_dated = population_size * incidence
    result = generate(
        population_size,
        rate="uniform",
        incidence=incidence,
        date={"earliest": "2020-01-01", "latest": "2020-01-11"},
    )

    # Rows that received a date are the ones counted against the incidence.
    dated_rows = result[result["date"].notnull()]
    assert len(dated_rows) == expected_dated

    # Count rows per distinct date by grouping on the date and counting the
    # index column produced by ``reset_index``.
    indexed_dates = result["date"].reset_index()
    date_counts = indexed_dates.groupby("date").count()["index"]

    expected = expected_dated / 10
    for count in date_counts:
        assert isclose(count, expected, rel_tol=0.1)
Ejemplo n.º 8
0
def test_data_generator_date_exponential_increase():
    """Check incidence and growth of dates for ``exponential_increase``.

    Asserts that the number of rows with a non-null ``date`` equals
    ``population_size * incidence`` and that, walking back from the latest
    year, each of the five most recent yearly counts is strictly smaller
    than the one after it.
    """
    population_size = 10000
    incidence = 0.2
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "date": {"earliest": "1970-01-01", "latest": "2019-12-31"},
    }
    result = generate(population_size, **return_expectations)

    # Check incidence numbers are correct: count the rows that have a date.
    dated_rows = result[~pd.isnull(result["date"])]
    assert len(dated_rows) == (population_size * incidence)

    # Check dates are distributed in increasing frequency
    year_counts = (
        result["date"].dt.strftime("%Y").reset_index().groupby("date").count()["index"]
    )
    # Slice positionally with ``iloc`` instead of ``reversed(year_counts)``:
    # ``reversed`` on a Series goes through integer ``__getitem__`` lookups,
    # which on this string-labelled index depends on pandas' deprecated
    # positional fallback.
    latest_first = year_counts.iloc[::-1].iloc[:5].tolist()
    max_count = population_size
    for count in latest_first:
        assert count < max_count
        max_count = count