def test_data_generator_category_and_date():
    """Check incidence of dates and proportional assignment of categories."""
    population_size = 10000
    incidence = 0.2
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "category": {"ratios": {"A": 0.1, "B": 0.7, "C": 0.2}},
        "date": {"earliest": "1900-01-01", "latest": "today"},
    }
    result = generate(population_size, **return_expectations)

    # Rows carrying a date are the generated "events"; their number must
    # equal population * incidence exactly.
    rows_with_date = result[~pd.isnull(result["date"])]
    assert len(rows_with_date) == (population_size * incidence)

    # Categories should be assigned more-or-less in correct proportion
    # (ratios: B 0.7 > C 0.2 > A 0.1).
    counts = {label: len(result[result["category"] == label]) for label in ("A", "B", "C")}
    assert counts["B"] > counts["C"] > counts["A"]
def test_data_generator_age():
    """Ages drawn from the population_ages distribution span young and old."""
    population_size = 10000
    return_expectations = {
        "rate": "universal",
        "int": {"distribution": "population_ages"},
    }
    result = generate(population_size, **return_expectations)
    # A realistic age distribution over 10k people should reach both tails.
    assert result.int.min() < 5 and result.int.max() > 95
def test_data_generator_bool():
    """Boolean values are generated at exactly the requested incidence."""
    population_size = 10000
    incidence = 0.5
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "bool": True,
    }
    result = generate(population_size, **return_expectations)
    # Missing values count as False (0); the mean must then equal the
    # incidence exactly — presumably `generate` assigns an exact proportion
    # rather than sampling (otherwise this would be flaky).
    observed_rate = result["bool"].fillna(0).mean()
    assert observed_rate == 0.5
def test_data_generator_int():
    """Normally-distributed ints have a sample mean near the requested mean."""
    population_size = 10000
    incidence = 0.9
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "int": {"distribution": "normal", "mean": 10, "stddev": 1},
    }
    result = generate(population_size, **return_expectations)
    # Sample mean should land within 3 of the requested mean of 10.
    deviation = abs(10 - int(result["int"].mean()))
    assert deviation < 3
def test_data_generator_universal_category():
    """With a universal rate, category ratios drive relative frequencies."""
    population_size = 10000
    return_expectations = {
        "rate": "universal",
        "category": {"ratios": {"rural": 0.1, "urban": 0.9}},
    }
    result = generate(population_size, **return_expectations)
    # urban (0.9) must outnumber rural (0.1).
    frequencies = result.category.value_counts()
    assert frequencies["urban"] > frequencies["rural"]
def test_data_generator_float():
    """Normally-distributed floats have a sample mean near the requested mean."""
    population_size = 10000
    incidence = 0.6
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "date": {"earliest": "1900-01-01", "latest": "2020-01-01"},
        "float": {"distribution": "normal", "mean": 35, "stddev": 10},
    }
    result = generate(population_size, **return_expectations)
    # Sample mean should land within 5 of the requested mean of 35.
    deviation = abs(35 - int(result["float"].mean()))
    assert deviation < 5
def test_data_generator_date():
    """Dates obey the incidence and grow in frequency towards the latest year."""
    population_size = 10000
    incidence = 0.2
    return_expectations = {
        "rate": "exponential_increase",
        "incidence": incidence,
        "date": {"earliest": "1970-01-01", "latest": "2019-12-31"},
    }
    result = generate(population_size, **return_expectations)

    # Rows carrying a date are the generated "events"; their number must
    # equal population * incidence exactly.
    rows_with_date = result[~pd.isnull(result["date"])]
    assert len(rows_with_date) == (population_size * incidence)

    # Count events per year, then walk backwards from the most recent year:
    # under exponential increase each of the five latest yearly counts must
    # be strictly smaller than the one after it.
    year_counts = (
        result["date"].dt.strftime("%Y").reset_index().groupby("date").count()["index"]
    )
    previous_count = population_size
    for count in list(reversed(year_counts))[:5]:
        assert count < previous_count
        previous_count = count
def test_data_generator_date_uniform():
    """Uniform-rate dates obey the incidence and spread evenly over the range."""
    population_size = 100000
    incidence = 0.5
    return_expectations = {
        "rate": "uniform",
        "incidence": incidence,
        "date": {"earliest": "2020-01-01", "latest": "2020-01-11"},
    }
    result = generate(population_size, **return_expectations)

    # Rows carrying a date are the generated "events"; their number must
    # equal population * incidence exactly.
    rows_with_date = result[~pd.isnull(result["date"])]
    assert len(rows_with_date) == (population_size * incidence)

    # With a uniform rate over a 10-day window, each day should receive
    # roughly a tenth of the events (within 10% relative tolerance).
    per_date_counts = result["date"].reset_index().groupby("date").count()["index"]
    expected_per_day = (population_size * incidence) / 10
    for count in per_date_counts:
        assert isclose(count, expected_per_day, rel_tol=0.1)