def test_suppression_doesnt_affect_later_calculations_on_the_same_data():
    data = pandas.DataFrame({"fish": [2], "litres": [2]})

    m1 = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    r1 = calculate(m1, data)
    assert numpy.isnan(r1.iloc[0]["value"])

    m2 = Measure("ignored-id", numerator="fish", denominator="litres")
    r2 = calculate(m2, data)
    assert r2.iloc[0]["value"] == 1.0


def test_reports_suppression_of_small_values():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame({"fish": [1], "litres": [100]}, index=["bowl"])
    reporter = RecordingReporter()

    calculate(m, data, reporter)

    assert "Suppressed small numbers in column fish" in reporter.msg


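# The tests above and below rely on two helpers defined elsewhere in this test
# module: `calculate` and `RecordingReporter`. A minimal sketch of what they
# are assumed to look like follows, for orientation only. The delegation to a
# `Measure.calculate(data, reporter)` method is an assumption, not something
# stated in this file; the real helpers may differ.


def null_reporter(msg):
    # Default reporter that simply discards progress messages.
    pass


def calculate(measure, data, reporter=null_reporter):
    # Assumed to hand the patient dataframe to the Measure under test and
    # return the resulting measure dataframe.
    return measure.calculate(data, reporter)


class RecordingReporter:
    # Captures the most recent message passed to it, so tests like
    # test_reports_suppression_of_small_values can assert on its contents.
    def __init__(self):
        self.msg = ""

    def __call__(self, msg):
        self.msg = msg

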
def test_suppresses_denominator_if_its_small_enough():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame({"fish": [0], "litres": [4]}, index=["bag"])

    result = calculate(m, data)

    assert numpy.isnan(result.loc["bag"]["litres"])
    assert numpy.isnan(result.loc["bag"]["value"])


def test_suppresses_small_numbers_in_the_numerator():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame({"fish": [1], "litres": [100]}, index=["bowl"])

    result = calculate(m, data)

    assert numpy.isnan(result.loc["bowl"]["fish"])
    assert numpy.isnan(result.loc["bowl"]["value"])


def test_throws_away_unused_columns():
    m = Measure("ignored-id", numerator="fish", denominator="litres")
    data = pandas.DataFrame(
        {"fish": [10], "litres": [1], "colour": ["green"], "clothing": ["trousers"]}
    )
    result = calculate(m, data)
    assert "clothing" not in result.iloc[0]

    m = Measure("ignored-id", numerator="fish", denominator="litres", group_by="colour")
    data = pandas.DataFrame(
        {"fish": [10], "litres": [1], "colour": ["green"], "age": [12]}
    )
    result = calculate(m, data)
    assert "age" not in result.iloc[0]


def test_calculates_quotients():
    m = Measure("ignored-id", numerator="fish", denominator="litres")
    data = pandas.DataFrame(
        {"fish": [10, 20, 50], "litres": [1, 2, 100]},
        index=["small bowl", "large bowl", "pond"],
    )

    result = calculate(m, data)

    assert result.loc["small bowl"]["value"] == 10.0
    assert result.loc["large bowl"]["value"] == 10.0
    assert result.loc["pond"]["value"] == 0.5


def test_groups_into_multiple_buckets():
    m = Measure("ignored-id", numerator="fish", denominator="litres", group_by="colour")
    data = pandas.DataFrame(
        {"fish": [10, 10], "litres": [1, 2], "colour": ["gold", "pink"]}
    )

    result = calculate(m, data)
    result.set_index("colour", inplace=True)

    assert result.loc["gold"]["value"] == 10.0
    assert result.loc["pink"]["value"] == 5.0


def test_reports_suppression_of_extra_values():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {"fish": [2, 10, 8], "litres": [10, 10, 10]}, index=["a", "b", "c"]
    )
    reporter = RecordingReporter()

    calculate(m, data, reporter)

    assert "Additional suppression in column fish" in reporter.msg


def test_doesnt_suppress_zero_values():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {"fish": [0, 1], "litres": [100, 100]}, index=["bowl", "bag"]
    )

    result = calculate(m, data)

    assert result.loc["bowl"]["fish"] == 0
    assert result.loc["bowl"]["value"] == 0


def test_suppresses_all_equal_extra_values_to_reach_threshold():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {"fish": [1, 10, 10], "litres": [10, 10, 10]}, index=["a", "b", "c"]
    )

    result = calculate(m, data)

    assert numpy.isnan(result.loc["a"]["fish"])
    assert numpy.isnan(result.loc["b"]["fish"])
    assert numpy.isnan(result.loc["c"]["fish"])


def test_suppresses_all_small_values_even_if_total_is_way_over_threshold():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {"fish": [2, 2, 2, 2], "litres": [10, 10, 10, 10]}, index=["a", "b", "c", "d"]
    )

    result = calculate(m, data)

    assert numpy.isnan(result.loc["a"]["fish"])
    assert numpy.isnan(result.loc["b"]["fish"])
    assert numpy.isnan(result.loc["c"]["fish"])
    assert numpy.isnan(result.loc["d"]["fish"])


def test_groups_data_together():
    m = Measure("ignored-id", numerator="fish", denominator="litres", group_by="colour")
    data = pandas.DataFrame(
        {"fish": [10, 20], "litres": [1, 2], "colour": ["gold", "gold"]},
        index=["small bowl", "large bowl"],
    )

    result = calculate(m, data)
    result.set_index("colour", inplace=True)

    assert result.loc["gold"]["fish"] == 30
    assert result.loc["gold"]["litres"] == 3
    assert result.loc["gold"]["value"] == 10.0


def test_suppresses_small_numbers_after_grouping():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by="colour",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {
            "fish": [2, 2, 2, 2, 3, 3],
            "litres": [2, 2, 2, 2, 3, 3],
            "colour": ["gold", "gold", "bronze", "bronze", "pink", "pink"],
        }
    )

    result = calculate(m, data)
    result.set_index("colour", inplace=True)

    assert numpy.isnan(result.loc["gold"]["value"])
    assert numpy.isnan(result.loc["bronze"]["value"])
    assert result.loc["pink"]["value"] == 1.0


def test_suppresses_small_numbers_at_threshold_in_the_numerator():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        small_number_suppression=True,
    )
    data = pandas.DataFrame(
        {
            "fish": [
                measure.SMALL_NUMBER_THRESHOLD,
                measure.SMALL_NUMBER_THRESHOLD,
                measure.SMALL_NUMBER_THRESHOLD + 1,
            ],
            "litres": [100, 100, measure.SMALL_NUMBER_THRESHOLD + 1],
        },
        index=["bowl", "box", "bag"],
    )

    result = calculate(m, data)

    assert numpy.isnan(result.loc["bowl"]["fish"])
    assert numpy.isnan(result.loc["box"]["fish"])
    assert result.loc["bag"]["value"] == 1.0


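# A minimal sketch of the suppression rule that the tests above exercise,
# written for illustration only; the real implementation lives in
# cohortextractor.measure and the function and variable names here are
# hypothetical. Non-zero values at or below the threshold are hidden, and if
# the hidden total is itself small enough to be recovered from the column
# total, the smallest remaining value(s) are hidden as well ("additional
# suppression"). `series` is a pandas Series; numpy is already imported at
# module level (the tests above use it).


def suppress_column(series, threshold):
    # Suppress small non-zero values, leaving zeros untouched.
    result = series.astype(float)
    small = (result > 0) & (result <= threshold)
    if not small.any():
        return result
    suppressed_total = result[small].sum()
    result[small] = numpy.nan
    # If the suppressed amount could be reconstructed from the column total,
    # also suppress the next-smallest remaining value(s), including ties.
    while suppressed_total <= threshold:
        remaining = result[result > threshold]
        if remaining.empty:
            break
        ties = result == remaining.min()
        suppressed_total += result[ties].sum()
        result[ties] = numpy.nan
    return result


# For example, with a threshold of 5 (chosen for illustration), the numerator
# in test_reports_suppression_of_extra_values, [2, 10, 8], would end up with
# the 2 suppressed, the 8 additionally suppressed, and only the 10 visible.

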
def test_groups_by_multiple_columns():
    m = Measure(
        "ignored-id",
        numerator="fish",
        denominator="litres",
        group_by=["colour", "nationality"],
    )
    data = pandas.DataFrame(
        {
            "fish": [10, 20, 40, 80],
            "litres": [1, 1, 1, 1],
            "colour": ["gold", "gold", "gold", "pink"],
            "nationality": ["russian", "japanese", "russian", "french"],
        }
    )

    result = calculate(m, data)

    assert result.iloc[0]["colour"] == "gold"
    assert result.iloc[0]["nationality"] == "japanese"
    assert result.iloc[0]["fish"] == 20
    assert result.iloc[1]["colour"] == "gold"
    assert result.iloc[1]["nationality"] == "russian"
    assert result.iloc[1]["fish"] == 50
    assert result.iloc[2]["colour"] == "pink"
    assert result.iloc[2]["nationality"] == "french"
    assert result.iloc[2]["fish"] == 80


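# A minimal sketch of the grouping behaviour exercised by the group_by tests
# above, for illustration only: the group_by column(s) define buckets, the
# numerator and denominator are summed within each bucket, and the value is
# the quotient of those sums. The helper name is hypothetical and is not part
# of the Measure API.


def group_and_sum(data, numerator, denominator, group_by):
    # group_by may be a single column name or a list of column names.
    grouped = data.groupby(group_by)[[numerator, denominator]].sum().reset_index()
    grouped["value"] = grouped[numerator] / grouped[denominator]
    return grouped


# e.g. group_and_sum(data, "fish", "litres", ["colour", "nationality"]) on the
# dataframe above yields rows (gold, japanese, 20), (gold, russian, 50) and
# (pink, french, 80), matching the assertions in test_groups_by_multiple_columns.

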
def test_stats_logging_generate_measures(
    mock_load, _mock_list, _mock_check, logger, tmp_path
):
    import csv

    from cohortextractor.measure import Measure

    measures = [
        Measure(
            id="has_code",
            numerator="has_code",
            denominator="population",
        ),
        Measure(
            id="has_code_one_group",
            numerator="has_code",
            denominator="population",
            group_by="population",
        ),
    ]
    mock_load.return_value = measures

    # initial stats
    expected_initial_logs = [{"measures_count": 2}]

    # set up an expected input file
    input_filepath = tmp_path / "input_2020-01-01.csv"
    with open(input_filepath, "w") as file_to_write:
        writer = csv.writer(file_to_write)
        writer.writerow(["patient_id", "has_code"])
        writer.writerow([1, 1])
        writer.writerow([2, 1])
        writer.writerow([3, 1])
        writer.writerow([4, 0])

    generate_measures(output_dir=tmp_path)

    stats_logs = get_stats_logs(logger.entries)
    memory_logs = get_logs_by_key(stats_logs, "memory")
    measure_date = "2020-01-01"

    expected_timing_logs = [
        dict(
            description="generate_measures",
            input_file="all",
            study_definition="study_definition",
            timing="start",
            state="started",
        ),
        dict(
            description="generate_measures",
            date=measure_date,
            input_file=str(input_filepath),
            study_definition="study_definition",
            timing="start",
            state="started",
        ),
        dict(
            description="Load patient dataframe for measures",
            date=measure_date,
            input_file=str(input_filepath),
            timing="start",
            state="started",
        ),
        dict(
            description="Load patient dataframe for measures",
            date=measure_date,
            input_file=str(input_filepath),
            timing="stop",
            state="ok",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code",
            date=measure_date,
            timing="start",
            state="started",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code",
            date=measure_date,
            timing="stop",
            state="ok",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code_one_group",
            date=measure_date,
            timing="start",
            state="started",
        ),
        dict(
            description="Calculate measure",
            measure_id="has_code_one_group",
            date=measure_date,
            timing="stop",
            state="ok",
        ),
        dict(
            description="generate_measures",
            date=measure_date,
            input_file=str(input_filepath),
            study_definition="study_definition",
            timing="stop",
            state="ok",
        ),
        dict(
            description="generate_measures",
            input_file="all",
            study_definition="study_definition",
            timing="stop",
            state="ok",
        ),
    ]
    assert_stats_logs(logger, expected_initial_logs + memory_logs, expected_timing_logs)

    expected_memory_logs = [
        ("patient_df", measure_date, "has_code"),
        ("measure_df", measure_date, "has_code"),
        ("measure_df", measure_date, "has_code_one_group"),
    ]
    for i, memory_log in enumerate(memory_logs):
        df, measure_date, measure_id = expected_memory_logs[i]
        assert memory_log["dataframe"] == df
        assert memory_log["date"] == measure_date
        assert memory_log["measure_id"] == measure_id