def total_tests(row, log: "ResultLog"):
    """Check that positive and negative sum to the reported totalTestResults.

    Logs an error on the row's state when the formula does not hold.

    NOTE: I don't know where this field is in the sheet so this test is
    not used right now - Josh
    """
    n_pos, n_neg, n_tests = row.positive, row.negative, row.totalTestResults

    # delta > 0 means the reported total exceeds the sum of its parts
    n_diff = n_tests - (n_pos + n_neg)
    if n_diff != 0:
        log.error(row.state,
                  f"Formula broken -> Positive ({n_pos}) + Negative ({n_neg}) != Total Tests ({n_tests}), delta = {n_diff}")
def death_rate(row, log: "ResultLog"):
    """Check that deaths are a plausibly small share of total test results.

    Large samples (>100 results) must stay under 5%; small samples are
    noisy, so they get a looser 10% threshold.  Logs an error on the
    row's state when the rate is exceeded.  A zero total never errors.
    """
    n_pos, n_neg, n_deaths = row.positive, row.negative, row.death
    n_tot = n_pos + n_neg

    percent_deaths = 100.0 * n_deaths / n_tot if n_tot > 0 else 0.0

    # small samples are noisy, so allow a higher rate before flagging
    threshold = 5.0 if n_tot > 100 else 10.0
    if percent_deaths > threshold:
        # fixed label: this count is deaths, not positives
        log.error(row.state,
                  f"Too many deaths {percent_deaths:.0f}% (deaths={n_deaths:,}, total={n_tot:,})")
def positives_rate(row, log: "ResultLog"):
    """Check that positives compose a plausibly small share of test results.

    Large samples (>100 results) must stay under 40% positive; small
    samples get a looser 80% threshold.  Tiny positive counts (<=20)
    are never flagged.  Logs an error on the row's state when exceeded.
    """
    # row.death was previously read here but never used; dropped.
    n_pos, n_neg = row.positive, row.negative
    n_tot = n_pos + n_neg

    percent_pos = 100.0 * n_pos / n_tot if n_tot > 0 else 0.0

    # small samples are noisy, so allow a higher rate before flagging
    threshold = 40.0 if n_tot > 100 else 80.0
    if percent_pos > threshold and n_pos > 20:
        log.error(row.state,
                  f"Too many positive {percent_pos:.0f}% (positive={n_pos:,}, total={n_tot:,})")
def counties_rollup_to_state(row, counties: pd.DataFrame, log: "ResultLog"):
    """
    Check that county totals from NYT, CSBS, CDS datasets are
    about equal to the reported state totals.

    Metrics compared are:
        - positive cases
        - patient deaths

    The closest dataset aggregate is compared against the state value;
    an error is logged when the relative gap exceeds the configured
    COUNTY_ERROR_THRESHOLDS.
    """
    # positives: only meaningful once the state count is large enough
    if row.positive > 100:
        # Take the actual closest county aggregate.  The previous
        # reconstruction (error * state + state) lost the sign of the
        # difference and always reported a value >= the state total.
        diffs = counties["cases"] - row.positive
        closest_pos = int(counties["cases"].loc[diffs.abs().idxmin()])
        pos_error = abs(closest_pos - row.positive) / row.positive
        if pos_error > COUNTY_ERROR_THRESHOLDS["positive"]:
            log.error(row.state,
                      f"county aggregate for positive tests does not match state totals (state: {row.positive}, county: {closest_pos})")

    # deaths: only meaningful once the state count is large enough
    if row.death > 20:
        diffs = counties["deaths"] - row.death
        closest_death = int(counties["deaths"].loc[diffs.abs().idxmin()])
        death_error = abs(closest_death - row.death) / row.death
        if death_error > COUNTY_ERROR_THRESHOLDS["death"]:
            log.error(row.state,
                      f"county aggregate for patient deaths does not match state totals (state: {row.death}, county: {closest_death})")
def expected_positive_increase( current: pd.DataFrame, history: pd.DataFrame, log: ResultLog, context: str, config: QCConfig=None):
    """Flag today's positives when outside the forecast band.

    Fits state-level daily positives to an exponential curve (upper
    bound) and a linear curve (lower bound), scaled by FIT_THRESHOLDS,
    then logs an error if the actual value falls outside that range.

    TODO: Eventually these curves will NOT be exp (perhaps logistic?)
          Useful to know which curves have been "leveled" but from a
          data quality perspective, this check would become annoying
    """
    if not config:
        config = QCConfig()

    # exclude the day being forecast from the history we fit against
    forecast_date = current.lastUpdateEt.to_pydatetime().strftime('%Y%m%d')
    history = history.loc[history["date"].astype(str) != forecast_date]

    forecast = Forecast()
    forecast.fit(history)
    forecast.project(current)

    # persist or plot the fit, depending on configuration
    if config.save_results:
        save_forecast_hd5(forecast, config.results_dir)
    elif config.plot_models:
        plot_to_file(forecast, f"{config.images_dir}/{context}", FIT_THRESHOLDS)

    actual_value, expected_linear, expected_exp = forecast.results
    lower = int(FIT_THRESHOLDS[0] * expected_linear)
    upper = int(FIT_THRESHOLDS[1] * expected_exp)

    if lower <= actual_value <= upper:
        return

    direction = "drop" if actual_value < expected_linear else "increase"
    log.error(forecast.state,
              f"unexpected {direction} in positive cases ({actual_value:,}) for {forecast.date}, "
              f"expected between {lower:,} and {upper:,}")
def checkers_initials(row, log: ResultLog):
    """Confirm that checker initials are records"""
    if row.phase == "inactive":
        return

    checked_at = row.lastCheckEt.to_pydatetime()
    if checked_at <= START_OF_TIME:
        # check timestamp never set -> nothing to validate
        return

    near_release = row.phase in ["publish", "update"]
    target_date = row.targetDateEt.to_pydatetime()
    hours_to_target = (target_date - checked_at).total_seconds() / (60.0 * 60.0)

    if row.checker.strip() == "":
        if 0 < hours_to_target < 5:
            # checked recently but initials left blank -> hard error
            when = checked_at.strftime('%m/%d %H:%M')
            log.error(row.state, f"missing checker initials but checked date set recently (at {when})")
        elif near_release:
            log.error(row.state, "missing checker initials")
        else:
            log.info(row.state, "missing checker initials")
        return

    if row.doubleChecker.strip() == "":
        if near_release:
            log.error(row.state, "missing double-checker initials")
        else:
            log.info(row.state, "Missing double-checker initials")
def increasing_values(row, df: pd.DataFrame, log: ResultLog):
    """Check that new values are not less than previous values.

    df contains the historical values for this state (newest first).
    For each cumulative metric, errors when the value decreased, flags
    values that have been stuck unchanged, and warns when the percent
    increase falls outside the expected band.
    """
    # only look at history strictly before the date being checked
    df = df[df.date < row.targetDate]

    dict_row = row._asdict()

    for c in ["positive", "negative", "death", "total"]:
        val = dict_row[c]

        vec = df[c].values
        prev_val = vec[0] if vec.size > 0 else 0

        # cumulative metrics must never decrease
        if val < prev_val:
            log.error(row.state, f"{c} value ({val:,}) is less than prior value ({prev_val:,})")

        # allow value to be the same if below a threshold
        if val < IGNORE_THRESHOLDS[c]: continue

        phase = row.phase
        checked_at = row.lastCheckEt.to_pydatetime()
        is_check_field_set = checked_at > START_OF_TIME

        if val == prev_val:
            n_days, d = days_since_change(val, df[c], df["date"])
            if n_days >= 0:
                # format yyyymmdd int as mm/dd for the message
                d = str(d)
                d = d[4:6] + "/" + d[6:8]
                if prev_val >= 20 and (is_check_field_set or phase in ["publish", "update"]):
                    log.error(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
                else:
                    log.warning(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
            else:
                log.error(row.state, f"{c} value ({val:,}) constant for all time")
            continue

        # fix: no prior value (prev_val == 0) would divide by zero below
        if prev_val == 0: continue

        p_observed = 100.0 * val / prev_val - 100.0

        #TODO: estimate expected increase from recent history
        p_min, p_max = EXPECTED_PERCENT_THRESHOLDS[c]
        if p_observed < p_min or p_observed > p_max:
            log.warning(row.state, f"{c} value ({val:,}) is a {p_observed:.0f}% increase, expected: {p_min:.0f} to {p_max:.0f}%")
def less_recovered_than_positive(row, log: ResultLog):
    """Check that we don't have more recovered than positive"""
    recovered, positive = row.recovered, row.positive
    if recovered <= positive:
        return
    log.error(row.state, f"More recovered than positive (recovered={recovered:,}, positive={positive:,})")
def check_working(ds: DataSource, config: QCConfig) -> ResultLog:
    """
    Check unpublished results in the working google sheet
    https://docs.google.com/spreadsheets/d/1MvvbHfnjF67GnYUDJJiNYUmGco5KQ9PW0ZRnEP9ndlU/edit#gid=1777138528

    Runs every per-row check over the working sheet, collecting results
    into a ResultLog.  A failing check never aborts the run: exceptions
    are logged and recorded against the row's state.
    """

    logger.info("check working")

    log = ResultLog()

    # targetDate is the date that the dev sheet is currently working on.
    # phase is what part of their process they are in.
    # targetDateEt is the time that should be used on any 'staleness' checks

    d, phase = checks.current_time_and_phase()

    ds._target_date = d

    df = ds.working
    # encode target date as a yyyymmdd integer to match the sheet's format
    df["targetDate"] = d.year * 10000 + d.month * 100 + d.day
    df["targetDateEt"] = d
    df["phase"] = phase

    logger.info(f"Running with target date = {d} and phase = {phase}")

    # *** WHEN YOU CHANGE A CHECK THAT IMPACTS WORKING, MAKE SURE TO UPDATE THE EXCEL TRACKING DOCUMENT ***
    for row in df.itertuples():
        try:
            checks.total(row, log)
            #checks.total_tests(row, log)
            checks.last_update(row, log)
            checks.last_checked(row, log)
            checks.checkers_initials(row, log)
            checks.positives_rate(row, log)
            checks.death_rate(row, log)
            checks.less_recovered_than_positive(row, log)
            checks.pendings_rate(row, log)

            df_history = ds.history[ds.history.state == row.state]
            checks.increasing_values(row, df_history, log)
            checks.expected_positive_increase(row, df_history, log, "working", config)

            df_county_rollup = ds.county_rollup[ds.county_rollup.state == row.state]
            if not df_county_rollup.empty:
                checks.counties_rollup_to_state(row, df_county_rollup, log)
        except Exception as ex:
            logger.exception(ex)
            log.error(row.state, f"{ex}")

    # run plotting loop at the end, instead of during the run
    if config.plot_models and config.save_results:
        for row in df.itertuples():
            try:
                forecast = load_forecast_hd5(config.results_dir, row.state, row.targetDate)
                if forecast is None:
                    # typo fix: was "Fould not load"
                    logger.warning(f"Could not load forecast for {row.state}/{row.targetDate}")
                else:
                    plot_to_file(forecast, f"{config.images_dir}/working", checks.FIT_THRESHOLDS)
            except Exception as ex:
                logger.exception(ex)
                log.error(row.state, f"{ex}")

    return log