def monotonically_increasing(df: pd.DataFrame, log: "ResultLog"):
    """Check that timeseries values are monotonically increasing.

    Input is expected to be the values for a single state; raises if the
    frame contains more than one state.  Any day-over-day decrease in a
    cumulative count column is logged as an error with the offending dates.
    """
    columns_to_check = ["positive", "negative", "hospitalized", "death"]

    state = df["state"].min()
    if state != df["state"].max():
        raise Exception("Expected input to be for a single state")

    # input is already filtered to a single state, so sort and lag by date
    # alone (the old groupby("state") was redundant -- see prior TODO)
    df = df.sort_values("date", ascending=True)
    df_lagged = df[columns_to_check] \
        .shift(1) \
        .rename(columns=lambda c: c + "_lag")

    df_comparison = df.merge(df_lagged, left_index=True, right_index=True, how="left")

    # check that all the counts are >= the previous day
    for col in columns_to_check:
        # build the "decreased vs. yesterday" mask once per column
        decreased = df_comparison[f"{col}_lag"] > df_comparison[col]
        if decreased.any():
            error_dates = df_comparison.loc[decreased]["date"]
            error_dates_str = error_dates.astype(str).str.cat(sep=", ")
            log.error(state, f"{col} values decreased from the previous day (on {error_dates_str})")
def checkers_initials(row, log: ResultLog):
    """Confirm that checker initials are recorded"""
    if row.phase == "inactive":
        return

    checked_at = row.lastCheckEt.to_pydatetime()
    if checked_at <= START_OF_TIME:
        # never checked -- nothing to attribute initials to
        return

    near_release = row.phase in ["publish", "update"]
    hours_to_target = (row.targetDateEt.to_pydatetime() - checked_at).total_seconds() / 3600.0

    if row.checker.strip() == "":
        if 0 < hours_to_target < 5:
            checked_str = checked_at.strftime('%m/%d %H:%M')
            log.error(row.state, f"missing checker initials but checked date set recently (at {checked_str})")
        elif near_release:
            log.error(row.state, "missing checker initials")
        else:
            log.info(row.state, "missing checker initials")
        return

    if row.doubleChecker.strip() == "":
        if near_release:
            log.error(row.state, "missing double-checker initials")
        else:
            log.info(row.state, "Missing double-checker initials")
        return
def total(row, log: "ResultLog"):
    """Check that positive, negative, and pending sum to the reported total."""
    n_pos, n_neg, n_pending, n_tot = \
        row.positive, row.negative, row.pending, row.total
    n_diff = n_tot - (n_pos + n_neg + n_pending)
    if n_diff != 0:
        # fixed "Postive" typo in the user-facing message
        log.error(row.state, f"Formula broken -> Positive ({n_pos}) + Negative ({n_neg}) + Pending ({n_pending}) != Total ({n_tot}), delta = {n_diff}")
def last_update(row, log: ResultLog):
    """Source has updated within a reasonable timeframe"""
    # staleness is measured from the last source update to the target date
    stale = row.targetDateEt.to_pydatetime() - row.lastUpdateEt.to_pydatetime()
    days = stale.total_seconds() / (24 * 60.0 * 60)

    if days >= 1.5:
        log.error(row.state, f"source hasn't updated in {days:.1f} days")
def total_tests(row, log: "ResultLog"):
    """Check that positive and negative sum to the reported totalTestResults."""
    # note -- I don't know where this field is in the sheet so this test is not used right now - Josh
    n_pos, n_neg, n_tests = \
        row.positive, row.negative, row.totalTestResults
    n_diff = n_tests - (n_pos + n_neg)
    if n_diff != 0:
        # fixed "Postive" typo in the user-facing message
        log.error(row.state, f"Formula broken -> Positive ({n_pos}) + Negative ({n_neg}) != Total Tests ({n_tests}), delta = {n_diff}")
def pendings_rate(row, log: "ResultLog"):
    """Check that pending tests are not an outsized share of results.

    "total" here is positive + negative.  Larger samples (> 1000) get the
    stricter 20% threshold; smaller samples are allowed up to 80% pending.
    """
    n_pos, n_neg, n_pending = row.positive, row.negative, row.pending
    n_tot = n_pos + n_neg
    percent_pending = 100.0 * n_pending / n_tot if n_tot > 0 else 0.0

    # the two original branches built the identical message -- only the
    # threshold differs, so select it once
    threshold = 20.0 if n_tot > 1000 else 80.0
    if percent_pending > threshold:
        log.warning(row.state, f"too many pending {percent_pending:.0f}% (pending={n_pending:,}, total={n_tot:,})")
def death_rate(row, log: "ResultLog"):
    """Check that deaths are a small share of test results.

    Larger samples (> 100 results) must stay under 5% deaths; smaller
    samples under 10%.
    """
    n_pos, n_neg, n_deaths = row.positive, row.negative, row.death
    n_tot = n_pos + n_neg
    percent_deaths = 100.0 * n_deaths / n_tot if n_tot > 0 else 0.0

    # larger samples get the stricter threshold; both branches emit the
    # same message, so select the threshold once
    threshold = 5.0 if n_tot > 100 else 10.0
    if percent_deaths > threshold:
        # label the count as deaths -- the old message mislabeled it "positive="
        log.error(row.state, f"Too many deaths {percent_deaths:.0f}% (deaths={n_deaths:,}, total={n_tot:,})")
def positives_rate(row, log: "ResultLog"):
    """Check that positives are not an outsized share of test results.

    Larger samples (> 100 results) must stay under 40% positive; smaller
    samples under 80%.  Very small positive counts (<= 20) are ignored.
    """
    # the original also read row.death into an unused local; dropped
    n_pos, n_neg = row.positive, row.negative
    n_tot = n_pos + n_neg
    percent_pos = 100.0 * n_pos / n_tot if n_tot > 0 else 0.0

    # both branches emit the same message, so select the threshold once
    threshold = 40.0 if n_tot > 100 else 80.0
    if percent_pos > threshold and n_pos > 20:
        log.error(row.state, f"Too many positive {percent_pos:.0f}% (positive={n_pos:,}, total={n_tot:,})")
def last_checked(row, log: ResultLog):
    """Data was checked within a reasonable timeframe"""
    target_date = row.targetDateEt.to_pydatetime()
    updated_at = row.lastUpdateEt.to_pydatetime()
    checked_at = row.lastCheckEt.to_pydatetime()

    if checked_at <= START_OF_TIME:
        # never checked at all -- severity depends on the workflow phase
        phase = row.phase
        if phase == "inactive":
            pass
        elif phase in ["publish", "update"]:
            log.error(row.state, "check needed")
        elif phase in ["prepare", "cleanup"]:
            log.info(row.state, "check needed")
        return

    # source updated after the last check?
    hours_unchecked = (updated_at - checked_at).total_seconds() / 3600.0
    if hours_unchecked > 2.0:
        s_updated = updated_at.strftime('%m/%d %H:%M')
        s_checked = checked_at.strftime('%m/%d %H:%M')
        log.error(row.state, f"updated since last check: {hours_unchecked:.0f} hours ago at {s_updated}, checked at {s_checked}")
        return

    # NOTE(review): this delta is measured from updated_at, though the
    # message talks about checking -- confirm that is intentional
    hours_to_target = (target_date - updated_at).total_seconds() / 3600.0
    if hours_to_target > 12.0:
        s_checked = checked_at.strftime('%m/%d %H:%M')
        log.warning(row.state, f"source has not been checked in {hours_to_target:.0f} hours at {s_checked}")
        return
def increasing_values(row, df: pd.DataFrame, log: ResultLog):
    """Check that new values are >= the previous values.

    df contains the historical values for this state (newest first).
    Flat values are tolerated only below a per-metric ignore threshold;
    otherwise flat or out-of-range changes are logged.
    """
    # only consider history strictly before the date being checked
    df = df[df.date < row.targetDate]

    dict_row = row._asdict()

    for c in ["positive", "negative", "death", "total"]:
        val = dict_row[c]

        # history is newest-first, so vec[0] is the most recent prior value;
        # default to 0 when there is no history at all
        vec = df[c].values
        prev_val = vec[0] if vec.size > 0 else 0

        if val < prev_val:
            log.error(row.state, f"{c} value ({val:,}) is less than prior value ({prev_val:,})")

        # allow value to be the same if below a threshold
        if val < IGNORE_THRESHOLDS[c]:
            continue

        phase = row.phase
        checked_at = row.lastCheckEt.to_pydatetime()
        is_check_field_set = checked_at > START_OF_TIME

        if val == prev_val:
            # how long has the value been flat?  n_days < 0 means it never
            # changed within the available history
            n_days, d = days_since_change(val, df[c], df["date"])
            if n_days >= 0:
                # d is a YYYYMMDD-style value; format as MM/DD
                d = str(d)
                d = d[4:6] + "/" + d[6:8]

                # a flat large value is an error once the row was checked or
                # we are near release; otherwise just a warning
                if prev_val >= 20 and (is_check_field_set or phase in ["publish", "update"]):
                    log.error(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
                else:
                    log.warning(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
            else:
                log.error(row.state, f"{c} value ({val:,}) constant for all time")
            continue

        # NOTE(review): if prev_val == 0 here (no history) while val is at or
        # above the ignore threshold, this divides by zero -- confirm callers
        # always supply history, or guard prev_val > 0
        p_observed = 100.0 * val / prev_val - 100.0

        #TODO: estimate expected increase from recent history
        p_min, p_max = EXPECTED_PERCENT_THRESHOLDS[c]
        if p_observed < p_min or p_observed > p_max:
            log.warning(row.state, f"{c} value ({val:,}) is a {p_observed:.0f}% increase, expected: {p_min:.0f} to {p_max:.0f}%")
def counties_rollup_to_state(row, counties: pd.DataFrame, log: "ResultLog"):
    """
    Check that county totals from NYT, CSBS, CDS datasets are
    about equal to the reported state totals.

    Metrics compared are:
        - positive cases
        - patient deaths
    """
    if row.positive > 100:
        case_diffs = (counties["cases"] - row.positive).abs()
        pos_error = case_diffs.min() / row.positive
        if pos_error > COUNTY_ERROR_THRESHOLDS["positive"]:
            # report the actual closest county aggregate.  The old
            # reconstruction (|error| * state + state) always produced a value
            # above the state total, even when the closest aggregate was below it.
            closest_pos = int(counties["cases"].loc[case_diffs.idxmin()])
            log.error(row.state, f"county aggregate for positive tests does not match state totals (state: {row.positive}, county: {closest_pos})")

    if row.death > 20:
        death_diffs = (counties["deaths"] - row.death).abs()
        death_error = death_diffs.min() / row.death
        if death_error > COUNTY_ERROR_THRESHOLDS["death"]:
            closest_death = int(counties["deaths"].loc[death_diffs.idxmin()])
            log.error(row.state, f"county aggregate for patient deaths does not match state totals (state: {row.death}, county: {closest_death})")
def expected_positive_increase(current: pd.DataFrame, history: pd.DataFrame,
                               log: ResultLog, context: str, config: QCConfig=None):
    """Flag state-level positives that fall outside the forecast band.

    The history is fit to an exponential curve (upper bound) and a linear
    curve (lower bound); the current value must land between the scaled
    bounds.  TODO: Eventually these curves will NOT be exp (perhaps
    logistic?).  Useful to know which curves have been "leveled", but from
    a data-quality perspective that check would become annoying.
    """
    if not config:
        config = QCConfig()

    # exclude any history row for the day being forecast
    day_key = current.lastUpdateEt.to_pydatetime().strftime('%Y%m%d')
    history = history.loc[history["date"].astype(str) != day_key]

    forecast = Forecast()
    forecast.fit(history)
    forecast.project(current)

    if config.save_results:
        save_forecast_hd5(forecast, config.results_dir)
    elif config.plot_models:
        plot_to_file(forecast, f"{config.images_dir}/{context}", FIT_THRESHOLDS)

    actual_value, expected_linear, expected_exp = forecast.results
    min_value = int(FIT_THRESHOLDS[0] * expected_linear)
    max_value = int(FIT_THRESHOLDS[1] * expected_exp)

    if not (min_value <= actual_value <= max_value):
        direction = "drop" if actual_value < expected_linear else "increase"
        log.error(forecast.state,
                  f"unexpected {direction} in positive cases ({actual_value:,}) for {forecast.date}, "
                  f"expected between {min_value:,} and {max_value:,}")
def check_history(ds: DataSource) -> ResultLog:
    """Run the history checks for every state and collect the results."""
    logger.info("check history")

    history = ds.history
    log = ResultLog()

    for state in history["state"].drop_duplicates().values:
        checks.monotonically_increasing(history.loc[history["state"] == state], log)

    return log
def check_current(ds: DataSource, config: QCConfig, publish_date: int = 20200330) -> ResultLog:
    """
    Check the current published results.

    publish_date is the target publish date as a YYYYMMDD int; it was
    previously hard-coded, and defaults to the old hard-coded value so
    existing callers are unchanged.
    """
    logger.info("check current")
    log = ResultLog()
    df = ds.current

    logger.error(f" [date is hard-coded to {publish_date}]")

    # setup run settings equivalent to publish date at 5PM (Eastern)
    s = str(publish_date)
    y, m, d = int(s[0:4]), int(s[4:6]), int(s[6:8])
    publish_timestamp = udatetime.naivedatetime_as_eastern(datetime(y, m, d, 12 + 5))

    ds._target_date = publish_timestamp

    df["targetDate"] = publish_date
    df["targetDateEt"] = publish_timestamp
    df["lastCheckEt"] = df["targetDateEt"]
    df["phase"] = "publish"

    for row in df.itertuples():
        checks.total(row, log)
        checks.last_update(row, log)
        checks.positives_rate(row, log)
        checks.death_rate(row, log)
        checks.pendings_rate(row, log)

        df_history = ds.history[ds.history.state == row.state]
        checks.increasing_values(row, df_history, log)
        checks.expected_positive_increase(row, df_history, log, "current", config)

        df_county_rollup = ds.county_rollup[ds.county_rollup.state == row.state]
        if not df_county_rollup.empty:
            checks.counties_rollup_to_state(row, df_county_rollup, log)

    return log
def less_recovered_than_positive(row, log: ResultLog):
    """Check that we don't have more recovered than positive"""
    recovered, positive = row.recovered, row.positive
    if recovered > positive:
        log.error(row.state, f"More recovered than positive (recovered={recovered:,}, positive={positive:,})")
def check_working(ds: DataSource, config: QCConfig) -> ResultLog:
    """
    Check unpublished results in the working google sheet
    https://docs.google.com/spreadsheets/d/1MvvbHfnjF67GnYUDJJiNYUmGco5KQ9PW0ZRnEP9ndlU/edit#gid=1777138528
    """
    logger.info("check working")
    log = ResultLog()

    # targetDate is the date that the dev sheet is currently working on.
    # phase is what part of their process they are in.
    # targetDateEt is the time that should be used on any 'staleness' checks
    d, phase = checks.current_time_and_phase()

    ds._target_date = d

    df = ds.working
    df["targetDate"] = d.year * 10000 + d.month * 100 + d.day
    df["targetDateEt"] = d
    df["phase"] = phase

    logger.info(f"Running with target date = {d} and phase = {phase}")

    # *** WHEN YOU CHANGE A CHECK THAT IMPACTS WORKING, MAKE SURE TO UPDATE THE EXCEL TRACKING DOCUMENT ***
    for row in df.itertuples():
        try:
            checks.total(row, log)
            #checks.total_tests(row, log)
            checks.last_update(row, log)
            checks.last_checked(row, log)
            checks.checkers_initials(row, log)
            checks.positives_rate(row, log)
            checks.death_rate(row, log)
            checks.less_recovered_than_positive(row, log)
            checks.pendings_rate(row, log)

            df_history = ds.history[ds.history.state == row.state]
            checks.increasing_values(row, df_history, log)
            checks.expected_positive_increase(row, df_history, log, "working", config)

            df_county_rollup = ds.county_rollup[ds.county_rollup.state == row.state]
            if not df_county_rollup.empty:
                checks.counties_rollup_to_state(row, df_county_rollup, log)
        except Exception as ex:
            # boundary handler: record the failure and keep checking other states
            logger.exception(ex)
            log.error(row.state, f"{ex}")

    # run loop at end, instead of during run
    if config.plot_models and config.save_results:
        for row in df.itertuples():
            try:
                forecast = load_forecast_hd5(config.results_dir, row.state, row.targetDate)
                if forecast is None:
                    # fixed "Fould" typo in the log message
                    logger.warning(f"Could not load forecast for {row.state}/{row.targetDate}")
                else:
                    plot_to_file(forecast, f"{config.images_dir}/working", checks.FIT_THRESHOLDS)
            except Exception as ex:
                logger.exception(ex)
                log.error(row.state, f"{ex}")

    return log