Exemple #1
0
def monotonically_increasing(df: pd.DataFrame, log: ResultLog):
    """Check that timeseries values are monotonically increasing

    Input is expected to be the values for a single state
    """

    columns_to_check = ["positive", "negative","hospitalized", "death"]

    state = df["state"].min()
    if state != df["state"].max():
        raise Exception("Expected input to be for a single state")

    # TODO: don't group on state -- this is already filtered to a single state
    df = df.sort_values(["state", "date"], ascending=True)
    df_lagged = df.groupby("state")[columns_to_check] \
        .shift(1) \
        .rename(columns=lambda c: c+"_lag")

    df_comparison = df.merge(df_lagged, left_index=True, right_index=True, how="left")

    # check that all the counts are >= the previous day
    for col in columns_to_check:
        if (df_comparison[f"{col}_lag"] > df_comparison[col]).any():
            error_dates = df_comparison.loc[(df_comparison[f"{col}_lag"] > df_comparison[col])]["date"]
            error_dates_str = error_dates.astype(str).str.cat(sep=", ")

            log.error(state, f"{col} values decreased from the previous day (on {error_dates_str})")
Exemple #2
0
def checkers_initials(row, log: ResultLog):
    """Confirm that checker initials are records"""

    phase = row.phase
    if phase == "inactive": return

    target_date = row.targetDateEt.to_pydatetime()
    checked_at = row.lastCheckEt.to_pydatetime()
    if checked_at <= START_OF_TIME: return

    is_near_release = phase in ["publish", "update"]

    checker = row.checker.strip()
    doubleChecker = row.doubleChecker.strip()

    delta_hours = (target_date - checked_at).total_seconds() / (60.0 * 60.0)

    if checker == "":
        if 0 < delta_hours < 5:
            s_checked = checked_at.strftime('%m/%d %H:%M')
            log.error(row.state, f"missing checker initials but checked date set recently (at {s_checked})")
        elif is_near_release:
            log.error(row.state, f"missing checker initials")
        else:
            log.info(row.state, f"missing checker initials")
        return
    if doubleChecker == "":
        if is_near_release:
            log.error(row.state, f"missing double-checker initials")
        else:
            log.info(row.state, f"Missing double-checker initials")
        return
Exemple #3
0
def total(row, log: ResultLog):
    """Check that pendings, positive, and negative sum to the reported total"""

    n_pos, n_neg, n_pending, n_tot = \
        row.positive, row.negative, row.pending, row.total

    n_diff = n_tot - (n_pos + n_neg + n_pending)
    if n_diff != 0:
        log.error(row.state, f"Formula broken -> Postive ({n_pos}) + Negative ({n_neg}) + Pending ({n_pending}) != Total ({n_tot}), delta = {n_diff}")
Exemple #4
0
def last_update(row, log: ResultLog):
    """Source has updated within a reasonable timeframe"""

    updated_at = row.lastUpdateEt.to_pydatetime()
    target_time = row.targetDateEt.to_pydatetime()
    delta = target_time - updated_at
    days = delta.total_seconds() / (24 * 60.0 * 60)

    if days >= 1.5:
        log.error(row.state, f"source hasn't updated in {days:.1f} days")
Exemple #5
0
def total_tests(row, log: ResultLog):
    """Check that positive, and negative sum to the reported totalTest"""

    # note -- I don't know where this field is in the sheet so this test is not used right now - Josh

    n_pos, n_neg, n_tests = \
        row.positive, row.negative, row.totalTestResults

    n_diff = n_tests - (n_pos + n_neg)
    if n_diff != 0:
        log.error(row.state, f"Formula broken -> Postive ({n_pos}) + Negative ({n_neg}) != Total Tests ({n_tests}), delta = {n_diff}")
Exemple #6
0
def pendings_rate(row, log: ResultLog):
    """Check that pendings are not more than 20% of total"""

    n_pos, n_neg, n_pending = row.positive, row.negative, row.pending
    n_tot = n_pos + n_neg
    percent_pending = 100.0 * n_pending / n_tot if n_tot > 0 else 0.0

    if n_tot > 1000:
        if percent_pending > 20.0:
            log.warning(row.state, f"too many pending {percent_pending:.0f}% (pending={n_pending:,}, total={n_tot:,})")
    else:
        if percent_pending > 80.0:
            log.warning(row.state, f"too many pending {percent_pending:.0f}% (pending={n_pending:,}, total={n_tot:,})")
Exemple #7
0
def death_rate(row, log: ResultLog):
    """Check that deaths are <5% of test results"""

    n_pos, n_neg, n_deaths = row.positive, row.negative, row.death
    n_tot = n_pos + n_neg

    percent_deaths = 100.0 * n_deaths / n_tot if n_tot > 0 else 0.0
    if n_tot > 100:
        if percent_deaths > 5.0:
            log.error(row.state, f"Too many deaths {percent_deaths:.0f}% (positive={n_deaths:,}, total={n_tot:,})")
    else:
        if percent_deaths > 10.0:
            log.error(row.state, f"Too many deaths {percent_deaths:.0f}% (positive={n_deaths:,}, total={n_tot:,})")
Exemple #8
0
def positives_rate(row, log: ResultLog):
    """Check that positives compose <20% test results"""

    n_pos, n_neg, n_deaths = row.positive, row.negative, row.death
    n_tot = n_pos + n_neg

    percent_pos = 100.0 * n_pos / n_tot if n_tot > 0 else 0.0
    if n_tot > 100:
        if percent_pos > 40.0 and n_pos > 20:
            log.error(row.state, f"Too many positive {percent_pos:.0f}% (positive={n_pos:,}, total={n_tot:,})")
    else:
        if percent_pos > 80.0 and n_pos > 20:
            log.error(row.state, f"Too many positive {percent_pos:.0f}% (positive={n_pos:,}, total={n_tot:,})")
Exemple #9
0
def last_checked(row, log: ResultLog):
    """Data was checked within a reasonable timeframe"""

    target_date = row.targetDateEt.to_pydatetime()
    updated_at = row.lastUpdateEt.to_pydatetime()
    checked_at = row.lastCheckEt.to_pydatetime()

    if checked_at <= START_OF_TIME:
        phase = row.phase
        if phase == "inactive":
            pass
        elif phase in ["publish", "update"]:
            log.error(row.state, f"check needed")
        elif phase in ["prepare", "cleanup"]:
            log.info(row.state, f"check needed")
        return

    delta = updated_at - checked_at
    hours = delta.total_seconds() / (60.0 * 60)
    if hours > 2.0:
        s_updated = updated_at.strftime('%m/%d %H:%M')
        s_checked = checked_at.strftime('%m/%d %H:%M')
        log.error(row.state, f"updated since last check: {hours:.0f} hours ago at {s_updated}, checked at {s_checked}")
        return

    delta = target_date - updated_at
    hours = delta.total_seconds() / (60.0 * 60)
    if hours > 12.0:
        s_checked = checked_at.strftime('%m/%d %H:%M')
        log.warning(row.state, f"source has not been checked in {hours:.0f} hours at {s_checked}")
        return
Exemple #10
0
def increasing_values(row, df: pd.DataFrame, log: ResultLog):
    """Check that new values more than previous values

    df contains the historical values (newest first).  offset controls how many days to look back.
    """

    df = df[df.date < row.targetDate]

    #print(df)
    #exit(-1)

    dict_row = row._asdict()

    for c in ["positive", "negative", "death", "total"]:
        val = dict_row[c]
        vec = df[c].values
        prev_val = vec[0] if vec.size > 0 else 0

        if val < prev_val:
            log.error(row.state, f"{c} value ({val:,}) is less than prior value ({prev_val:,})")

        # allow value to be the same if below a threshold
        if val < IGNORE_THRESHOLDS[c]: continue

        phase = row.phase
        checked_at = row.lastCheckEt.to_pydatetime()
        is_check_field_set = checked_at > START_OF_TIME

        if val == prev_val:
            n_days, d = days_since_change(val, df[c], df["date"])
            if n_days >= 0:
                d = str(d)
                d = d[4:6] + "/" + d[6:8]

                if prev_val >= 20 and (is_check_field_set or phase in ["publish", "update"]):
                    log.error(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
                else:
                    log.warning(row.state, f"{c} value ({val:,}) has not changed since {d} ({n_days} days)")
            else:
                log.error(row.state, f"{c} value ({val:,}) constant for all time")
            continue

        p_observed = 100.0 * val / prev_val - 100.0

        #TODO: estimate expected increase from recent history
        p_min, p_max = EXPECTED_PERCENT_THRESHOLDS[c]
        if p_observed < p_min or p_observed > p_max:
            log.warning(row.state, f"{c} value ({val:,}) is a {p_observed:.0f}% increase, expected: {p_min:.0f} to {p_max:.0f}%")
Exemple #11
0
def counties_rollup_to_state(row, counties: pd.DataFrame, log: ResultLog):
    """
    Check that county totals from NYT, CSBS, CDS datasets are
    about equal to the reported state totals. Metrics compared are:
        - positive cases
        - patient deaths
    """
    if row.positive > 100:
        pos_error =  abs(counties["cases"] - row.positive).min() / row.positive
        if pos_error > COUNTY_ERROR_THRESHOLDS["positive"]:
            closest_pos = int(round(pos_error * row.positive + row.positive))
            log.error(row.state, f"county aggregate for positive tests does not match state totals (state: {row.positive}, county: {closest_pos})")

    if row.death > 20:
        death_error = abs(counties["deaths"] - row.death).min() / row.death
        if death_error > COUNTY_ERROR_THRESHOLDS["death"]:
            closest_death = int(round(death_error * row.death + row.death))
            log.error(row.state, f"county aggregate for patient deaths does not match state totals (state: {row.death}, county: {closest_death})")
Exemple #12
0
def expected_positive_increase( current: pd.DataFrame, history: pd.DataFrame,
                                log: ResultLog, context: str, config: QCConfig=None):
    """
    Fit state-level daily positives data to an exponential and a linear curve.
    Get expected vs actual case increase to determine if current positives
    are within the expected ranges.

    The exponential is used as the upper bound. The linear is used as the lower bound.

    TODO: Eventually these curves will NOT be exp (perhaps logistic?)
          Useful to know which curves have been "leveled" but from a
          data quality perspective, this check would become annoying
    """

    if not config: config = QCConfig()

    forecast_date = current.lastUpdateEt.to_pydatetime().strftime('%Y%m%d')
    history = history.loc[history["date"].astype(str) != forecast_date]

    forecast = Forecast()
    forecast.fit(history)
    forecast.project(current)

    if config.save_results:
        save_forecast_hd5(forecast, config.results_dir)
    elif config.plot_models:
        plot_to_file(forecast, f"{config.images_dir}/{context}", FIT_THRESHOLDS)

    state = forecast.state
    date = forecast.date
    actual_value, expected_linear, expected_exp = forecast.results

    min_value = int(FIT_THRESHOLDS[0] * expected_linear)
    max_value = int(FIT_THRESHOLDS[1] * expected_exp)

    if not (min_value <= actual_value <=  max_value):
        direction = "increase"
        if actual_value < expected_linear:
            direction = "drop"

        log.error(state, f"unexpected {direction} in positive cases ({actual_value:,}) for {date}, expected between {min_value:,} and {max_value:,}")
def check_history(ds: DataSource) -> ResultLog:
    """
    Check the history
    """

    logger.info("check history")

    df = ds.history

    log = ResultLog()

    for state in df["state"].drop_duplicates().values:
        state_df = df.loc[df["state"] == state]
        checks.monotonically_increasing(state_df, log)

    return log
def check_current(ds: DataSource, config: QCConfig) -> ResultLog:
    """
    Check the current published results
    """

    logger.info("check current")

    log = ResultLog()

    df = ds.current

    publish_date = 20200330
    logger.error(f" [date is hard-coded to {publish_date}]")

    # setup run settings equivalent to publish date at 5PM
    s = str(publish_date)
    y, m, d = int(s[0:4]), int(s[4:6]), int(s[6:8])
    publish_timestamp = udatetime.naivedatetime_as_eastern(
        datetime(y, m, d, 12 + 5))

    ds._target_date = publish_timestamp

    df["targetDate"] = publish_date
    df["targetDateEt"] = publish_timestamp
    df["lastCheckEt"] = df["targetDateEt"]
    df["phase"] = "publish"

    for row in df.itertuples():
        checks.total(row, log)
        checks.last_update(row, log)
        checks.positives_rate(row, log)
        checks.death_rate(row, log)
        checks.pendings_rate(row, log)

        df_history = ds.history[ds.history.state == row.state]
        checks.increasing_values(row, df_history, log)
        checks.expected_positive_increase(row, df_history, log, "current",
                                          config)

        df_county_rollup = ds.county_rollup[ds.county_rollup.state ==
                                            row.state]
        if not df_county_rollup.empty:
            checks.counties_rollup_to_state(row, df_county_rollup, log)

    return log
Exemple #15
0
def less_recovered_than_positive(row, log: ResultLog):
    """Check that we don't have more recovered than positive"""

    if row.recovered > row.positive:
        log.error(row.state, f"More recovered than positive (recovered={row.recovered:,}, positive={row.positive:,})")
def check_working(ds: DataSource, config: QCConfig) -> ResultLog:
    """
    Check unpublished results in the working google sheet
    https://docs.google.com/spreadsheets/d/1MvvbHfnjF67GnYUDJJiNYUmGco5KQ9PW0ZRnEP9ndlU/edit#gid=1777138528
    """

    logger.info("check working")
    log = ResultLog()

    # targetDate is the date that the dev sheet is currently working on.
    # phase is what part of their process they are in.
    # targetDateEt is the time that should be used on any 'staleness' checks

    d, phase = checks.current_time_and_phase()

    ds._target_date = d

    df = ds.working
    df["targetDate"] = d.year * 10000 + d.month * 100 + d.day
    df["targetDateEt"] = d
    df["phase"] = phase

    logger.info(f"Running with target date = {d} and phase = {phase}")

    # *** WHEN YOU CHANGE A CHECK THAT IMPACTS WORKING, MAKE SURE TO UPDATE THE EXCEL TRACKING DOCUMENT ***

    for row in df.itertuples():
        try:

            checks.total(row, log)
            #checks.total_tests(row, log)
            checks.last_update(row, log)
            checks.last_checked(row, log)
            checks.checkers_initials(row, log)
            checks.positives_rate(row, log)
            checks.death_rate(row, log)
            checks.less_recovered_than_positive(row, log)
            checks.pendings_rate(row, log)

            df_history = ds.history[ds.history.state == row.state]
            checks.increasing_values(row, df_history, log)
            checks.expected_positive_increase(row, df_history, log, "working",
                                              config)

            df_county_rollup = ds.county_rollup[ds.county_rollup.state ==
                                                row.state]
            if not df_county_rollup.empty:
                checks.counties_rollup_to_state(row, df_county_rollup, log)

        except Exception as ex:
            logger.exception(ex)
            log.error(row.state, f"{ex}")

    # run loop at end, insted of during run
    if config.plot_models and config.save_results:
        for row in df.itertuples():
            try:
                forecast = load_forecast_hd5(config.results_dir, row.state,
                                             row.targetDate)
                if forecast is None:
                    logger.warning(
                        f"Fould not load forecast for {row.state}/{row.targetDate}"
                    )
                else:
                    plot_to_file(forecast, f"{config.images_dir}/working",
                                 checks.FIT_THRESHOLDS)
            except Exception as ex:
                logger.exception(ex)
                log.error(row.state, f"{ex}")

    return log