Beispiel #1
0
 def test_bad_inputs(self):
     """Each invalid constructor argument below must raise ValueError."""
     invalid_kwargs = (
         {"smoother_name": "hamburger"},
         {"impute_method": "hamburger"},
         {"boundary_method": "hamburger"},
         {"window_length": 1},
     )
     for kwargs in invalid_kwargs:
         with pytest.raises(ValueError):
             Smoother(**kwargs)
Beispiel #2
0
 def test_causal_savgol_coeffs(self):
     """A savgol smoother with a degree-0 fit should expose uniform weights."""
     # Window covering offsets -10..0, i.e. 11 taps
     left_edge, right_edge = -10, 0
     n_taps = right_edge - left_edge + 1
     uniform_weights = np.full(n_taps, 1 / n_taps)

     savgol = Smoother(
         smoother_name="savgol",
         window_length=n_taps,
         poly_fit_degree=0,
         gaussian_bandwidth=None,
     )
     assert np.allclose(savgol.coeffs, uniform_weights)
Beispiel #3
0
    def test_left_gauss_linear_smoother(self):
        """Check output length and near-identity cases of "left_gauss_linear"."""
        # Constant input: the output must be as long as the input ...
        flat = np.ones(30)
        flat_out = Smoother(smoother_name="left_gauss_linear").smooth(flat)
        assert len(flat) == len(flat_out)
        # ... and equal to it everywhere past the first entry
        assert np.allclose(flat[1:], flat_out[1:])

        # With a very narrow Gaussian bandwidth the present value dominates,
        # so a noisy ramp should come back essentially unchanged
        noisy_ramp = np.arange(1, 30) + np.random.normal(0, 1, 29)
        narrow = Smoother(smoother_name="left_gauss_linear",
                          gaussian_bandwidth=0.1)
        narrow_out = narrow.smooth(noisy_ramp)
        assert np.allclose(narrow_out[1:], noisy_ramp[1:])
Beispiel #4
0
    def test_moving_average_smoother(self):
        """Exercise the "moving_average" smoother.

        Covers rejection of a non-integer window length, preservation of the
        signal length, and exact reproduction of constant data once a full
        window is available.
        """
        # Test non-integer window-length. The smoother name must be a valid
        # one: an unrecognised name is itself rejected with ValueError, which
        # would let this check pass without the window length ever being
        # validated. Setup stays outside the `raises` block so only the
        # Smoother calls can satisfy it.
        signal = np.array([1, 1, 1])
        with pytest.raises(ValueError):
            Smoother(smoother_name="moving_average",
                     window_length=5.5).smooth(signal)

        # The raw and smoothed lengths should match
        signal = np.ones(30)
        smoother = Smoother(smoother_name="moving_average")
        smoothed_signal = smoother.smooth(signal)
        assert len(signal) == len(smoothed_signal)

        # The raw and smoothed arrays should be identical on constant data
        # modulo the nans (entries before a full window is available)
        signal = np.ones(30)
        window_length = 10
        smoother = Smoother(smoother_name="moving_average",
                            window_length=window_length)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal[window_length - 1:],
                           smoothed_signal[window_length - 1:])
Beispiel #5
0
]
# Maps each sensor key to a 2-tuple of (metric name string, boolean flag).
# NOTE(review): the meaning of the boolean flag is not visible in this
# fragment -- confirm against the code that consumes this map.
SENSOR_NAME_MAP = {
    "new_counts": ("incidence_num", False),
    "cumulative_counts": ("cumulative_num", False),
    "incidence": ("incidence_prop", False),
    "cumulative_prop": ("cumulative_prop", False),
}
# Temporarily added for wip_ signals
# WIP_SENSOR_NAME_MAP = {
#     "new_counts":           ("incid_num", False),
#     "cumulative_counts":    ("cumul_num", False),
#     "incidence":            ("incid_prop", False),
#     "cumulative_prop":      ("cumul_prop", False),
# }
# Maps each smoother key to a 2-tuple of (bound `smooth` method of a Smoother
# instance, string prefix). The Smoother instances are created once, at import.
# NOTE(review): the prefix is presumably prepended to the exported signal
# name -- confirm in run_module.
SMOOTHERS_MAP = {
    "unsmoothed": (Smoother("identity").smooth, ""),
    "seven_day_average": (Smoother("moving_average",
                                   window_length=7).smooth, "7dav_"),
}
# Names of the geographic resolutions.
# NOTE(review): presumably the levels run_module produces output for -- confirm.
GEO_RESOLUTIONS = ["county", "state", "msa", "hrr", "hhs", "nation"]


def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
Beispiel #6
0
    def test_causal_savgol_smoother(self):
        """Exercise the savgol smoother on polynomials, nans and short signals."""
        window_length = 10
        # Entries from index window_length - 1 onward are the ones compared
        tail = slice(window_length - 1, None)

        # A degree-M fit should reproduce polynomial data of matching degree
        # on the tail: constant (M=0), linear (M=1), quadratic (M=2)
        polynomial_cases = (
            (0, np.ones(30)),
            (1, np.arange(30)),
            (2, np.arange(30)**2),
        )
        for degree, raw in polynomial_cases:
            fitted = Smoother(smoother_name="savgol",
                              window_length=window_length,
                              poly_fit_degree=degree).smooth(raw)
            if degree == 0:
                # The raw and smoothed lengths should match
                assert len(raw) == len(fitted)
            assert np.allclose(raw[tail], fitted[tail])

        # savgol with a degree-1 fit and a window as long as the signal is
        # expected to agree with the "left_gauss_linear" smoother
        window_length = 50
        noisy_line = np.arange(window_length) + np.random.randn(window_length)
        gauss_linear_out = Smoother(
            smoother_name="left_gauss_linear").smooth(noisy_line)
        savgol_out = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=1,
        ).smooth(noisy_line)
        assert np.allclose(gauss_linear_out, savgol_out)

        # An all-nan signal should come back all-nan
        all_nan = np.full(10, np.nan)
        all_nan_out = Smoother(window_length=9).smooth(all_nan)
        assert np.all(np.isnan(all_nan_out))

        # A length-1 signal should come back unchanged
        single = np.ones(1)
        single_out = Smoother().smooth(single)
        assert np.allclose(single_out, single)

        # A signal shorter than the polynomial fit degree should come back
        # unchanged
        pair = np.ones(2)
        pair_out = Smoother(poly_fit_degree=3).smooth(pair)
        assert np.allclose(pair_out, pair)

        # Edge fitting case: nans on both sides of a single value
        sandwiched = np.array([np.nan, 1, np.nan])
        sandwiched_out = Smoother(poly_fit_degree=1,
                                  window_length=2).smooth(sandwiched)
        expected = np.array([np.nan, 1, 1])
        assert np.allclose(sandwiched_out, expected, equal_nan=True)

        # Three leading nans followed by 0..9 ones: the output should equal
        # the input, nans included
        for n_valid in range(10):
            padded = np.concatenate([np.full(3, np.nan), np.ones(n_valid)])
            padded_out = Smoother(poly_fit_degree=0,
                                  window_length=5).smooth(padded)
            assert np.allclose(padded_out, padded, equal_nan=True)

        # window_length > len(signal) with boundary_method="identity"
        ramp = np.arange(20)
        ramp_out = Smoother(boundary_method="identity",
                            window_length=30).smooth(ramp)
        assert np.allclose(ramp, ramp_out)
Beispiel #7
0
    def test_pandas_series_input(self):
        """Check that pandas Series can be smoothed directly and via transform."""

        def call_directly(smoother, series):
            # Hand the Series straight to Smoother.smooth
            return smoother.smooth(series)

        def call_via_transform(smoother, series):
            # Let pandas apply Smoother.smooth through Series.transform
            return series.transform(smoother.smooth)

        # savgol with a degree-1 fit and a window as long as the series is
        # expected to agree with "left_gauss_linear", for both ways of
        # invoking the smoother
        window_length = 50
        for run in (call_directly, call_via_transform):
            noisy_line = pd.Series(
                np.arange(window_length) + np.random.randn(window_length))
            gauss_linear_out = run(
                Smoother(smoother_name="left_gauss_linear"), noisy_line)
            savgol_out = run(
                Smoother(
                    smoother_name="savgol",
                    window_length=window_length,
                    poly_fit_degree=1,
                ),
                noisy_line,
            )
            assert np.allclose(gauss_linear_out, savgol_out)

        # The raw and smoothed lengths should match
        flat = pd.Series(np.ones(30))
        averaged = flat.transform(
            Smoother(smoother_name="moving_average").smooth)
        assert len(flat) == len(averaged)

        # Constant data should be reproduced from index window_length - 1 on
        window_length = 10
        flat = pd.Series(np.ones(30))
        averaged = flat.transform(
            Smoother(smoother_name="moving_average",
                     window_length=window_length).smooth)
        assert np.allclose(flat[window_length - 1:],
                           averaged[window_length - 1:])

        # The index of the series should be preserved
        shifted = pd.Series(np.ones(30), index=np.arange(50, 80))
        averaged = shifted.transform(
            Smoother(smoother_name="moving_average", window_length=10).smooth)
        assert shifted.index.equals(averaged.index)
Beispiel #8
0
 def test_identity_smoother(self):
     """The "identity" smoother should return values equal to its input."""
     raw = np.arange(30) + np.random.rand(30)
     passthrough = Smoother(smoother_name="identity").smooth(raw)
     assert np.allclose(raw, passthrough)
Beispiel #9
0
    def test_impute(self):
        """Exercise Smoother.impute across methods, windows and boundaries."""

        def periodic_nans():
            # 1..39 with every multiple of 3 replaced by nan; rebuilt on each
            # call so every check starts from a fresh array
            return np.array([i if i % 3 else np.nan for i in range(1, 40)])

        # A signal starting with nan is rejected
        with pytest.raises(ValueError):
            Smoother().impute(signal=np.array([np.nan, 1, 1]))

        # impute_method=None: the signal comes back as is, nans included
        gappy = periodic_nans()
        assert np.allclose(Smoother(impute_method=None).impute(gappy),
                           gappy,
                           equal_nan=True)

        # impute_method="zeros": nans become 0.0
        zero_filled = np.array([i if i % 3 else 0.0 for i in range(1, 40)])
        assert np.allclose(
            Smoother(impute_method="zeros").impute(periodic_nans()),
            zero_filled)

        # Default imputer: observed values are untouched and the filled-in
        # values stay within 0.5 of the underlying line 1..39
        gappy = periodic_nans()
        observed = ~np.isnan(gappy)
        filled = Smoother().impute(gappy)
        assert np.allclose(gappy[observed], filled[observed])
        assert np.allclose(range(1, 40), filled, atol=0.5)

        # A single gap after a linear run should be filled with the next value
        # of the progression when the fit degree is >= 1
        window_length = 10
        ramp_with_gap = np.hstack([np.arange(10), [np.nan], np.arange(10)])
        ramp_filled = np.hstack([np.arange(11), np.arange(10)])
        for degree in (1, 2):
            imputed = Smoother(window_length=window_length,
                               poly_fit_degree=degree).impute(ramp_with_gap)
            assert np.allclose(imputed, ramp_filled)

        # A nan closer to the start than window_length should still be
        # imputed (the window has to shrink there)
        two_gaps = np.hstack(
            [np.arange(5), [np.nan],
             np.arange(20), [np.nan],
             np.arange(5)])
        imputed = Smoother(window_length=window_length,
                           poly_fit_degree=2).impute(two_gaps)
        assert np.allclose(
            imputed,
            np.hstack([np.arange(6), np.arange(21),
                       np.arange(5)]),
        )

        # A leading nan is an error: the caller must strip it first
        leading_nan = np.hstack(
            [[np.nan], np.arange(20), [np.nan], np.arange(5)])
        strict = Smoother(window_length=window_length, poly_fit_degree=2)
        with pytest.raises(ValueError):
            strict.impute(leading_nan)

        # boundary_method="identity" on a nan-free signal changes nothing
        ramp = np.arange(20)
        untouched = Smoother(poly_fit_degree=0,
                             boundary_method="identity",
                             window_length=10).impute(ramp)
        assert np.allclose(untouched, ramp)

        # A long run of nans less than window_length from the start must not
        # trigger a matrix inversion error
        long_gap = np.hstack([[1], np.nan * np.ones(12), np.arange(5)])
        long_gap_filled = Smoother(smoother_name="savgol",
                                   poly_fit_degree=2,
                                   boundary_method="identity",
                                   window_length=10).impute(long_gap)
        assert np.allclose(long_gap_filled,
                           np.hstack([[1], np.ones(12),
                                      np.arange(5)]))

        # The impute_order argument
        short_gap = np.hstack([[1, np.nan, np.nan, 2], np.arange(5)])
        short_gap_filled = Smoother().impute(short_gap, impute_order=1)
        assert np.allclose(short_gap_filled,
                           np.hstack([[1, 1, 1, 2], np.arange(5)]))
    "new_counts": ("incidence_num", False),
    "cumulative_counts": ("cumulative_num", False),
    "incidence": ("incidence_prop", False),
    "cumulative_prop": ("cumulative_prop", False),
}
# Temporarily added for wip_ signals
# WIP_SENSOR_NAME_MAP = {
#     "new_counts":           ("incid_num", False),
#     "cumulative_counts":    ("cumul_num", False),
#     "incidence":            ("incid_prop", False),
#     "cumulative_prop":      ("cumul_prop", False),
# }

# Maps each smoother key to a 4-tuple of
# (Smoother instance, string prefix, boolean flag, function of one argument).
# The function subtracts seven days from its argument for "unsmoothed" and
# returns its argument unchanged for "seven_day_average".
# NOTE(review): the meaning of the boolean flag and what the function's
# argument represents (presumably a date) are not visible in this fragment --
# confirm in run_module.
SMOOTHERS_MAP = {
    "unsmoothed":
    (Smoother("identity"), "", False, lambda d: d - timedelta(days=7)),
    "seven_day_average":
    (Smoother("moving_average", window_length=7), "7dav_", True, lambda d: d),
}
# Names of the geographic resolutions.
# NOTE(review): presumably the levels run_module produces output for -- confirm.
GEO_RESOLUTIONS = [
    "county",
    "state",
    "msa",
    "hrr",
]


def run_module():
    """Run the usafacts indicator."""
    params = read_params()
    export_start_date = params["export_start_date"]
Beispiel #11
0
class CHCSensor:
    """Sensor class to fit a signal using Covid counts from Change HC outpatient data."""

    # Smoother shared by all calls to `gauss_smooth`: savgol with a degree-1
    # polynomial fit and the Gaussian bandwidth taken from Config.
    smoother = Smoother("savgol",
                        poly_fit_degree=1,
                        gaussian_bandwidth=Config.SMOOTHER_BANDWIDTH)

    @staticmethod
    def gauss_smooth(count, total):
        """Smooth counts and totals with the class-level smoother, then clip them.

        Args:
            count: array of counts (numerator)
            total: array of totals (denominator)

        Returns:
            tuple (count_clip, total_clip): the smoothed totals clipped below
            at 0, and the smoothed counts clipped elementwise to
            [0, total_clip] so a count never exceeds its total.
        """
        count_smooth = CHCSensor.smoother.smooth(count)
        total_smooth = CHCSensor.smoother.smooth(total)
        total_clip = np.clip(total_smooth, 0, None)
        count_clip = np.clip(count_smooth, 0, total_clip)
        return count_clip, total_clip

    @staticmethod
    def backfill(num,
                 den,
                 k=Config.MAX_BACKFILL_WINDOW,
                 min_visits_to_fill=Config.MIN_CUM_VISITS):
        """
        Adjust for retroactively added observations (backfill) by using a variable length smoother.

        The smoother starts from the RHS and moves leftwards (backwards through time).
        We cumulatively sum the total visits (denominator), until we have observed some minimum number of
        counts, then calculate the sum over that bin. We restrict the
        bin size so to avoid including long-past values.

        Args:
            num: array of covid counts
            den: array of total visits
            k: maximum number of days used to average a backfill correction
            min_visits_to_fill: minimum number of total visits needed in order to sum a bin

        Returns:
            tuple (new_num, new_den) of float arrays in the original time
            order: the adjusted covid counts as a single column (shape
            (n, 1)) and the adjusted visit counts (same shape as `den`).
        """
        if isinstance(den, (pd.DataFrame, pd.Series)):
            den = den.values
        if isinstance(num, (pd.DataFrame, pd.Series)):
            num = num.values
        # Work on time-reversed views so index 0 is the most recent day
        revden = den[::-1]
        revnum = num[::-1].reshape(-1, 1)
        new_num = np.full_like(revnum, np.nan, dtype=float)
        new_den = np.full_like(revden, np.nan, dtype=float)

        for i in range(revnum.shape[0]):
            visit_cumsum = revden[i:].cumsum()

            # calculate backfill window: the first offset at which enough
            # visits have accumulated, capped at k (k when never reached)
            enough_visits = np.where(visit_cumsum >= min_visits_to_fill)[0]
            if len(enough_visits) > 0:
                closest_fill_day = min(k, enough_visits[0])
            else:
                closest_fill_day = k

            # Sum day i and the `closest_fill_day` older days. A window of 0
            # reduces to the single day itself, so no special case is needed.
            bin_end = i + closest_fill_day + 1
            new_den[i] = revden[i:bin_end].sum()
            new_num[i] = revnum[i:bin_end].sum(axis=0)

        # Restore the original (forward) time order
        new_num = new_num[::-1]
        new_den = new_den[::-1]

        return new_num, new_den

    @staticmethod
    def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
        """Fitting routine.

        Args:
            y_data: dataframe for one geo_id, indexed by date
            first_sensor_date: datetime of first date
            geo_id: unique identifier for the location column
            num_col: str name of numerator column
            den_col: str name of denominator column

        Returns:
            dictionary of results with keys "geo_id", "rate" and "se" (both
            multiplied by 100) and "incl" (boolean series, True where the
            smoothed denominator is at least Config.MIN_DEN)

        Raises:
            AssertionError: if any rate after the first entry is NaN, zero or
                negative

        """
        # backfill
        total_counts, total_visits = CHCSensor.backfill(
            y_data[num_col].values, y_data[den_col].values)

        # calculate smoothed counts and jeffreys rate
        # the smoother is not guaranteed to return values greater than 0;
        # gauss_smooth clips counts to [0, visits] and visits to >= 0
        smoothed_total_counts, smoothed_total_visits = CHCSensor.gauss_smooth(
            total_counts.flatten(), total_visits)

        smoothed_total_rates = ((smoothed_total_counts + 0.5) /
                                (smoothed_total_visits + 1))

        # checks - due to the smoother, the first value will be NA
        assert (np.sum(np.isnan(
            smoothed_total_rates[1:])) == 0), "NAs in rate calculation"
        assert (np.sum(smoothed_total_rates[1:] <= 0) == 0
                ), f"0 or negative value, {geo_id}"

        # cut off at sensor indexes; copy so that adding the 'se' column below
        # writes to an independent frame rather than a slice of another one
        rate_data = pd.DataFrame(
            {
                'rate': smoothed_total_rates,
                'den': smoothed_total_visits
            },
            index=y_data.index)
        rate_data = rate_data[first_sensor_date:].copy()
        include = rate_data['den'] >= Config.MIN_DEN
        valid_rates = rate_data[include]
        se_valid = valid_rates.eval('sqrt(rate * (1 - rate) / den)')
        # Index alignment leaves 'se' as NaN on the rows that were not included
        rate_data['se'] = se_valid

        # Log the most recent values. Use positional access (.iloc) because the
        # index holds dates, and skip logging when nothing is left after the
        # cut-off instead of failing on an empty frame.
        if not rate_data.empty:
            logging.debug("%s: %.3f,[%.3f]", geo_id,
                          rate_data['rate'].iloc[-1],
                          rate_data['se'].iloc[-1])
        return {
            "geo_id": geo_id,
            "rate": 100 * rate_data['rate'],
            "se": 100 * rate_data['se'],
            "incl": include
        }