    def test_bad_inputs(self):
        with pytest.raises(ValueError):
            Smoother(smoother_name="hamburger")
        with pytest.raises(ValueError):
            Smoother(impute_method="hamburger")
        with pytest.raises(ValueError):
            Smoother(boundary_method="hamburger")
        with pytest.raises(ValueError):
            Smoother(window_length=1)
    def test_causal_savgol_coeffs(self):
        # The coefficients should return standard average weights for M=0
        nl, nr = -10, 0
        window_length = nr - nl + 1
        smoother = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=0,
            gaussian_bandwidth=None,
        )
        assert np.allclose(smoother.coeffs, np.ones(window_length) / window_length)
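    # Hedged illustrative check (added; not in the original suite): with
    # gaussian_bandwidth=None and poly_fit_degree=0, the savgol coefficients
    # should match the last row of the least-squares hat matrix of a constant
    # fit, which is exactly the uniform weights asserted above.
    def test_causal_savgol_coeffs_match_hat_matrix(self):
        window_length = 11
        smoother = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=0,
            gaussian_bandwidth=None,
        )
        X = np.ones((window_length, 1))  # design matrix of a degree-0 fit
        hat_last_row = (X @ np.linalg.pinv(X))[-1]
        assert np.allclose(smoother.coeffs, hat_last_row)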
    def test_left_gauss_linear_smoother(self):
        # The raw and smoothed lengths should match
        signal = np.ones(30)
        smoother = Smoother(smoother_name="left_gauss_linear")
        smoothed_signal = smoother.smooth(signal)
        assert len(signal) == len(smoothed_signal)
        # The raw and smoothed arrays should be identical on constant data,
        # modulo the nans
        assert np.allclose(signal[1:], smoothed_signal[1:])

        # The smoother should basically be the identity when the Gaussian
        # kernel is set to weigh the present value overwhelmingly
        signal = np.arange(1, 30) + np.random.normal(0, 1, 29)
        smoother = Smoother(smoother_name="left_gauss_linear", gaussian_bandwidth=0.1)
        assert np.allclose(smoother.smooth(signal)[1:], signal[1:])
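    # Hedged property check (added for illustration): a weighted linear
    # regression reproduces exactly linear data regardless of the kernel
    # weights, so left_gauss_linear should act as the identity on a noiseless
    # line (skipping the first entry, which has too few points to fit).
    def test_left_gauss_linear_on_line(self):
        signal = np.arange(30, dtype=float)
        smoother = Smoother(smoother_name="left_gauss_linear")
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal[1:], smoothed_signal[1:])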
    def test_moving_average_smoother(self):
        # A non-integer window length should raise a ValueError (the smoother
        # name must be the valid "moving_average", so the error comes from the
        # window-length validation rather than from a bad name)
        with pytest.raises(ValueError):
            signal = np.array([1, 1, 1])
            Smoother(smoother_name="moving_average", window_length=5.5).smooth(signal)

        # The raw and smoothed lengths should match
        signal = np.ones(30)
        smoother = Smoother(smoother_name="moving_average")
        smoothed_signal = smoother.smooth(signal)
        assert len(signal) == len(smoothed_signal)

        # The raw and smoothed arrays should be identical on constant data,
        # modulo the nans
        signal = np.ones(30)
        window_length = 10
        smoother = Smoother(smoother_name="moving_average", window_length=window_length)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal[window_length - 1:], smoothed_signal[window_length - 1:])
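    # Hedged sketch (added; not in the original suite): a length-k trailing
    # moving average is convolution with a uniform kernel of weight 1/k, so
    # the full-window tail of the smoothed signal should match np.convolve.
    def test_moving_average_matches_convolution(self):
        window_length = 7
        signal = np.arange(30, dtype=float)
        smoother = Smoother(smoother_name="moving_average", window_length=window_length)
        smoothed_signal = smoother.smooth(signal)
        kernel = np.ones(window_length) / window_length
        expected_tail = np.convolve(signal, kernel, mode="valid")
        assert np.allclose(smoothed_signal[window_length - 1:], expected_tail)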
]

SENSOR_NAME_MAP = {
    "new_counts": ("incidence_num", False),
    "cumulative_counts": ("cumulative_num", False),
    "incidence": ("incidence_prop", False),
    "cumulative_prop": ("cumulative_prop", False),
}

# Temporarily added for wip_ signals
# WIP_SENSOR_NAME_MAP = {
#     "new_counts": ("incid_num", False),
#     "cumulative_counts": ("cumul_num", False),
#     "incidence": ("incid_prop", False),
#     "cumulative_prop": ("cumul_prop", False),
# }

SMOOTHERS_MAP = {
    "unsmoothed": (Smoother("identity").smooth, ""),
    "seven_day_average": (Smoother("moving_average", window_length=7).smooth, "7dav_"),
}

GEO_RESOLUTIONS = ["county", "state", "msa", "hrr", "hhs", "nation"]


def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
    def test_causal_savgol_smoother(self):
        # The raw and smoothed lengths should match
        signal = np.ones(30)
        window_length = 10
        smoother = Smoother(smoother_name="savgol", window_length=window_length,
                            poly_fit_degree=0)
        smoothed_signal = smoother.smooth(signal)
        assert len(signal) == len(smoothed_signal)
        # The raw and smoothed arrays should be identical on constant data
        # modulo the nans, when M >= 0
        assert np.allclose(signal[window_length - 1:], smoothed_signal[window_length - 1:])

        # The raw and smoothed arrays should be identical on linear data
        # modulo the nans, when M >= 1
        signal = np.arange(30)
        smoother = Smoother(smoother_name="savgol", window_length=window_length,
                            poly_fit_degree=1)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal[window_length - 1:], smoothed_signal[window_length - 1:])

        # The raw and smoothed arrays should be identical on quadratic data
        # modulo the nans, when M >= 2
        signal = np.arange(30) ** 2
        smoother = Smoother(smoother_name="savgol", window_length=window_length,
                            poly_fit_degree=2)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal[window_length - 1:], smoothed_signal[window_length - 1:])

        # The savgol method should match the linear regression method on the first
        # window_length-many values of the signal, if the savgol_weighting is set to
        # true and the polynomial fit degree is set to 1. Beyond that, there will be
        # very small differences between the signals (due to "left_gauss_linear" not
        # having a window_length cutoff).
        window_length = 50
        signal = np.arange(window_length) + np.random.randn(window_length)
        smoother = Smoother(smoother_name="left_gauss_linear")
        smoothed_signal1 = smoother.smooth(signal)
        smoother = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=1,
        )
        smoothed_signal2 = smoother.smooth(signal)
        assert np.allclose(smoothed_signal1, smoothed_signal2)

        # Test the all-nans case
        signal = np.nan * np.ones(10)
        smoother = Smoother(window_length=9)
        smoothed_signal = smoother.smooth(signal)
        assert np.all(np.isnan(smoothed_signal))

        # Test the case where the signal is length 1
        signal = np.ones(1)
        smoother = Smoother()
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(smoothed_signal, signal)

        # Test the case where the signal length is less than poly_fit_degree
        signal = np.ones(2)
        smoother = Smoother(poly_fit_degree=3)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(smoothed_signal, signal)

        # Test an edge fitting case
        signal = np.array([np.nan, 1, np.nan])
        smoother = Smoother(poly_fit_degree=1, window_length=2)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(smoothed_signal, np.array([np.nan, 1, 1]), equal_nan=True)

        # Test a range of cases where the signal following a sequence of nans
        # is returned unchanged
        for i in range(10):
            signal = np.hstack([[np.nan, np.nan, np.nan], np.ones(i)])
            smoother = Smoother(poly_fit_degree=0, window_length=5)
            smoothed_signal = smoother.smooth(signal)
            assert np.allclose(smoothed_signal, signal, equal_nan=True)

        # Test window_length > len(signal) with boundary_method="identity"
        signal = np.arange(20)
        smoother = Smoother(boundary_method="identity", window_length=30)
        smoothed_signal = smoother.smooth(signal)
        assert np.allclose(signal, smoothed_signal)
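    # Hedged sketch (added for illustration) of the mechanism behind the
    # degree-reproduction checks above: causal Savitzky-Golay fits a
    # least-squares polynomial over the trailing window and evaluates it at
    # the window's right endpoint, so a polynomial input of degree
    # <= poly_fit_degree is returned unchanged. np.polyfit stands in for the
    # smoother's internals here.
    def test_savgol_matches_trailing_polyfit(self):
        window_length, degree = 10, 2
        signal = np.arange(30, dtype=float) ** 2
        smoother = Smoother(smoother_name="savgol", window_length=window_length,
                            poly_fit_degree=degree)
        smoothed_signal = smoother.smooth(signal)
        x = np.arange(window_length, dtype=float)
        window = signal[-window_length:]
        fitted_last = np.polyval(np.polyfit(x, window, degree), x[-1])
        assert np.allclose(smoothed_signal[-1], fitted_last)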
    def test_pandas_series_input(self):
        # The savgol method should match the linear regression method on the first
        # window_length-many values of the signal, if the savgol_weighting is set to
        # true and the polynomial fit degree is set to 1. Beyond that, there will be
        # very small differences between the signals (due to "left_gauss_linear" not
        # having a window_length cutoff).
        window_length = 50
        signal = pd.Series(np.arange(window_length) + np.random.randn(window_length))
        smoother = Smoother(smoother_name="left_gauss_linear")
        smoothed_signal1 = smoother.smooth(signal)
        smoother = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=1,
        )
        smoothed_signal2 = smoother.smooth(signal)
        assert np.allclose(smoothed_signal1, smoothed_signal2)

        # The same check should pass when smoothing via Series.transform
        window_length = 50
        signal = pd.Series(np.arange(window_length) + np.random.randn(window_length))
        smoother = Smoother(smoother_name="left_gauss_linear")
        smoothed_signal1 = signal.transform(smoother.smooth)
        smoother = Smoother(
            smoother_name="savgol",
            window_length=window_length,
            poly_fit_degree=1,
        )
        smoothed_signal2 = signal.transform(smoother.smooth)
        assert np.allclose(smoothed_signal1, smoothed_signal2)

        # The raw and smoothed lengths should match
        signal = pd.Series(np.ones(30))
        smoother = Smoother(smoother_name="moving_average")
        smoothed_signal = signal.transform(smoother.smooth)
        assert len(signal) == len(smoothed_signal)

        # The raw and smoothed arrays should be identical on constant data,
        # modulo the nans
        signal = pd.Series(np.ones(30))
        window_length = 10
        smoother = Smoother(smoother_name="moving_average", window_length=window_length)
        smoothed_signal = signal.transform(smoother.smooth)
        assert np.allclose(signal[window_length - 1:], smoothed_signal[window_length - 1:])

        # Test that the index of the series is preserved
        signal = pd.Series(np.ones(30), index=np.arange(50, 80))
        smoother = Smoother(smoother_name="moving_average", window_length=10)
        smoothed_signal = signal.transform(smoother.smooth)
        assert signal.index.equals(smoothed_signal.index)
    def test_identity_smoother(self):
        signal = np.arange(30) + np.random.rand(30)
        assert np.allclose(signal, Smoother(smoother_name="identity").smooth(signal))
    def test_impute(self):
        # Test that a leading nan raises an error
        with pytest.raises(ValueError):
            Smoother().impute(signal=np.array([np.nan, 1, 1]))

        # Test the nan imputer
        signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
        assert np.allclose(Smoother(impute_method=None).impute(signal), signal,
                           equal_nan=True)

        # Test the zeros imputer
        signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
        assert np.allclose(
            Smoother(impute_method="zeros").impute(signal),
            np.array([i if i % 3 else 0.0 for i in range(1, 40)]))

        # Make a signal with periodic nans to test the imputer
        signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
        # Test that the non-nan values are unchanged
        not_nans_ixs = ~np.isnan(signal)
        smoothed_signal = Smoother().impute(signal)
        assert np.allclose(signal[not_nans_ixs], smoothed_signal[not_nans_ixs])
        # Test that the imputer is close to the true line
        assert np.allclose(range(1, 40), smoothed_signal, atol=0.5)

        # Should impute the next value in a linear progression with M >= 1
        signal = np.hstack([np.arange(10), [np.nan], np.arange(10)])
        window_length = 10
        smoother = Smoother(window_length=window_length, poly_fit_degree=1)
        imputed_signal = smoother.impute(signal)
        assert np.allclose(imputed_signal, np.hstack([np.arange(11), np.arange(10)]))
        smoother = Smoother(window_length=window_length, poly_fit_degree=2)
        imputed_signal = smoother.impute(signal)
        assert np.allclose(imputed_signal, np.hstack([np.arange(11), np.arange(10)]))

        # If there are nans on the boundary, should dynamically change window
        signal = np.hstack([np.arange(5), [np.nan], np.arange(20), [np.nan], np.arange(5)])
        smoother = Smoother(window_length=window_length, poly_fit_degree=2)
        imputed_signal = smoother.impute(signal)
        assert np.allclose(
            imputed_signal,
            np.hstack([np.arange(6), np.arange(21), np.arange(5)]),
        )

        # If the array begins with np.nan, we should tell the user to peel it
        # off before sending
        signal = np.hstack([[np.nan], np.arange(20), [np.nan], np.arange(5)])
        smoother = Smoother(window_length=window_length, poly_fit_degree=2)
        with pytest.raises(ValueError):
            imputed_signal = smoother.impute(signal)

        # Test the boundary methods
        signal = np.arange(20)
        smoother = Smoother(poly_fit_degree=0, boundary_method="identity",
                            window_length=10)
        smoothed_signal = smoother.impute(signal)
        assert np.allclose(smoothed_signal, signal)

        # Test that we don't hit a matrix inversion error when there are nans
        # less than window_length away from the boundary
        signal = np.hstack([[1], np.nan * np.ones(12), np.arange(5)])
        smoother = Smoother(smoother_name="savgol", poly_fit_degree=2,
                            boundary_method="identity", window_length=10)
        smoothed_signal = smoother.impute(signal)
        assert np.allclose(smoothed_signal, np.hstack([[1], np.ones(12), np.arange(5)]))

        # Test the impute_order argument
        signal = np.hstack([[1, np.nan, np.nan, 2], np.arange(5)])
        smoother = Smoother()
        smoothed_signal = smoother.impute(signal, impute_order=1)
        assert np.allclose(smoothed_signal, np.hstack([[1, 1, 1, 2], np.arange(5)]))
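    # Hedged sketch (added; not in the original suite) of the imputation
    # mechanism exercised above: a non-leading nan is filled by fitting a
    # polynomial of degree poly_fit_degree on the trailing window of observed
    # values and extrapolating one step; np.polyfit stands in for the
    # smoother's internal fit.
    def test_impute_matches_polyfit_extrapolation(self):
        signal = np.hstack([np.arange(10, dtype=float), [np.nan]])
        smoother = Smoother(window_length=10, poly_fit_degree=1)
        imputed_signal = smoother.impute(signal)
        x = np.arange(10, dtype=float)
        prediction = np.polyval(np.polyfit(x, signal[:10], 1), 10.0)
        assert np.allclose(imputed_signal[-1], prediction)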
"new_counts": ("incidence_num", False), "cumulative_counts": ("cumulative_num", False), "incidence": ("incidence_prop", False), "cumulative_prop": ("cumulative_prop", False), } # Temporarily added for wip_ signals # WIP_SENSOR_NAME_MAP = { # "new_counts": ("incid_num", False), # "cumulative_counts": ("cumul_num", False), # "incidence": ("incid_prop", False), # "cumulative_prop": ("cumul_prop", False), # } SMOOTHERS_MAP = { "unsmoothed": (Smoother("identity"), "", False, lambda d: d - timedelta(days=7)), "seven_day_average": (Smoother("moving_average", window_length=7), "7dav_", True, lambda d: d), } GEO_RESOLUTIONS = [ "county", "state", "msa", "hrr", ] def run_module(): """Run the usafacts indicator.""" params = read_params() export_start_date = params["export_start_date"]
class CHCSensor:
    """Sensor class to fit a signal using Covid counts from Change HC outpatient data."""

    smoother = Smoother("savgol",
                        poly_fit_degree=1,
                        gaussian_bandwidth=Config.SMOOTHER_BANDWIDTH)

    @staticmethod
    def gauss_smooth(count, total):
        """Smooth using the Gaussian-weighted linear (savgol) smoother above.

        Args:
            count, total: array
        """
        count_smooth = CHCSensor.smoother.smooth(count)
        total_smooth = CHCSensor.smoother.smooth(total)
        # Clip so that 0 <= count <= total holds elementwise after smoothing
        total_clip = np.clip(total_smooth, 0, None)
        count_clip = np.clip(count_smooth, 0, total_clip)
        return count_clip, total_clip

    @staticmethod
    def backfill(num,
                 den,
                 k=Config.MAX_BACKFILL_WINDOW,
                 min_visits_to_fill=Config.MIN_CUM_VISITS):
        """
        Adjust for retroactively added observations (backfill) using a variable-length smoother.

        The smoother starts from the RHS and moves leftwards (backwards
        through time). We cumulatively sum the total visits (denominator)
        until we have observed some minimum number of counts, then calculate
        the sum over that bin. We restrict the bin size so as to avoid
        including long-past values.

        Args:
            num: array of covid counts
            den: array of total visits
            k: maximum number of days used to average a backfill correction
            min_visits_to_fill: minimum number of total visits needed in order
                to sum a bin

        Returns:
            arrays of adjusted covid counts and adjusted visit counts
        """
        if isinstance(den, (pd.DataFrame, pd.Series)):
            den = den.values
        if isinstance(num, (pd.DataFrame, pd.Series)):
            num = num.values

        # Work backwards through time: reverse the arrays
        revden = den[::-1]
        revnum = num[::-1].reshape(-1, 1)
        new_num = np.full_like(revnum, np.nan, dtype=float)
        new_den = np.full_like(revden, np.nan, dtype=float)
        n, p = revnum.shape

        for i in range(n):
            visit_cumsum = revden[i:].cumsum()

            # Calculate the backfill window: the first day (going backwards)
            # at which the cumulative visits reach the minimum, capped at k
            closest_fill_day = np.where(visit_cumsum >= min_visits_to_fill)[0]
            if len(closest_fill_day) > 0:
                closest_fill_day = min(k, closest_fill_day[0])
            else:
                closest_fill_day = k

            if closest_fill_day == 0:
                new_den[i] = revden[i]
                for j in range(p):
                    new_num[i, j] = revnum[i, j]
            else:
                den_bin = revden[i:(i + closest_fill_day + 1)]
                new_den[i] = den_bin.sum()
                for j in range(p):
                    num_bin = revnum[i:(i + closest_fill_day + 1), j]
                    new_num[i, j] = num_bin.sum()

        new_num = new_num[::-1]
        new_den = new_den[::-1]
        return new_num, new_den

    @staticmethod
    def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
        """Fitting routine.
        Args:
            y_data: dataframe for one geo_id, indexed by date
            first_sensor_date: datetime of first date
            geo_id: unique identifier for the location column
            num_col: str name of numerator column
            den_col: str name of denominator column

        Returns:
            dictionary of results
        """
        # Backfill
        total_counts, total_visits = CHCSensor.backfill(
            y_data[num_col].values, y_data[den_col].values)

        # Calculate smoothed counts and the Jeffreys rate.
        # The smoother is not guaranteed to return values greater than 0,
        # so gauss_smooth clips at 0.
        smoothed_total_counts, smoothed_total_visits = CHCSensor.gauss_smooth(
            total_counts.flatten(), total_visits)

        # In smoothing, the numerator may have become larger than the
        # denominator; the simple fix is to clip the numerator elementwise to
        # the denominator, which gauss_smooth already does (note that this has
        # only been observed in synthetic data)
        # smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
        smoothed_total_rates = ((smoothed_total_counts + 0.5) /
                                (smoothed_total_visits + 1))

        # Checks - due to the smoother, the first value will be NA
        assert np.sum(np.isnan(smoothed_total_rates[1:])) == 0, \
            "NAs in rate calculation"
        assert np.sum(smoothed_total_rates[1:] <= 0) == 0, \
            f"0 or negative value, {geo_id}"

        # Cut off at sensor indexes
        rate_data = pd.DataFrame(
            {"rate": smoothed_total_rates, "den": smoothed_total_visits},
            index=y_data.index)
        rate_data = rate_data[first_sensor_date:]
        include = rate_data["den"] >= Config.MIN_DEN
        valid_rates = rate_data[include]
        se_valid = valid_rates.eval("sqrt(rate * (1 - rate) / den)")
        rate_data["se"] = se_valid

        logging.debug("%s: %.3f,[%.3f]",
                      geo_id, rate_data["rate"].iloc[-1], rate_data["se"].iloc[-1])
        return {
            "geo_id": geo_id,
            "rate": 100 * rate_data["rate"],
            "se": 100 * rate_data["se"],
            "incl": include,
        }
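
# A hedged, self-contained usage sketch (illustration only, not part of the
# indicator): backfill pools each day's counts with just enough earlier days
# to accumulate min_visits_to_fill visits, and the Jeffreys adjustment in fit
# then shrinks the pooled rate slightly toward 1/2. The values k=7 and
# min_visits_to_fill=15 are chosen here purely for illustration.
if __name__ == "__main__":
    example_num = np.array([1, 2, 3])
    example_den = np.array([10, 10, 10])
    pooled_num, pooled_den = CHCSensor.backfill(
        example_num, example_den, k=7, min_visits_to_fill=15)
    # Day 0 has no earlier days to pool with; days 1 and 2 each pool with one
    # earlier day to reach the 15-visit minimum.
    assert np.allclose(pooled_den, [10.0, 20.0, 20.0])
    assert np.allclose(pooled_num.flatten(), [1.0, 3.0, 5.0])
    # Jeffreys-adjusted rate, e.g. (3 + 0.5) / (20 + 1) ~= 0.167 vs raw 0.15
    jeffreys_rates = (pooled_num.flatten() + 0.5) / (pooled_den + 1)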