コード例 #1
0
def compute_initial_beta_scaling_parameters_by_draw(
        draw_id: int, total_deaths: pd.Series, beta_scaling: Dict,
        data_interface: ForecastDataInterface) -> pd.DataFrame:
    """Assemble the initial beta scaling parameters for a single draw.

    Parameters
    ----------
    draw_id
        Draw whose transition dates and beta regression are loaded. It also
        seeds the random choice of the residual averaging window and fills
        the ``draw`` column.
    total_deaths
        Series whose index labels the constant-valued columns; a copy of it
        is the first output column.
    beta_scaling
        Mapping read for the keys ``'window_size'``, ``'average_over_min'``
        and ``'average_over_max'``.
    data_interface
        Provides ``load_transition_date`` and ``load_beta_regression``.

    Returns
    -------
    pd.DataFrame
        Column-wise concatenation of, in order: the copied ``total_deaths``,
        ``window_size``, ``fit_final``, ``pred_start``, ``scale_init``,
        ``history_days_start``, ``history_days_end``,
        ``log_beta_residual_mean`` and ``draw``.
    """
    location_index = total_deaths.index

    def broadcast(value, column_name):
        # Constant-valued column laid out over the total_deaths index.
        return pd.Series(value, index=location_index, name=column_name)

    deaths_column = total_deaths.copy()
    window_column = broadcast(beta_scaling['window_size'], 'window_size')

    # The transition date differs by draw: it combines the number of
    # predicted days from the elastispliner in the ODE fit with the random
    # infection-to-death lag drawn by the infectionator, so it is looked up
    # rather than recomputed here.
    transition_date = data_interface.load_transition_date(draw_id)

    regression = data_interface.load_beta_regression(draw_id)
    regression = regression.set_index('location_id').sort_index()
    # Transition date repeated once per regression row, keyed by location.
    row_transition = transition_date.loc[regression.index]

    # Rows falling exactly on the transition day give the initial scaling.
    on_transition_day = regression.loc[regression['date'] == row_transition]
    fit_final = on_transition_day['beta'].rename('fit_final')
    pred_start = on_transition_day['beta_pred'].rename('pred_start')
    scale_init = (on_transition_day['beta']
                  / on_transition_day['beta_pred']).rename('scale_init')

    # Randomly pick the averaging window for the residual mean, seeded by
    # the draw. numpy's randint excludes the upper bound. The chosen bounds
    # are kept as columns because they may help with plotting/debugging.
    rng = np.random.RandomState(draw_id)
    window_start = rng.randint(1, beta_scaling['average_over_min'])
    window_end = rng.randint(window_start + 7,
                             beta_scaling['average_over_max'])
    start_column = broadcast(window_start, 'history_days_start')
    end_column = broadcast(window_end, 'history_days_end')

    # Everything up to and including the transition day, ordered by
    # location and then date so positional slicing counts back in time.
    history = regression.loc[regression['date'] <= row_transition]
    history = (history.reset_index()
               .set_index(['location_id', 'date'])
               .sort_index())
    log_residual = np.log(history['beta'] / history['beta_pred'])

    def windowed_mean(group):
        # Mean over rows from `window_end` back up to (but excluding)
        # `window_start` rows from the end of this location's history.
        return group.iloc[-window_end:-window_start].mean()

    log_beta_resid_mean = (log_residual
                           .groupby(level='location_id')
                           .apply(windowed_mean)
                           .rename('log_beta_residual_mean'))
    draw_column = broadcast(draw_id, 'draw')

    columns = [
        deaths_column,
        window_column,
        fit_final,
        pred_start,
        scale_init,
        start_column,
        end_column,
        log_beta_resid_mean,
        draw_column,
    ]
    return pd.concat(columns, axis=1)
コード例 #2
0
    def test_regression_io(self, tmpdir, coefficients, dates, regression_beta,
                           location_data, parameters):
        """
        Round-trip regression-stage outputs between the two data interfaces.

        Files are written through a RegressionDataInterface and read back
        through a ForecastDataInterface. Both use CSV marshalls derived from
        the same RegressionPaths rooted at ``tmpdir``. Only loading is
        exercised on the forecast side, since the regression interface does
        all of the saving.
        """
        draw = 4
        paths = RegressionPaths(Path(tmpdir))
        writer = RegressionDataInterface(
            infection_paths=None,
            regression_paths=paths,
            covariate_paths=None,
            regression_marshall=CSVMarshall(paths.root_dir),
        )
        reader = ForecastDataInterface(
            forecast_paths=None,
            regression_paths=None,
            covariate_paths=None,
            regression_marshall=CSVMarshall.from_paths(paths),
            forecast_marshall=None,
        )

        # Write phase: what the regression stage would normally persist.
        writer.save_regression_coefficients(coefficients, draw_id=draw)
        writer.save_beta_param_file(parameters, draw_id=draw)
        writer.save_date_file(dates, draw_id=draw)
        writer.save_regression_betas(regression_beta, draw_id=draw)
        writer.save_location_data(location_data, draw_id=draw)

        # Read phase: the same artifacts, as the forecast stage loads them.
        coefficients_back = reader.load_regression_coefficients(draw_id=draw)
        parameters_back = reader.load_beta_params(draw_id=draw)
        transition_back = reader.load_transition_date(draw_id=draw)
        regression_beta_back = reader.load_beta_regression(draw_id=draw)
        location_data_back = reader.load_infection_data(draw_id=draw)

        # Check phase.
        pandas.testing.assert_frame_equal(coefficients, coefficients_back)

        # Some loaders run pandas.to_datetime on their columns, so the
        # date-bearing frames are compared after date conversion. The
        # expected transition dates are the saved end dates, ordered by
        # location and relabelled as 'date'.
        end_dates = dates.set_index('location_id').sort_index()['end_date']
        transition_expected = end_dates.rename('date').reset_index()
        transition_back = transition_back.reset_index()
        date_checks = (
            (transition_expected, transition_back),
            (regression_beta, regression_beta_back),
            (location_data, location_data_back),
        )
        for saved, reloaded in date_checks:
            assert_equal_after_date_conversion(saved,
                                               reloaded,
                                               date_cols=['date'])

        # load_beta_params hands back a dict rather than a DataFrame, and the
        # CSV save/load cycle can introduce rounding error in the values.
        parameters_expected = parameters.set_index(
            'params')['values'].to_dict()
        try:
            assert parameters_expected == parameters_back
        except AssertionError:
            # Exact match failed: require identical keys ...
            assert set(parameters_expected) == set(parameters_back)
            # ... and values that agree to 15 decimal places.
            for key, wanted in parameters_expected.items():
                got = parameters_back[key]
                numpy.testing.assert_almost_equal(got, wanted, decimal=15)
            warnings.warn(
                "beta fit parameters accurate only to 15 decimal places after save/load cycle"
            )