def compute_initial_beta_scaling_parameters_by_draw(
        draw_id: int,
        total_deaths: pd.Series,
        beta_scaling: Dict,
        data_interface: ForecastDataInterface) -> pd.DataFrame:
    """Assemble the per-location beta-scaling parameter columns for one draw.

    Each entry appended to ``columns`` is a pandas Series indexed by
    location and named for the column it will occupy in the returned
    DataFrame.
    """
    columns = [
        total_deaths.copy(),
        pd.Series(beta_scaling['window_size'],
                  index=total_deaths.index,
                  name='window_size'),
    ]

    # "Today" is unique per draw (it combines the elastispliner's predicted
    # days in the ODE fit with the infectionator's random infection-to-death
    # lag), so we look it up rather than recompute it.
    transition_date = data_interface.load_transition_date(draw_id)

    beta_regression = (data_interface
                       .load_beta_regression(draw_id)
                       .set_index('location_id')
                       .sort_index())
    location_index = beta_regression.index

    # Rows on the transition day give us the initial scaling parameter:
    # the ratio of the fitted beta to the predicted beta at handoff.
    on_transition_day = beta_regression.loc[
        beta_regression['date'] == transition_date.loc[location_index]
    ]
    columns.extend([
        on_transition_day['beta'].rename('fit_final'),
        on_transition_day['beta_pred'].rename('pred_start'),
        (on_transition_day['beta'] / on_transition_day['beta_pred']).rename('scale_init'),
    ])

    # Residual-averaging window, drawn deterministically from the draw id.
    # The window endpoints are kept for plotting/debugging downstream.
    rng = np.random.RandomState(draw_id)
    window_start = rng.randint(1, beta_scaling['average_over_min'])
    window_end = rng.randint(window_start + 7, beta_scaling['average_over_max'])
    columns.extend([
        pd.Series(window_start, index=total_deaths.index, name='history_days_start'),
        pd.Series(window_end, index=total_deaths.index, name='history_days_end'),
    ])

    # Mean log residual of beta over the chosen trailing window, per location,
    # restricted to dates at or before the transition.
    history = (beta_regression
               .loc[beta_regression['date'] <= transition_date.loc[location_index]]
               .reset_index()
               .set_index(['location_id', 'date'])
               .sort_index())
    log_residual = np.log(history['beta'] / history['beta_pred'])
    residual_mean = (log_residual
                     .groupby(level='location_id')
                     .apply(lambda s: s.iloc[-window_end:-window_start].mean())
                     .rename('log_beta_residual_mean'))
    columns.append(residual_mean)

    columns.append(pd.Series(draw_id, index=total_deaths.index, name='draw'))

    return pd.concat(columns, axis=1)
def test_regression_io(self, tmpdir, coefficients, dates, regression_beta,
                       location_data, parameters):
    """
    Test I/O relating to regression stage.

    This only includes loading files, as they are all saved by the
    RegressionDataInterface.
    """
    draw = 4
    regress_paths = RegressionPaths(Path(tmpdir))
    rdi = RegressionDataInterface(
        infection_paths=None,
        regression_paths=regress_paths,
        covariate_paths=None,
        regression_marshall=CSVMarshall(regress_paths.root_dir),
    )
    fdi = ForecastDataInterface(
        forecast_paths=None,
        regression_paths=None,
        covariate_paths=None,
        regression_marshall=CSVMarshall.from_paths(regress_paths),
        forecast_marshall=None,
    )

    # Step 1: save files (normally done in regression)
    rdi.save_regression_coefficients(coefficients, draw_id=draw)
    rdi.save_beta_param_file(parameters, draw_id=draw)
    rdi.save_date_file(dates, draw_id=draw)
    rdi.save_regression_betas(regression_beta, draw_id=draw)
    rdi.save_location_data(location_data, draw_id=draw)

    # Step 2: load files as they would be loaded in forecast
    loaded_coefficients = fdi.load_regression_coefficients(draw_id=draw)
    loaded_parameters = fdi.load_beta_params(draw_id=draw)
    loaded_transition_dates = fdi.load_transition_date(draw_id=draw)
    loaded_regression_beta = fdi.load_beta_regression(draw_id=draw)
    loaded_location_data = fdi.load_infection_data(draw_id=draw)

    # Step 3: test files
    pandas.testing.assert_frame_equal(coefficients, loaded_coefficients)

    # Some load methods run pandas.to_datetime on date columns, so compare
    # after normalizing dates on both sides.
    end_dates = dates.set_index('location_id').sort_index()['end_date']
    transition_dates = end_dates.rename('date').reset_index()
    loaded_transition_dates = loaded_transition_dates.reset_index()
    assert_equal_after_date_conversion(transition_dates,
                                       loaded_transition_dates,
                                       date_cols=['date'])
    assert_equal_after_date_conversion(regression_beta,
                                       loaded_regression_beta,
                                       date_cols=['date'])
    assert_equal_after_date_conversion(location_data,
                                       loaded_location_data,
                                       date_cols=['date'])

    # load_beta_params does not return a DataFrame but instead a dict;
    # in addition, some rounding error occurs in the save/load from CSV.
    expected_parameters = parameters.set_index('params')['values'].to_dict()
    try:
        assert expected_parameters == loaded_parameters
    except AssertionError:
        # assert keys are identical
        assert set(expected_parameters) == set(loaded_parameters)
        # assert each value is accurate to 15 decimal places
        for key, expected in expected_parameters.items():
            numpy.testing.assert_almost_equal(loaded_parameters[key],
                                              expected,
                                              decimal=15)
        warnings.warn(
            "beta fit parameters accurate only to 15 decimal places after save/load cycle"
        )