def predict(self, times, predict_space, predict_group):
    """Predict for one group by blending tight/loose models and averaging.

    For each parameterization (beta, p) whose weight is positive, the loose
    and tight models are both evaluated and merged with
    ``convex_combination`` (tight as ``pred1``, loose as ``pred2``) between
    ``self.blend_start_t`` and ``self.blend_end_t``.  If both weights are
    positive the two blends are merged with ``model_average``; if exactly
    one is positive and the other is exactly zero, that single blend is
    returned.

    Args:
        times: prediction times, forwarded as ``t`` to the models and to
            ``convex_combination``.
        predict_space: functional form to predict in, forwarded as
            ``prediction_functional_form`` / ``pred_fun``.
        predict_group: group name to predict for.

    Returns:
        The blended (and, when both weights are positive, averaged)
        predictions.

    Raises:
        RuntimeError: if the weights do not select any model, i.e. neither
            is positive, or one is positive while the other is negative.
    """
    beta_weight = self.beta_weight
    p_weight = self.p_weight
    use_beta = beta_weight > 0
    use_p = p_weight > 0

    def _blend(loose_model, tight_model):
        # Evaluate loose first, then tight (same call order as before), and
        # blend them over the configured window.
        loose = loose_model.predict(
            t=times,
            group_name=predict_group,
            prediction_functional_form=predict_space)
        tight = tight_model.predict(
            t=times,
            group_name=predict_group,
            prediction_functional_form=predict_space)
        return convex_combination(t=times,
                                  pred1=tight,
                                  pred2=loose,
                                  pred_fun=predict_space,
                                  start_day=self.blend_start_t,
                                  end_day=self.blend_end_t)

    beta_predictions = None
    p_predictions = None
    if use_beta:
        beta_predictions = _blend(self.loose_beta_model,
                                  self.tight_beta_model)
    if use_p:
        p_predictions = _blend(self.loose_p_model, self.tight_p_model)

    if use_beta and use_p:
        return model_average(pred1=beta_predictions,
                             pred2=p_predictions,
                             w1=beta_weight,
                             w2=p_weight,
                             pred_fun=predict_space)
    if use_beta and p_weight == 0:
        return beta_predictions
    if use_p and beta_weight == 0:
        return p_predictions

    # Anything else (both zero, a negative weight, NaN) selects no model.
    raise RuntimeError(
        'Cannot combine predictions: beta_weight='
        f'{beta_weight!r} and p_weight={p_weight!r} must both be '
        'non-negative with at least one of them positive.')
def test_convex_combination(t, mat1, mat2, pred_fun, start_day, end_day,
                            result):
    """Check ``utils.convex_combination`` against a precomputed ``result``.

    The two inputs ``mat1`` and ``mat2`` are blended over ``t`` in the space
    given by ``pred_fun`` between ``start_day`` and ``end_day``; the output
    must match ``result`` within ``np.allclose`` tolerance.
    """
    blend_window = {'start_day': start_day, 'end_day': end_day}
    blended = utils.convex_combination(t, mat1, mat2, pred_fun,
                                       **blend_window)
    assert np.allclose(result, blended)
def ap_model(df, model_location, location_cov, n_draws, peaked_groups,
             exclude_groups, fix_gamma, fix_point, fix_day, pred_days=150):
    """Fit a tight-prior and a loose-prior ``APModel`` and blend their draws.

    Two ``APModel`` instances are built from the same data.  They differ in
    the observation-SE column (``obs_se_tight`` / ``obs_se_loose``), the
    ``prior_modifier`` and the functional prior placed on
    ``params[0] * params[1]``.  Both are run, their per-group draws are
    blended with ``convex_combination``, and an additional ``'overall'``
    entry is built from ``create_overall_draws`` using ``location_cov``.

    Args:
        df: input frame.  It is copied first, so the caller's frame is not
            modified by the column assignments made here.  The code reads
            the columns ``'Days'``, ``'pseudo'``,
            ``'ln(age-standardized death rate)'``, ``'location_id'``,
            ``'intercept'`` and ``COVARIATE``.
        model_location: group key of the location being modelled.
        location_cov: covariate value for that location; used as the middle
            entry of ``predict_cov``.
        n_draws: number of draws, forwarded to ``run`` and used as
            ``sample_size`` for the overall draws.
        peaked_groups: forwarded to ``APModel`` as ``peaked_groups``.
        exclude_groups: forwarded to ``run`` through ``draw_dict``.
        fix_gamma: when truthy, the second fixed-effect bound of each model
            is replaced by ``[1, 1]`` before running.
        fix_point: if not None, used as the last observation for
            ``model_location`` and for the overall draws, and passed to the
            models via ``last_info``.
        fix_day: if not None, used as the last day when truncating the
            overall draws (and stored in ``last_info`` alongside
            ``fix_point``).
        pred_days: number of prediction time points
            (``np.arange(pred_days)``).

    Returns:
        ``(tight_model, loose_model, combined_draws)`` where
        ``combined_draws`` maps each group key, plus ``'overall'``, to a
        ``(times, draws)`` tuple.
    """
    # our dataset (rename days as model assumes it's lower case)
    df = df.copy()
    df = df.rename(index=str, columns={'Days': 'days'})

    ##########
    # SET UP #
    ##########
    # basic information and model setting
    basic_info_dict = dict(all_cov_names=[COVARIATE],
                           col_t='days',
                           col_group='location',
                           predict_space=ln_gaussian_pdf,
                           col_obs_compare='ln asddr',
                           peaked_groups=peaked_groups)
    basic_model_dict = dict(
        param_names=['alpha', 'beta', 'p'],
        col_covs=[['intercept'], [COVARIATE], ['intercept']],
        link_fun=[np.exp, lambda x: x, np.exp],
        var_link_fun=[lambda x: x, lambda x: x, lambda x: x])

    # basic fit parameter
    dummy_gprior = [0.0, np.inf]
    dummy_uprior = [-np.inf, np.inf]
    zero_uprior = [0.0, 0.0]
    fe_init = np.array([-2.5, 28.0, -8.05])
    fe_bounds = [[-np.inf, 0.0], [15.0, 100.0], [-10, -6]]
    options = {'ftol': 1e-10, 'gtol': 1e-10, 'maxiter': 500, 'disp': False}
    basic_fit_dict = dict(fe_init=fe_init,
                          fe_bounds=fe_bounds,
                          re_bounds=[zero_uprior] * 3,
                          fe_gprior=[dummy_gprior] * 3,
                          re_gprior=[dummy_gprior] * 3,
                          options=options)
    # The joint fit reuses `options` for smart initialization but caps its
    # own optimizer at 10 iterations.
    basic_joint_model_fit_dict = dict(
        fe_gprior=[dummy_gprior] * 3,
        re_bounds=[dummy_uprior] * 3,
        re_gprior=[dummy_gprior, [0.0, 10.0], dummy_gprior],
        smart_initialize=True,
        smart_init_options=options,
        options={
            'ftol': 1e-10,
            'gtol': 1e-10,
            'maxiter': 10,
            'disp': False
        })

    # draw related parameters
    draw_dict = dict(n_draws=n_draws,
                     prediction_times=np.arange(pred_days),
                     cv_lower_threshold=1e-4,
                     cv_upper_threshold=1.,
                     smoothed_radius=[5, 5],
                     exclude_groups=exclude_groups,
                     exclude_below=0,
                     num_smooths=2)

    # for the convex combination
    start_day = 2
    end_day = 25

    # for prediction of places with no data
    alpha_times_beta = np.exp(0.7)
    obs_bounds = [40, np.inf]  # filter the data rich models
    predict_cov = np.array([1.0, location_cov, 1.0])  # new covariates for the places.

    # tight prior control panel
    tight_info_dict = {
        **deepcopy(basic_info_dict),
        'fun': ln_gaussian_cdf,
        'col_obs': 'ln ascdr',
        'col_obs_se': 'obs_se_tight',
        #'obs_se_func': lambda x: (1. / (1. + x)),
        'obs_se_func': None,
        'prior_modifier': lambda x: 10**(min(0.0, max(-1.0, 0.1 * x - 1.5))) / 10
    }
    # NOTE(review): np.exp(0.7) here duplicates `alpha_times_beta` above;
    # presumably the two are meant to stay in sync.
    tight_fit_dict = {
        **deepcopy(basic_fit_dict),
        'fun_gprior': [lambda params: params[0] * params[1], [np.exp(0.7), 1.0]]
    }

    # loose prior control panel
    loose_info_dict = {
        **deepcopy(basic_info_dict),
        'fun': ln_gaussian_cdf,
        'col_obs': 'ln ascdr',
        'col_obs_se': 'obs_se_loose',
        #'obs_se_func': lambda x: (1 / (0.1 + x**1.4)),
        'obs_se_func': None,
        'prior_modifier': lambda x: 0.2
    }
    loose_fit_dict = {
        **deepcopy(basic_fit_dict),
        'fun_gprior': [lambda params: params[0] * params[1], dummy_gprior]
    }

    # prepare data (must exponentiate smoothed column, non-logged col is not smoothed)
    df['obs_se_tight'] = 1 / (1 + df['days'])
    df['obs_se_loose'] = 1 / (1 + df['days']**1.4)
    # Rows flagged as pseudo get the fixed PSEUDO_SE in both SE columns.
    df.loc[df['pseudo'] == 1, 'obs_se_tight'] = PSEUDO_SE
    df.loc[df['pseudo'] == 1, 'obs_se_loose'] = PSEUDO_SE
    df['Age-standardized death rate'] = np.exp(
        df['ln(age-standardized death rate)'])
    df = process_input(
        df,
        'location_id',
        'days',
        'Age-standardized death rate',
        col_covs=[COVARIATE, 'intercept', 'obs_se_tight', 'obs_se_loose'])

    #############
    # RUN MODEL #
    #############
    # set up last info
    if fix_point is not None:
        last_info = {model_location: [fix_day, fix_point]}
    else:
        last_info = None

    # The Alpha Prior Model
    tight_model = APModel(all_data=df,
                          **tight_info_dict,
                          joint_model_fit_dict=basic_joint_model_fit_dict,
                          basic_model_dict=basic_model_dict,
                          fit_dict=tight_fit_dict)
    if fix_gamma:
        # Pin the second fixed effect to [1, 1]; the other bounds are kept.
        fe_bounds = tight_model.fit_dict['fe_bounds']
        tight_model.fit_dict.update(
            {'fe_bounds': [fe_bounds[0], [1, 1], fe_bounds[2]]})
    tight_model.run(last_info=last_info, **draw_dict)
    loose_model = APModel(all_data=df,
                          **loose_info_dict,
                          joint_model_fit_dict=basic_joint_model_fit_dict,
                          basic_model_dict=basic_model_dict,
                          fit_dict=loose_fit_dict)
    if fix_gamma:
        fe_bounds = loose_model.fit_dict['fe_bounds']
        loose_model.fit_dict.update(
            {'fe_bounds': [fe_bounds[0], [1, 1], fe_bounds[2]]})
    loose_model.run(last_info=last_info, **draw_dict)

    # get truncated draws
    tight_draws = tight_model.process_draws(draw_dict['prediction_times'],
                                            last_info=last_info)
    loose_draws = loose_model.process_draws(draw_dict['prediction_times'],
                                            last_info=last_info)
    combined_draws = {}
    for group in tight_draws.keys():
        # Rows of each draw matrix are reordered by their last-column value
        # before blending, so tight and loose draws are paired by rank.
        draws = convex_combination(
            np.arange(tight_draws[group][1].shape[1]),
            tight_draws[group][1][np.argsort(tight_draws[group][1][:, -1]), :],
            loose_draws[group][1][np.argsort(loose_draws[group][1][:, -1]), :],
            basic_info_dict['predict_space'],
            start_day=start_day,
            end_day=end_day)
        if group == model_location and fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = tight_model.models[group].obs[-1]
        # Exponentiate, accumulate along axis 1, offset by exp(last_obs) and
        # take the log again (presumably log daily -> log cumulative rate).
        combined_draws.update({
            group: (tight_draws[group][0],
                    np.log(np.exp(last_obs) + np.exp(draws).cumsum(axis=1)))
        })

    # get overall draws
    filtered_tight_models = tight_model.run_filtered_models(
        df=tight_model.all_data, obs_bounds=obs_bounds)
    overall_tight_draws = tight_model.create_overall_draws(
        draw_dict['prediction_times'],
        filtered_tight_models,
        predict_cov,
        alpha_times_beta=alpha_times_beta,
        sample_size=draw_dict['n_draws'],
        slope_at=10,
        epsilon=draw_dict['cv_lower_threshold'])
    filtered_loose_models = loose_model.run_filtered_models(
        df=loose_model.all_data, obs_bounds=obs_bounds)
    overall_loose_draws = loose_model.create_overall_draws(
        draw_dict['prediction_times'],
        filtered_loose_models,
        predict_cov,
        alpha_times_beta=alpha_times_beta,
        sample_size=draw_dict['n_draws'],
        slope_at=10,
        epsilon=draw_dict['cv_lower_threshold'])

    # get specs and truncate overall, then combine
    if model_location in list(combined_draws.keys()):
        # Location has its own fitted model: fall back to that model's last
        # time point / observation when no fixed values were supplied.
        # last_day = tight_model.models[model_location].t[-1]
        if fix_day is None:
            last_day = tight_model.models[model_location].t[-1]
        else:
            last_day = fix_day
        if fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = tight_model.models[model_location].obs[-1]
        overall_time = draw_dict['prediction_times'][int(np.round(last_day)):]
    else:
        # Location has no fitted model: start from the first prediction time
        # and RATE_THRESHOLD unless fixed values were supplied.
        if fix_day is None:
            last_day = draw_dict['prediction_times'][0]
        else:
            last_day = fix_day
        if fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = RATE_THRESHOLD
        overall_time = np.arange(last_day, pred_days)
    overall_tight_draws = truncate_draws(
        t=draw_dict['prediction_times'],
        draws=overall_tight_draws,
        draw_space=basic_info_dict['predict_space'],
        last_day=last_day,
        last_obs=last_obs,
        last_obs_space=tight_info_dict['fun'])
    overall_loose_draws = truncate_draws(
        t=draw_dict['prediction_times'],
        draws=overall_loose_draws,
        draw_space=basic_info_dict['predict_space'],
        last_day=last_day,
        last_obs=last_obs,
        last_obs_space=loose_info_dict['fun'])
    # Same rank-paired blend as for the per-group draws above.
    draws = convex_combination(
        np.arange(overall_tight_draws.shape[1]),
        overall_tight_draws[np.argsort(overall_tight_draws[:, -1]), :],
        overall_loose_draws[np.argsort(overall_loose_draws[:, -1]), :],
        basic_info_dict['predict_space'],
        start_day=start_day,
        end_day=end_day)
    # The first entry of overall_time is dropped for the 'overall' time axis.
    combined_draws.update({
        'overall': (overall_time[1:],
                    np.log(np.exp(last_obs) + np.exp(draws).cumsum(axis=1)))
    })

    return tight_model, loose_model, combined_draws
def run_death_models():
    """Fit the death-rate model for one location and write all outputs.

    Command-line driven: parses its own arguments, reads the data,
    covariate, peaked-location and last-day CSV files, and then runs one of
    two models depending on how many rows the target location has:

    * fewer than ``DATA_THRESHOLD`` rows: the alpha-prior model
      (``ap_model``), labelled ``'AP'``;
    * otherwise: ``ap_flat_asym_model``, labelled ``'AP flat asymmetrical'``.

    Outputs written under ``--output_dir``: ``data.csv``,
    ``point_estimate.csv``, pickled models / fit dicts / draws, and
    ``model_fits.pdf``.

    Raises:
        ValueError: if the data-poor model is selected and
            ``--covariate_effect`` is neither ``'beta'`` nor ``'gamma'``.
            (Previously this surfaced later as an unbound ``fix_gamma``.)
    """
    # Debugging aid: uncomment to bypass the command line.
    # args = argparse.Namespace(
    #     cov_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555_covariate.csv',
    #     covariate_effect='gamma',
    #     data_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555.csv',
    #     last_day_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/last_day.csv',
    #     model_location_id=555,
    #     n_draws=333,
    #     n_b=43,
    #     output_dir='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555',
    #     peaked_file='/ihme/covid-19/deaths/mobility_inputs/2020_04_20/peak_locs_april20_.csv'
    # )
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_location_id',
                        help='id of location to which we are standardizing.',
                        type=int)
    parser.add_argument('--data_file',
                        help='Name of location-standardized data file.',
                        type=str)
    parser.add_argument('--cov_file',
                        help='Name of covariate file.',
                        type=str)
    parser.add_argument('--last_day_file',
                        help='Name of last day of deaths file.',
                        type=str)
    parser.add_argument('--peaked_file',
                        help='Name of peaked locations file.',
                        type=str)
    parser.add_argument('--output_dir',
                        help='Where we are storing results.',
                        type=str)
    parser.add_argument('--covariate_effect',
                        help='Whether covariate is acting on beta or gamma.',
                        type=str)
    parser.add_argument('--n_draws',
                        help='How many samples to take.',
                        type=int)
    args = parser.parse_args()
    logger.info(args)

    # String group key used by the models (location ids are '_'-prefixed).
    location_key = f'_{args.model_location_id}'
    # Columns handed to the models and saved as data.csv.
    model_columns = [
        'location_id', 'intercept', 'Days', 'pseudo',
        'ln(age-standardized death rate)', COVARIATE
    ]

    # read data
    df = pd.read_csv(args.data_file)
    cov_df = pd.read_csv(args.cov_file)

    # only keep if more than one data point is present
    keep_idx = df.groupby('location_id')['location_id'].transform('count') > 1
    df = df[keep_idx].reset_index(drop=True)

    # try setting floor for covariate
    cov_df.loc[cov_df[COVARIATE] < 0.75, COVARIATE] = 0.75

    # attach covs to data file
    df = pd.merge(df, cov_df[['location_id', COVARIATE]], how='left')
    if df[COVARIATE].isnull().any():
        missing_locs = df.loc[df[COVARIATE].isnull(),
                              'Location'].unique().tolist()
        print(
            f'The following locations are missing covariates: {", ".join(missing_locs)}'
        )
        df = df.loc[~df[COVARIATE].isnull()]
    df = df.sort_values(['location_id', 'Days']).reset_index(drop=True)

    # encode location_id for more explicit str indexing in model
    df['location_id'] = '_' + df['location_id'].astype(str)

    # add intercept
    df['intercept'] = 1.0

    # identify covariate value for our location
    location_cov = cov_df.loc[cov_df['location_id'] == args.model_location_id,
                              COVARIATE].item()
    n_b = get_number_of_basis_functions(location_cov)

    # get list of peaked locations
    peaked_df = pd.read_csv(args.peaked_file)
    peaked_df['location_id'] = '_' + peaked_df['location_id'].astype(str)

    # get true ln(dr) on last day
    last_day_df = pd.read_csv(args.last_day_file)
    last_day_df = last_day_df.loc[last_day_df['location_id'] ==
                                  args.model_location_id]
    if last_day_df.empty:
        fix_point = None
        fix_day = None
    else:
        fix_point = last_day_df['ln(death rate)'].item()
        fix_day = last_day_df['Days'].item()

    ## run models
    model_seed = get_hash(location_key)
    np.random.seed(model_seed)

    # Group lists shared by both model variants.
    peaked_groups = peaked_df.loc[
        peaked_df['location_id'].isin(df['location_id'].unique().tolist()),
        'location_id'].to_list()
    exclude_groups = peaked_df.loc[
        peaked_df['Location'].str.startswith('Wuhan'),
        'location_id'].unique().tolist()

    # AP model for data poor
    if len(df.loc[df['location_id'] == location_key]) < DATA_THRESHOLD:
        logger.info('Running data poor model')

        # are we using a beta or gamma covariate
        if args.covariate_effect == 'beta':
            fix_gamma = True
        elif args.covariate_effect == 'gamma':
            fix_gamma = False
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(
                "--covariate_effect must be 'beta' or 'gamma' for the data "
                f'poor model, got {args.covariate_effect!r}.')

        # alpha prior model (no flat top)
        tight_model, loose_model, draws = ap_model(
            df=df[model_columns],
            model_location=location_key,
            location_cov=location_cov,
            n_draws=args.n_draws,
            peaked_groups=peaked_groups,
            exclude_groups=exclude_groups,
            fix_gamma=fix_gamma,
            fix_point=fix_point,
            fix_day=fix_day)
        model = 'AP'

        # get point estimate
        if location_key in draws:
            t = np.arange(PRED_DAYS)
            loose_asdr = loose_model.models[location_key].predict(
                t, group_name=location_key)
            tight_asdr = tight_model.models[location_key].predict(
                t, group_name=location_key)
            # With no row in the last-day file fix_day is None and
            # `fix_day + 2` would raise TypeError; fall back to the model's
            # last observed time, the same fallback ap_model uses.
            if fix_day is None:
                blend_origin = tight_model.models[location_key].t[-1]
            else:
                blend_origin = fix_day
            # NOTE(review): loose is passed before tight here, whereas
            # ap_model passes tight first; order kept as-is, confirm intent.
            ln_asdr = convex_combination(t,
                                         loose_asdr,
                                         tight_asdr,
                                         ln_gaussian_cdf,
                                         start_day=blend_origin + 2,
                                         end_day=blend_origin + 25)
            asdr = np.exp(ln_asdr)
        else:
            t = draws['overall'][0]
            asdr = np.exp(draws['overall'][1]).mean(axis=0)
    else:
        # AP model for data rich
        logger.info('Running data rich model.')
        tight_model, draws = ap_flat_asym_model(
            df=df[model_columns],
            model_location=location_key,
            n_draws=args.n_draws,
            peaked_groups=peaked_groups,
            exclude_groups=exclude_groups,
            fix_point=fix_point,
            fix_day=fix_day,
            n_b=n_b)
        loose_model = tight_model  # just to plug into plot
        model = 'AP flat asymmetrical'

        # get point estimate
        t = np.arange(PRED_DAYS)
        asdr = np.exp(tight_model.predict(t, ln_gaussian_cdf, location_key))

    # store output as daily: difference consecutive values and date each
    # one relative to the location's threshold date
    d = pd.to_datetime(
        cov_df.loc[cov_df['location_id'] == args.model_location_id,
                   'threshold_date'].item())
    asddr = asdr[1:] - asdr[:-1]
    point_df = pd.DataFrame({
        'location_id': args.model_location_id,
        'Date': [d + timedelta(days=int(t_i)) for t_i in t[1:]],
        'Age-standardized death rate': asddr
    })

    # only save this location and overall draws
    subset_draws = {
        model_label: draws[model_label]
        for model_label in [location_key, 'overall']
        if model_label in draws
    }

    # store outputs
    # data
    df[model_columns].to_csv(f'{args.output_dir}/data.csv', index=False)

    # point estimate
    point_df.to_csv(f'{args.output_dir}/point_estimate.csv', index=False)

    # loose
    if model == 'AP':
        logger.info('Writing loose models.')
        with open(f'{args.output_dir}/loose_models.pkl', 'wb') as fwrite:
            pickle.dump(loose_model.models, fwrite, -1)
        with open(f'{args.output_dir}/loose_model_fit_dict.pkl',
                  'wb') as fwrite:
            pickle.dump(loose_model.fit_dict, fwrite, -1)
    else:
        # GM data
        logger.info('Writing Gaussian mixture metadata')
        with open(f'{args.output_dir}/gaussian_mixtures.pkl',
                  'wb') as fwrite:
            pickle.dump(tight_model.gaussian_mixtures, fwrite, -1)

    # tight
    logger.info('Writing tight models')
    with open(f'{args.output_dir}/tight_models.pkl', 'wb') as fwrite:
        pickle.dump(tight_model.models, fwrite, -1)
    with open(f'{args.output_dir}/tight_model_fit_dict.pkl', 'wb') as fwrite:
        pickle.dump(tight_model.fit_dict, fwrite, -1)

    # subset draws
    logger.info('Writing draws')
    with open(f'{args.output_dir}/draws.pkl', 'wb') as fwrite:
        pickle.dump(subset_draws, fwrite, -1)

    # plot (special condition if using multiple Gaussian)
    model_instance = None if model == 'AP' else tight_model
    logger.info('Writing model fit plots.')
    with PdfPages(f'{args.output_dir}/model_fits.pdf') as pdf:
        for location in tight_model.models.keys():
            location_rows = df['location_id'] == location
            location_name = df.loc[location_rows, 'Location'].values[0]
            plot_location(
                location=location,
                location_name=location_name,
                covariate_val=cov_df.loc[cov_df['Location'] == location_name,
                                         COVARIATE].item(),
                tm=tight_model.models[location],
                lm=loose_model.models[location],
                model_instance=model_instance,
                draw=draws[location],
                population=df.loc[location_rows, 'population'].values[0],
                pdf=pdf,
                n_b=n_b)