def fit_posterior(dm, region, sex, year, fast_fit=False,
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
    fast_fit : sample 101 draws from posterior, don't try for convergence (fast for testing)
    inconsistent_fit : fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Side effects: stores MAP/MCMC fits on ``dm``, and saves a posterior plot
    under the job working directory.

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')
    """
    # NOTE(review): params_to_fit has a mutable default list, shared across
    # calls; harmless only as long as no caller mutates it.
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        # prefer the on-disk "new format" copy of the model if present
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        # fall back to rebuilding the model from the DiseaseJson blob
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros
    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():
        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the empirical prior if both mean and std cover the full
        # 0..100 age range (101 single-year ages)
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors (from the model settings) take precedence over the
        # empirical-prior alphas, hence the update() after the assignment
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(dm.get_empirical_prior(param_type[t]).get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        # row-level filter: area must be in the prediction subtree (or 'all'),
        # the years must overlap the window for predict_year, and sex must match
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # legacy selection logic, kept to cross-check the new is_relevant() filter
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation for 1990"
    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            # relative-risk is fit on the log-normal scale; all other rates
            # use the negative-binomial rate model
            model.vars += ism.age_specific_rate(model, t,
                                                reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                                mu_age=None,
                                                mu_age_parent=emp_priors.get((t, 'mu')),
                                                sigma_age_parent=emp_priors.get((t, 'sigma')),
                                                rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                                                zero_re=zero_re)

            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                # NOTE(review): iter/burn/thin are not defined in this function;
                # this relies on module-level globals (iter would otherwise be
                # the builtin) -- confirm these are set before the slow path runs
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)
    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                     priors=emp_priors, zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            # NOTE(review): same iter/burn/thin module-global dependency as above
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf
            posteriors[t] = covariate_model.predict_for(model, model.parameters.get(t, {}),
                                                        predict_area, predict_sex, predict_year,
                                                        predict_area, predict_sex, predict_year,
                                                        True,  # population weighted averages
                                                        model.vars[t], lower, upper)
    try:
        # NOTE(review): 'vars' here is the Python builtin, not a local --
        # presumably this was meant to be model.vars; confirm against
        # graphics.plot_fit's expected signature
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png' % (predict_area, predict_sex, predict_year))
    except Exception, e:
        # best-effort plotting: a graphics failure must not abort the fit
        print 'Error generating output graphics'
        print e
def fit_emp_prior(id, param_type, fast_fit=False, generate_emp_priors=True, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for
    fast_fit : bool, if true take 101 MCMC draws without aiming for convergence
    generate_emp_priors : bool, if true predict region/sex/year empirical priors
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    alt_prior : passed through to covariate_model.predict_for
    global_heterogeneity : str, heterogeneity prior applied to every rate type

    Returns the DiseaseJson ``dm`` early (without fitting) if there is no data
    for the requested parameter type; otherwise stores fits on ``dm``, writes
    plots, and saves the model to disk.

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """
    # NOTE(review): parameter 'id' shadows the builtin id()
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    dm = dismod3.load_disease_model(id)
    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    # map the long parameter name to the single-letter rate-type code
    t = {'incidence': 'i', 'prevalence': 'p', 'remission': 'r', 'excess-mortality': 'f', 'prevalence_x_excess-mortality': 'pf'}[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print 'No data for type %s, exiting' % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    #model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    #missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    #model.input_data['effective_sample_size'][missing_ess] = 1.
    #model.input_data['z_overdisperse'] = 1.
    #print model.describe(t)
    #model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    #model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    #model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    #model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    #generate_emp_priors = False

    print 'fitting', t
    # global fit: reference everything to the world ('all'/'total'/'all');
    # relative-risk uses the log-normal rate model, everything else neg-binom
    model.vars += ism.age_specific_rate(model, t, reference_area='all', reference_sex='total', reference_year='all',
                                        mu_age=None, mu_age_parent=None, sigma_age_parent=None,
                                        rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                                        zero_re=zero_re)

    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True)

    # attach posterior-predictive summaries and residuals to the data table
    stats = dm.vars['p_pred'].stats(batches=5)
    dm.vars['data']['mu_pred'] = stats['mean']
    dm.vars['data']['sigma_pred'] = stats['standard deviation']

    stats = dm.vars['pi'].stats(batches=5)
    dm.vars['data']['mc_error'] = stats['mc error']

    dm.vars['data']['residual'] = dm.vars['data']['value'] - dm.vars['data']['mu_pred']
    dm.vars['data']['abs_residual'] = pl.absolute(dm.vars['data']['residual'])

    graphics.plot_fit(model, data_types=[t], ylab=['PY'], plot_config=(1, 1), fig_size=(8, 8))

    if generate_emp_priors:
        # predict region/sex/year empirical prior curves from the global fit
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower = model.parameters[t]['level_bounds']['lower']
                        upper = model.parameters[t]['level_bounds']['upper']
                    else:
                        lower = 0
                        upper = pl.inf
                    emp_priors = covariate_model.predict_for(model, model.parameters[t],
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior, vars, lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))

                    if 'eta' in vars:
                        # widen the prior std to account for negative-binomial
                        # overdispersion (eta is the log of the dispersion)
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(vars['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

                    pl.plot(model.parameters['ages'], dm.get_mcmc('emp_prior_mean', key), color='grey', label=a, zorder=-10, alpha=.5)
        pl.savefig(dir + '/prior-%s.png' % param_type)

    store_effect_coefficients(dm, vars, param_type)

    #graphics.plot_one_ppc(vars, t)
    #pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-%s-convergence.png' % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-%s-trace.png' % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + '/prior-%s-effects.png' % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (id, param_type))
    except IOError, e:
        print e
def fit_emp_prior(
    id,
    param_type,
    fast_fit=False,
    generate_emp_priors=True,
    zero_re=True,
    alt_prior=False,
    global_heterogeneity="Slightly",
):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for
    fast_fit : bool, if true take 101 MCMC draws without aiming for convergence
    generate_emp_priors : bool, if true predict region/sex/year empirical priors
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    alt_prior : passed through to covariate_model.predict_for
    global_heterogeneity : str, heterogeneity prior applied to every rate type

    Returns the DiseaseJson ``dm`` early (without fitting) if there is no data
    for the requested parameter type; otherwise stores fits on ``dm``, writes
    plots, and saves the model to disk.

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """
    # NOTE(review): parameter 'id' shadows the builtin id()
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data

    reload(data)

    dm = dismod3.load_disease_model(id)
    try:
        model = data.ModelData.load(dir)
        print "loaded data from new format from %s" % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # model.save(dir)
        print "loaded data from json, saved in new format for next time in %s" % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith("x_"):
            model.input_data[col] = model.input_data[col].fillna(0.0)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if "heterogeneity" in model.parameters[t]:
            model.parameters[t]["heterogeneity"] = global_heterogeneity

    # map the long parameter name to the single-letter rate-type code
    t = {
        "incidence": "i",
        "prevalence": "p",
        "remission": "r",
        "excess-mortality": "f",
        "prevalence_x_excess-mortality": "pf",
    }[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print "No data for type %s, exiting" % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    # model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    # missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    # model.input_data['effective_sample_size'][missing_ess] = 1.
    # model.input_data['z_overdisperse'] = 1.
    # print model.describe(t)
    # model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    # model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    # model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    # model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    # generate_emp_priors = False

    print "fitting", t
    # global fit: reference everything to the world ('all'/'total'/'all');
    # relative-risk uses the log-normal rate model, everything else neg-binom
    model.vars += ism.age_specific_rate(
        model,
        t,
        reference_area="all",
        reference_sex="total",
        reference_year="all",
        mu_age=None,
        mu_age_parent=None,
        sigma_age_parent=None,
        rate_type=(t == "rr") and "log_normal" or "neg_binom",
        zero_re=zero_re,
    )

    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(
            model, t, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True
        )

    # attach posterior-predictive summaries and residuals to the data table
    stats = dm.vars["p_pred"].stats(batches=5)
    dm.vars["data"]["mu_pred"] = stats["mean"]
    dm.vars["data"]["sigma_pred"] = stats["standard deviation"]

    stats = dm.vars["pi"].stats(batches=5)
    dm.vars["data"]["mc_error"] = stats["mc error"]

    dm.vars["data"]["residual"] = dm.vars["data"]["value"] - dm.vars["data"]["mu_pred"]
    dm.vars["data"]["abs_residual"] = pl.absolute(dm.vars["data"]["residual"])

    graphics.plot_fit(model, data_types=[t], ylab=["PY"], plot_config=(1, 1), fig_size=(8, 8))

    if generate_emp_priors:
        # predict region/sex/year empirical prior curves from the global fit
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print "generating empirical prior for %s" % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and "level_bounds" in model.parameters[t]:
                        lower = model.parameters[t]["level_bounds"]["lower"]
                        upper = model.parameters[t]["level_bounds"]["upper"]
                    else:
                        lower = 0
                        upper = pl.inf
                    emp_priors = covariate_model.predict_for(
                        model,
                        model.parameters[t],
                        "all",
                        "total",
                        "all",
                        a,
                        dismod3.utils.clean(s),
                        int(y),
                        alt_prior,
                        vars,
                        lower,
                        upper,
                    )
                    dm.set_mcmc("emp_prior_mean", key, emp_priors.mean(0))

                    if "eta" in vars:
                        # widen the prior std to account for negative-binomial
                        # overdispersion (eta is the log of the dispersion)
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose(
                            [pl.exp(vars["eta"].trace()) for _ in range(A)]
                        )  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors ** 2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc("emp_prior_std", key, emp_prior_std)

                    pl.plot(
                        model.parameters["ages"],
                        dm.get_mcmc("emp_prior_mean", key),
                        color="grey",
                        label=a,
                        zorder=-10,
                        alpha=0.5,
                    )
        pl.savefig(dir + "/prior-%s.png" % param_type)

    store_effect_coefficients(dm, vars, param_type)

    # graphics.plot_one_ppc(vars, t)
    # pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    graphics.plot_acorr(model)
    pl.savefig(dir + "/prior-%s-convergence.png" % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + "/prior-%s-trace.png" % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + "/prior-%s-effects.png" % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save("dm-%d-prior-%s.json" % (id, param_type))
    except IOError, e:
        print e
def fit_posterior(dm, region, sex, year, fast_fit=False,
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
    fast_fit : sample 101 draws from posterior, don't try for convergence (fast for testing)
    inconsistent_fit : fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Side effects: stores MAP/MCMC fits on ``dm``, and saves a posterior plot
    under the job working directory.

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')
    """
    # NOTE(review): params_to_fit has a mutable default list, shared across
    # calls; harmless only as long as no caller mutates it.
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        # prefer the on-disk "new format" copy of the model if present
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        # fall back to rebuilding the model from the DiseaseJson blob
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros
    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():
        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the empirical prior if both mean and std cover the full
        # 0..100 age range (101 single-year ages)
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors (from the model settings) take precedence over the
        # empirical-prior alphas, hence the update() after the assignment
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(dm.get_empirical_prior(param_type[t]).get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        # row-level filter: area must be in the prediction subtree (or 'all'),
        # the years must overlap the window for predict_year, and sex must match
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # legacy selection logic, kept to cross-check the new is_relevant() filter
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation for 1990"
    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            # relative-risk is fit on the log-normal scale; all other rates
            # use the negative-binomial rate model
            model.vars += ism.age_specific_rate(model, t,
                                                reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                                mu_age=None,
                                                mu_age_parent=emp_priors.get((t, 'mu')),
                                                sigma_age_parent=emp_priors.get((t, 'sigma')),
                                                rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                                                zero_re=zero_re)

            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                # NOTE(review): iter/burn/thin are not defined in this function;
                # this relies on module-level globals (iter would otherwise be
                # the builtin) -- confirm these are set before the slow path runs
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)
    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                     priors=emp_priors, zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            # NOTE(review): same iter/burn/thin module-global dependency as above
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower=model.parameters[t]['level_bounds']['lower']
                upper=model.parameters[t]['level_bounds']['upper']
            else:
                lower=0
                upper=pl.inf
            posteriors[t] = covariate_model.predict_for(model, model.parameters.get(t, {}),
                                                        predict_area, predict_sex, predict_year,
                                                        predict_area, predict_sex, predict_year,
                                                        True,  # population weighted averages
                                                        model.vars[t], lower, upper)
    try:
        # NOTE(review): 'vars' here is the Python builtin, not a local --
        # presumably this was meant to be model.vars; confirm against
        # graphics.plot_fit's expected signature
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png'%(predict_area, predict_sex, predict_year))
    except Exception, e:
        # best-effort plotting: a graphics failure must not abort the fit
        print 'Error generating output graphics'
        print e
def fit_world(id, fast_fit=False, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit consistent for all data in world

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    fast_fit : bool, if true take a short MCMC run without aiming for convergence
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    alt_prior : passed through to covariate_model.predict_for
    global_heterogeneity : str, heterogeneity prior applied to every rate type

    Side effects: stores MAP/MCMC fits and empirical priors on the loaded
    DiseaseJson, writes per-type CSV tables and plots under the job working
    directory, and saves the disease model to disk.

    Example
    -------
    >>> import fit_world
    >>> dm = fit_world.dismod3.load_disease_model(1234)
    >>> fit_world.fit_world(dm)
    """
    # NOTE(review): parameter 'id' shadows the builtin id(); the docstring
    # example passes a dm object while the code treats 'id' as an int -- the
    # example appears stale; confirm against callers.
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
        dm = dismod3.load_disease_model(id)
    except (IOError, AssertionError):
        dm = dismod3.load_disease_model(id)
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        try:
            model.save(dir)
            print 'loaded data from json, saved in new format for next time in %s' % dir
        except IOError:
            # saving the converted model is best-effort only
            print 'loaded data from json, failed to save in new format'

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    ### For testing:
    ## speed up computation by reducing number of knots
    ## for t in 'irf':
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 100]

    # consistent world-level model: all rate types linked, referenced to
    # 'all'/'total'/'all', with no empirical priors
    model.vars += dismod3.ism.consistent(model,
                                         reference_area='all', reference_sex='total', reference_year='all',
                                         priors={}, zero_re=zero_re)

    ## fit model to data
    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True)

    dm.model = model

    # borrow strength to inform sigma_alpha between rate types post-hoc
    types_with_re = ['rr', 'f', 'i', 'm', 'smr', 'p', 'r', 'pf', 'm_with', 'X']

    ## first calculate sigma_alpha_bar from posterior draws from each alpha
    # NOTE(review): 'type' here shadows the builtin of the same name
    alpha_vals = []
    for type in types_with_re:
        if 'alpha' in model.vars[type]:
            for alpha_i in model.vars[type]['alpha']:
                alpha_vals += [a for a in alpha_i.trace() if a != 0]  # remove zeros because areas with no siblings are included for convenience but are pinned to zero

    ## then blend sigma_alpha_i and sigma_alpha_bar for each sigma_alpha_i
    if len(alpha_vals) > 0:
        sigma_alpha_bar = pl.std(alpha_vals)
        for type in types_with_re:
            if 'sigma_alpha' in model.vars[type]:
                for sigma_alpha_i in model.vars[type]['sigma_alpha']:
                    # NOTE(review): this overwrites the stored PyMC trace in
                    # place via the private _trace attribute, replacing every
                    # draw with the blended constant value
                    cur_val = sigma_alpha_i.trace()
                    sigma_alpha_i.trace._trace[0] = (cur_val + sigma_alpha_bar) * pl.ones_like(sigma_alpha_i.trace._trace[0])

    # generate and store empirical priors for every region/sex/year and type
    for t in 'p i r f rr pf m_with'.split():
        param_type = dict(i='incidence', r='remission', f='excess-mortality', p='prevalence', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')[t]
        #graphics.plot_one_type(model, model.vars[t], {}, t)
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower=model.parameters[t]['level_bounds']['lower']
                        upper=model.parameters[t]['level_bounds']['upper']
                    else:
                        lower=0
                        upper=pl.inf
                    emp_priors = covariate_model.predict_for(model,
                                                             model.parameters.get(t, {}),
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior,
                                                             model.vars[t], lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))

                    if 'eta' in model.vars[t]:
                        # widen the prior std to account for negative-binomial
                        # overdispersion (eta is the log of the dispersion)
                        N,A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(model.vars[t]['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

        from fit_emp_prior import store_effect_coefficients
        store_effect_coefficients(dm, model.vars[t], param_type)

        if 'p_pred' in model.vars[t]:
            graphics.plot_one_ppc(model, t)
            pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

        if 'p_pred' in model.vars[t] or 'lb' in model.vars[t]:
            graphics.plot_one_effects(model, t)
            pl.savefig(dir + '/prior-%s-effects.png'%param_type)

    # dump per-type posterior-predictive tables to CSV
    for t in 'i r f p rr pf X m_with smr'.split():
        fname = dir + '/empirical_priors/data-%s.csv'%t
        print 'saving tables for', t, 'to', fname

        if 'data' in model.vars[t] and 'p_pred' in model.vars[t]:
            stats = model.vars[t]['p_pred'].stats(batches=5)
            model.vars[t]['data']['mu_pred'] = stats['mean']
            model.vars[t]['data']['sigma_pred'] = stats['standard deviation']

            stats = model.vars[t]['pi'].stats(batches=5)
            model.vars[t]['data']['mc_error'] = stats['mc error']

            model.vars[t]['data']['residual'] = model.vars[t]['data']['value'] - model.vars[t]['data']['mu_pred']
            model.vars[t]['data']['abs_residual'] = pl.absolute(model.vars[t]['data']['residual'])

            #if 'delta' in model.vars[t]:
            #    model.vars[t]['data']['logp'] = [mc.negative_binomial_like(n*p_obs, n*p_pred, n*p_pred*d) for n, p_obs, p_pred, d \
            #                                         in zip(model.vars[t]['data']['effective_sample_size'],
            #                                                model.vars[t]['data']['value'],
            #                                                model.vars[t]['data']['mu_pred'],
            #                                                pl.atleast_1d(model.vars[t]['delta'].stats()['mean']))]
            model.vars[t]['data'].to_csv(fname)

    graphics.plot_fit(model)
    pl.savefig(dir + '/prior.png')

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-convergence.png')

    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-trace.png')

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (dm.id, 'all'))
    except IOError, e:
        print e
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], true=dict(i=quadratic, f=constant, r=constant)): types = pl.array(['i', 'r', 'f', 'p']) ## generate simulated data model = data_simulation.simple_model(N) model.input_data['effective_sample_size'] = 1. model.input_data['value'] = 0. # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) for t in 'irf': for i, k_i in enumerate(sim[t]['knots']): sim[t]['gamma'][i].value = pl.log(true[t](k_i)) age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)] a = pl.arange(101) age_weights = pl.ones_like(a) sum_wt = pl.cumsum(age_weights) p = pl.zeros(N) for t in types: mu_t = sim[t]['mu_age'].value sum_mu_wt = pl.cumsum(mu_t*age_weights) p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p_t[i] = mu_t[age_start[i]] # copy part into p p[data_type==t] = p_t[data_type==t] # add covariate shifts import dismod3 import simplejson as json gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json())) model.hierarchy = gbd_model.hierarchy from validate_covariates import alpha_true_sim area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR']) alpha = {} for t in types: alpha[t] = alpha_true_sim(model, area_list, sigma_true) print json.dumps(alpha, indent=2) model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)] for i, a in model.input_data['area'].iteritems(): t = data_type[i] p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in 
nx.shortest_path(model.hierarchy, 'all', a) if n in alpha])) n = mc.runiform(100, 10000, size=N) model.input_data['data_type'] = data_type model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = n model.input_data['true'] = p model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100) model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_convergence_diag(model.vars) graphics.plot_fit(model, model.vars, {}, {}) for i, t in enumerate('i r f p rr pf'.split()): pl.subplot(2, 3, i+1) pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2) pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1) pl.show() model.input_data['mu_pred'] = 0. model.input_data['sigma_pred'] = 0. 
for t in types: model.input_data['mu_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['mean'] model.input_data['sigma_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr'])) model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr'] model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr'] data_simulation.add_quality_metrics(model.delta) model.alpha = pandas.DataFrame() model.sigma = pandas.DataFrame() for t in types: alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)]) alpha_t['true'] = pandas.Series(dict(alpha[t])) alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns) alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns) alpha_t['type'] = t model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True) sigma_t = pandas.DataFrame(dict(true=sigma_true)) sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']] sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']] model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True) data_simulation.add_quality_metrics(model.alpha) data_simulation.add_quality_metrics(model.sigma) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame() for t in types: model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value, mu_pred=model.vars[t]['mu_age'].stats()['mean'], 
sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])), ignore_index=True) data_simulation.add_quality_metrics(model.mu) print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(), pl.median(pl.absolute(model.mu['rel_err'].dropna())), model.mu['covered?'].mean()) print data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.add_to_results(model, 'alpha') data_simulation.add_to_results(model, 'sigma') data_simulation.finalize_results(model) print model.results return model
def validate_consistent_model_sim(N=500, delta_true=.5, true=dict(i=quadratic, f=constant, r=constant)): types = pl.array(['i', 'r', 'f', 'p']) ## generate simulated data model = data_simulation.simple_model(N) model.input_data['effective_sample_size'] = 1. model.input_data['value'] = 0. for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) for t in 'irf': for i, k_i in enumerate(sim[t]['knots']): sim[t]['gamma'][i].value = pl.log(true[t](k_i)) age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)] a = pl.arange(101) age_weights = pl.ones_like(a) sum_wt = pl.cumsum(age_weights) p = pl.zeros(N) for t in types: mu_t = sim[t]['mu_age'].value sum_mu_wt = pl.cumsum(mu_t * age_weights) p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p_t[i] = mu_t[age_start[i]] # copy part into p p[data_type == t] = p_t[data_type == t] n = mc.runiform(100, 10000, size=N) model.input_data['data_type'] = data_type model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = n model.input_data['true'] = p model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_convergence_diag(model.vars) graphics.plot_fit(model, 
model.vars, {}, {}) for i, t in enumerate('i r f p rr pf'.split()): pl.subplot(2, 3, i + 1) pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2) pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1) #graphics.plot_one_type(model, model.vars['p'], {}, 'p') #pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = 0. model.input_data['sigma_pred'] = 0. for t in types: model.input_data['mu_pred'][ data_type == t] = model.vars[t]['p_pred'].stats()['mean'] model.input_data['sigma_pred'][data_type == t] = model.vars['p'][ 'p_pred'].stats()['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame( dict(true=[delta_true for t in types if t != 'rr'])) model.delta['mu_pred'] = [ pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr' ] model.delta['sigma_pred'] = [ pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr' ] data_simulation.add_quality_metrics(model.delta) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame() for t in types: model.mu = model.mu.append(pandas.DataFrame( dict(true=sim[t]['mu_age'].value, mu_pred=model.vars[t]['mu_age'].stats()['mean'], sigma_pred=model.vars[t]['mu_age'].stats() ['standard deviation'])), ignore_index=True) data_simulation.add_quality_metrics(model.mu) print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.mu['abs_err'].mean(), pl.median(pl.absolute( model.mu['rel_err'].dropna())), model.mu['covered?'].mean()) print data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.finalize_results(model) print 
model.results return model