def test_consistent_model_sim():
    """Smoke test: build a consistent model on simulated data and draw one MCMC sample.

    Returns
    -------
    The variables returned by ``ism.consistent`` for the simulated data set.
    """
    model = data.ModelData()

    # generate simulated data of type 'p' from a known age pattern
    n_rows = 50
    noise_sd = .025
    ages = pl.arange(0, 100, 1)
    true_pi_age = .0001 * (ages * (100. - ages) + 100.)
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, true_pi_age, noise_sd)

    # relabel the final row so that the data set holds more than one data type
    final_row = model.input_data.index[-1]
    model.input_data.ix[final_row, 'data_type'] = 'r'

    # create model and priors
    model_vars = ism.consistent(model, 'all', 'total', 'all', {})

    # fit model (a single sample is enough to exercise the machinery)
    sampler = mc.MCMC(model_vars)
    sampler.sample(1)

    return model_vars
def test_consistent_model_forward():
    """Print the 'p' age pattern implied by several fixed levels of 'i', 'r' and 'f'.

    For each scenario every ``gamma`` node of the 'i', 'r' and 'f' variables is
    set to the log of a constant level, and every tenth value of
    ``vars['p']['mu_age']`` is printed, rounded to three decimals.
    """
    model = data.ModelData()
    model_vars = ism.consistent(model, 'all', 'total', 'all', {})

    def assign_level(rate_vars, level):
        # gamma nodes hold log-scale values, so store the log of the level
        for node in rate_vars['gamma']:
            node.value = pl.log(level)

    # (i, r, f) levels, applied in that order for each scenario
    scenarios = [
        (.01, .0001, .0001),
        (.02, .0001, .0001),
        (2., 20., .0001),
    ]
    for levels in scenarios:
        for rate, level in zip('irf', levels):
            assign_level(model_vars[rate], level)
        print(model_vars['p']['mu_age'].value[::10].round(3))
def fit_posterior(dm, region, sex, year, fast_fit=False,
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years

    fast_fit : bool, if true run a short chain without trying for convergence
      (101 iterations per inconsistent fit, 105 for the consistent fit; fast for testing)
    inconsistent_fit : bool, if true fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently
      (only iterated over, never modified)
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Side effects
    ------------
    Updates ``model.parameters`` with empirical priors, sets ``dm.map`` and
    ``dm.mcmc`` when fitting consistently, and tries to save a plot of the fit
    under the job working directory (plotting errors are printed, not raised).

    Raises
    ------
    AssertionError
      If the region is not in the area hierarchy, or if ``year`` is not one
      of 1990, 2005, 2010 (raised while filtering the input rows).

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(dm, 'asia_east', 'male', '2005')  # dm: a loaded DiseaseJson
    """
    # NOTE(review): a second definition of fit_posterior appears later in this
    # file and replaces this one at import time -- confirm which should remain.
    job_dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(job_dir)
        print('loaded data from new format from %s' % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE: saving is currently disabled, although the message below still mentions it
        #model.save(job_dir)
        print('loaded data from json, saved in new format for next time in %s' % job_dir)

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros
    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality',
                      rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():
        # to not use the empirical prior for a rate with zero data, skip t here when
        # pl.all(model.input_data['data_type'] != t)

        # alternative: key on the parent area, model.hierarchy.predecessors(predict_area)[0]
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)  # alternative: 'emp_prior_median'
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the empirical prior when both curves have the expected 101 values
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        empirical_prior = dm.get_empirical_prior(param_type[t])

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors take precedence over the empirical 'new_alpha' values
        expert_priors = model.parameters[t].get('random_effects', {})
        random_effects = empirical_prior.get('new_alpha', {})
        random_effects.update(expert_priors)
        model.parameters[t]['random_effects'] = random_effects

        # shift random effects to make REs for observed children of predict area have mean zero
        observed_areas = [area for area in model.hierarchy.neighbors(predict_area)
                          if area in random_effects]
        if observed_areas:  # the mean of an empty list would be NaN (with a warning)
            re_mean = pl.mean([random_effects[area]['mu'] for area in observed_areas])
            for area in observed_areas:
                random_effects[area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # empirical 'new_beta' values overwrite existing entries; setdefault avoids a
        # KeyError when no fixed effects have been specified for this rate type
        model.parameters[t].setdefault('fixed_effects', {}).update(
            empirical_prior.get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        """Return True if input row r should be used for this area/sex/year."""
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # previous selection rule, kept to cross-check the new one where they should agree
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems()
                         if (r['area'] in subtree or r['area'] == 'all')
                         and ((predict_year >= 1997 and r['year_end'] >= 1997)
                              or (predict_year <= 1997 and r['year_start'] <= 1997))
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), \
            "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), \
            "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    # NOTE(review): `iter`, `burn` and `thin` are not defined inside this function;
    # they presumably come from module scope (otherwise `iter` is the builtin and
    # `burn`/`thin` raise NameError on the non-fast path) -- confirm.
    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(
                model, t,
                reference_area=predict_area, reference_sex=predict_sex,
                reference_year=predict_year,
                mu_age=None,
                mu_age_parent=emp_priors.get((t, 'mu')),
                sigma_age_parent=emp_priors.get((t, 'sigma')),
                rate_type='log_normal' if t == 'rr' else 'neg_binom',
                zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)
    else:
        model.vars += ism.consistent(
            model,
            reference_area=predict_area, reference_sex=predict_sex,
            reference_year=predict_year,
            priors=emp_priors,
            zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(
                model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf

            posteriors[t] = covariate_model.predict_for(
                model, model.parameters.get(t, {}),
                predict_area, predict_sex, predict_year,
                predict_area, predict_sex, predict_year,
                True,  # population weighted averages
                model.vars[t], lower, upper)

    try:
        # pass the fitted variables; the bare name `vars` is the builtin function here
        graphics.plot_fit(model, model.vars, emp_priors, {})
        pl.savefig(job_dir + '/image/posterior-%s+%s+%s.png'
                   % (predict_area, predict_sex, predict_year))
    except Exception as e:
        # plotting is best-effort: report the problem, but do not fail the fit
        print('Error generating output graphics')
        print(e)
def fit_posterior(dm, region, sex, year, fast_fit=False,
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years

    fast_fit : bool, if true run a short chain without trying for convergence
      (101 iterations per inconsistent fit, 105 for the consistent fit; fast for testing)
    inconsistent_fit : bool, if true fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently
      (only iterated over, never modified)
    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Side effects
    ------------
    Updates ``model.parameters`` with empirical priors, sets ``dm.map`` and
    ``dm.mcmc`` when fitting consistently, and tries to save a plot of the fit
    under the job working directory (plotting errors are printed, not raised).

    Raises
    ------
    AssertionError
      If the region is not in the area hierarchy, or if ``year`` is not one
      of 1990, 2005, 2010 (raised while filtering the input rows).

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(dm, 'asia_east', 'male', '2005')  # dm: a loaded DiseaseJson
    """
    # NOTE(review): an earlier definition of fit_posterior appears in this file;
    # this later one is the definition in effect after import -- confirm the
    # duplicate is intentional.
    job_dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(job_dir)
        print('loaded data from new format from %s' % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE: saving is currently disabled, although the message below still mentions it
        #model.save(job_dir)
        print('loaded data from json, saved in new format for next time in %s' % job_dir)

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros
    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality',
                      rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():
        # to not use the empirical prior for a rate with zero data, skip t here when
        # pl.all(model.input_data['data_type'] != t)

        # alternative: key on the parent area, model.hierarchy.predecessors(predict_area)[0]
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)  # alternative: 'emp_prior_median'
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the empirical prior when both curves have the expected 101 values
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        empirical_prior = dm.get_empirical_prior(param_type[t])

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors take precedence over the empirical 'new_alpha' values
        expert_priors = model.parameters[t].get('random_effects', {})
        random_effects = empirical_prior.get('new_alpha', {})
        random_effects.update(expert_priors)
        model.parameters[t]['random_effects'] = random_effects

        # shift random effects to make REs for observed children of predict area have mean zero
        observed_areas = [area for area in model.hierarchy.neighbors(predict_area)
                          if area in random_effects]
        if observed_areas:  # the mean of an empty list would be NaN (with a warning)
            re_mean = pl.mean([random_effects[area]['mu'] for area in observed_areas])
            for area in observed_areas:
                random_effects[area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # empirical 'new_beta' values overwrite existing entries; setdefault avoids a
        # KeyError when no fixed effects have been specified for this rate type
        model.parameters[t].setdefault('fixed_effects', {}).update(
            empirical_prior.get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        """Return True if input row r should be used for this area/sex/year."""
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # previous selection rule, kept to cross-check the new one where they should agree
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems()
                         if (r['area'] in subtree or r['area'] == 'all')
                         and ((predict_year >= 1997 and r['year_end'] >= 1997)
                              or (predict_year <= 1997 and r['year_start'] <= 1997))
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), \
            "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), \
            "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    # NOTE(review): `iter`, `burn` and `thin` are not defined inside this function;
    # they presumably come from module scope (otherwise `iter` is the builtin and
    # `burn`/`thin` raise NameError on the non-fast path) -- confirm.
    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(
                model, t,
                reference_area=predict_area, reference_sex=predict_sex,
                reference_year=predict_year,
                mu_age=None,
                mu_age_parent=emp_priors.get((t, 'mu')),
                sigma_age_parent=emp_priors.get((t, 'sigma')),
                rate_type='log_normal' if t == 'rr' else 'neg_binom',
                zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)
    else:
        model.vars += ism.consistent(
            model,
            reference_area=predict_area, reference_sex=predict_sex,
            reference_year=predict_year,
            priors=emp_priors,
            zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(
                model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf

            posteriors[t] = covariate_model.predict_for(
                model, model.parameters.get(t, {}),
                predict_area, predict_sex, predict_year,
                predict_area, predict_sex, predict_year,
                True,  # population weighted averages
                model.vars[t], lower, upper)

    try:
        # pass the fitted variables; the bare name `vars` is the builtin function here
        graphics.plot_fit(model, model.vars, emp_priors, {})
        pl.savefig(job_dir + '/image/posterior-%s+%s+%s.png'
                   % (predict_area, predict_sex, predict_year))
    except Exception as e:
        # plotting is best-effort: report the problem, but do not fail the fit
        print('Error generating output graphics')
        print(e)