def write_data(data_list, wb):
    """ Write data as a table that can be loaded into dismod

    Parameters
    ----------
    data_list : list of dicts
      one dict per data row; values are looked up by the cleaned
      column name
    wb : workbook
      must provide ``add_sheet``; a sheet called 'data' is added even
      when ``data_list`` is empty
    """
    sheet = wb.add_sheet('data')

    if not len(data_list):
        return

    # every key that appears in at least one row
    seen_keys = set()
    for row in data_list:
        seen_keys.update(row.keys())

    required_keys = ['GBD Cause', 'Parameter', 'GBD Region',
                     'Country ISO3 Code', 'Sex', 'Year Start', 'Year End',
                     'Age Start', 'Age End', 'Parameter Value',
                     'Standard Error', 'Units', ]
    redundant_keys = ['_row', 'age_weights', 'id', 'value', 'condition',
                      'data_type', 'region']

    # extra columns are whatever is left after removing the required
    # columns (in cleaned form) and the redundant bookkeeping keys
    excluded = set([clean(name) for name in required_keys])
    excluded.update(redundant_keys)
    additional_keys = sorted(seen_keys - excluded)

    keys = required_keys + additional_keys

    # header row; the 'GBD Region' column is labelled 'Region'
    for col, name in enumerate(keys):
        if name == 'GBD Region':
            sheet.write(0, col, 'Region')
        else:
            sheet.write(0, col, name)

    # data rows, ordered by their '_row' value
    ordered_rows = sorted(data_list, key=lambda row: row.get('_row'))
    for row_num, row in enumerate(ordered_rows):
        for col, name in enumerate(keys):
            val = row.get(clean(name), '')
            if val == 'mortality data':
                val = 'with condition mortality data'
            sheet.write(row_num + 1, col, val)
def covariates(d):
    """ extract the covariates from a data point as a vector;

    Xa represents region-level covariates (R = len(gbd_regions)):
      Xa[0],...,Xa[R-1] = region indicators
      Xa[R] = .1 * (midpoint of year_start and year_end, minus 1997)
      Xa[R+1] = .5 if sex == 'male', -.5 if sex == 'female', 0 otherwise
    Xb represents study-level covariates (length 5, unused entries 0):
      Xb[0] = 1 if self-reported
      Xb[1] = threshold (stored as a float)

    Returns
    -------
    Xa, Xb : pair of numpy arrays
    """
    n_regions = len(gbd_regions)
    Xa = np.zeros(n_regions + 2)

    for ii, r in enumerate(gbd_regions):
        if clean(d['gbd_region']) == clean(r):
            Xa[ii] = 1.

    # index the year and sex columns from the region count instead of the
    # leaked loop variable, so an empty region list does not raise
    year_mid = .5 * (float(d['year_start']) + float(d['year_end']))
    Xa[n_regions] = .1 * (year_mid - 1997)

    sex = clean(d['sex'])
    if sex == 'male':
        Xa[n_regions + 1] = .5
    elif sex == 'female':
        Xa[n_regions + 1] = -.5
    else:
        Xa[n_regions + 1] = 0.

    # TODO: instead of hard-coding this, store it in the disease model
    # (and let users set it through the web)
    Xb = np.zeros(5)  # the shape must be an int, not the float 5.
    if clean(d.get('self_reported', '')) == 'true':
        Xb[0] = 1.
    if 'threshold' in d:
        # the threshold has its own slot; writing it to Xb[0] would
        # overwrite the self-reported indicator
        Xb[1] = float(d['threshold'])

    return Xa, Xb
def regional_covariates(key, covariates_dict, derived_covariate):
    """ form the covariates for a gbd key

    Results are memoized in ``covariate_hash``, keyed by ``key``.  A key
    that cannot be parsed (KeyError) is treated as world / 1997 / total.
    Only covariates whose 'rate' value is truthy are included;
    country-level ones take the regional average of the derived
    covariate, study-level ones take their configured value as a float.
    """
    if key in covariate_hash:
        return covariate_hash[key]

    try:
        t, r, y, s = dismod3.utils.type_region_year_sex_from_key(key)
    except KeyError:
        r, y, s = 'world', 1997, 'total'

    d = {'gbd_region': r,
         'year_start': y,
         'year_end': y,
         'sex': s}

    for level in ['Study_level', 'Country_level']:
        level_covs = covariates_dict[level]
        for k in level_covs:
            if k == 'none':
                continue
            if not level_covs[k]['rate']['value']:
                continue

            name = clean(k)
            configured = level_covs[k]['value']['value']
            if level == 'Country_level':
                d[name] = regional_average(derived_covariate, k, r, y, s)
            else:
                d[name] = float(configured or 0.)

    covariate_hash[key] = covariates(d, covariates_dict)
    return covariate_hash[key]
def covariates(d, covariates_dict):
    """ extract the covariates from a data point as a vector;

    Xa represents region-level covariates:
      one indicator per entry of gbd_regions, followed by
      .1*(year-1997), where year is the midpoint of year_start/year_end,
      and then .5 if sex == 'male', -.5 if sex == 'female', 0 otherwise
    Xb represents study-level covariates, according to the covariates_dict:
      one float per covariate (sorted by name within each level) whose
      'rate' value is 1 and whose 'types' value contains the data
      point's standardized parameter type; [0.] if there are none
    """
    Xa = np.zeros(len(gbd_regions) + 2)

    for ii, r in enumerate(gbd_regions):
        if clean(d['gbd_region']) == clean(r):
            Xa[ii] = 1.

    # the year and sex columns follow the last region indicator
    year_col = ii + 1
    sex_col = ii + 2

    midpoint = .5 * (float(d['year_start']) + float(d['year_end']))
    Xa[year_col] = .1 * (midpoint - 1997)

    sex = clean(d['sex'])
    if sex == 'male':
        Xa[sex_col] = .5
    elif sex == 'female':
        Xa[sex_col] = -.5
    else:
        Xa[sex_col] = 0.

    Xb = []
    for level in ['Study_level', 'Country_level']:
        level_covs = covariates_dict[level]
        for k in sorted(level_covs):
            if level_covs[k]['rate']['value'] != 1:
                continue
            # drop the last five characters of the standardized type name
            # before looking it up in the covariate's list of types
            param_type = standardize_data_type[d['parameter']][:-5]
            if param_type in level_covs[k]['types']['value']:
                Xb.append(float(d.get(clean(k)) or 0.))

    if not Xb:
        Xb = [0.]
    return Xa, Xb
def get_global_priors(self, type):
    """ Return the global prior string for the specified type

    The first time this is called, the 'global_priors' entry of
    ``self.params`` is re-keyed from prior-kind -> rate-type into
    rate-type -> prior-kind and cached on ``self.global_priors``,
    together with a 'prior_str' for every rate type.

    The lookup compares ``clean(type)`` with the cleaned name of each
    rate type and returns the 'prior_str' of the first one that is
    equal; if none is equal, the empty string is returned.
    """
    if not hasattr(self, "global_priors"):
        raw_dict = self.params.get("global_priors", {})

        self.global_priors = {}
        for rate_type in ["prevalence", "incidence", "remission",
                          "excess_mortality", "relative_risk", "duration"]:
            self.global_priors[rate_type] = {}

        # reverse the order of the first and second level of keys in the
        # raw_dict; this will be more convenient later
        prior_kinds = ["heterogeneity", "smoothness", "level_value",
                       "level_bounds", "increasing", "decreasing",
                       "unimodal"]
        for k1 in prior_kinds:
            if k1 in raw_dict:
                for k2 in raw_dict[k1]:
                    self.global_priors[k2][k1] = raw_dict[k1][k2]

        # deal with the dash vs underscore: both spellings share one dict
        for dashed, underscored in [("excess-mortality", "excess_mortality"),
                                    ("relative-risk", "relative_risk")]:
            self.global_priors[dashed] = self.global_priors[underscored]

        for k in self.global_priors:
            priors = self.global_priors[k]
            priors["prior_str"] = prior_dict_to_str(priors)

    for k in self.global_priors:
        if clean(k) == clean(type):
            return self.global_priors[k]["prior_str"]

    return ""
def regional_population(key):
    """ calculate regional population for a gbd key

    Sums the per-age population vectors of every country in the key's
    region, for the key's year and sex.
    """
    _type, region, year, sex = type_region_year_sex_from_key(key)

    total = np.zeros(MAX_AGE)
    for iso3 in countries_for[clean(region)]:
        total += population_by_age[(iso3, year, sex)]

    return total
def regional_population(key):
    """ calculate regional population for a gbd key

    Sums the per-age population vectors of every country in the key's
    region.  When both year and sex are 'all', every combination of
    GBD year and GBD sex is added; otherwise only the key's own
    year and sex are used.
    """
    _type, region, year, sex = dismod3.utils.type_region_year_sex_from_key(key)

    if year == 'all' and sex == 'all':
        strata = [(yy, dismod3.utils.clean(ss))
                  for yy in dismod3.settings.gbd_years
                  for ss in dismod3.settings.gbd_sexes]
    else:
        strata = [(year, sex)]

    total = pl.zeros(dismod3.settings.MAX_AGE)
    for iso3 in countries_for[clean(region)]:
        for yy, ss in strata:
            total += population_by_age[(iso3, yy, ss)]

    return total
def country_covariates(key, iso3, covariates_dict, derived_covariate):
    """ form the covariates for a gbd key

    Parameters
    ----------
    key : str, gbd key that encodes type, region, year and sex
    iso3 : str, country code used to look up derived covariates
    covariates_dict : dict with 'Study_level' and 'Country_level' entries
    derived_covariate : dict, covariate name -> {'iso3+year+sex': value}

    Results are memoized in ``covariate_hash`` under ``(key, iso3)``.
    Only covariates whose 'rate' value is truthy are included.  A
    country-level covariate missing from ``derived_covariate``, or with
    no entry for this country/year/sex, is logged and set to 0.
    """
    if (key, iso3) not in covariate_hash:
        t, r, y, s = dismod3.utils.type_region_year_sex_from_key(key)

        d = {'gbd_region': r,
             'year_start': y,
             'year_end': y,
             'sex': s}
        for level in ['Study_level', 'Country_level']:
            for k in covariates_dict[level]:
                if k == 'none':
                    continue
                if not covariates_dict[level][k]['rate']['value']:
                    continue

                d[clean(k)] = covariates_dict[level][k]['value']['value']
                if level == 'Country_level':
                    cov_key = '%s+%s+%s' % (iso3, y, s)
                    if k not in derived_covariate:
                        # report the covariate name, not the gbd key
                        debug('WARNING: derived covariate %s not found' % k)
                        d[clean(k)] = 0.
                    elif cov_key not in derived_covariate[k]:
                        debug('WARNING: derived covariate %s not found for (%s, %s, %s)' % (k, iso3, y, s))
                        d[clean(k)] = 0.
                    else:
                        d[clean(k)] = derived_covariate[k][cov_key]
                else:
                    d[clean(k)] = float(d[clean(k)] or 0.)

        covariate_hash[(key, iso3)] = covariates(d, covariates_dict)
    return covariate_hash[(key, iso3)]
def country_covariates(key, iso3, covariates_dict):
    """ form the covariates for a gbd key

    Results are memoized in ``covariate_hash`` under ``(key, iso3)``.
    A covariate configured as 'Country Specific Value' takes this
    country's entry from the covariate's 'defaults' (0 if absent);
    any other configured value is converted to a float.
    """
    cache_key = (key, iso3)
    if cache_key in covariate_hash:
        return covariate_hash[cache_key]

    t, r, y, s = type_region_year_sex_from_key(key)
    d = {'parameter': t,
         'gbd_region': r,
         'year_start': y,
         'year_end': y,
         'sex': s}

    for level in ['Study_level', 'Country_level']:
        level_covs = covariates_dict[level]
        for k in level_covs:
            if k == 'none':
                continue

            name = clean(k)
            configured = level_covs[k]['value']['value']
            if configured == 'Country Specific Value':
                d[name] = level_covs[k]['defaults'].get(iso3, 0.)
            else:
                d[name] = float(configured or 0.)

    covariate_hash[cache_key] = covariates(d, covariates_dict)
    return covariate_hash[cache_key]
def regional_covariates(key, covariates_dict, derived_covariate):
    """ form the covariates for a gbd key

    Results are memoized in ``covariate_hash``, keyed by ``key``.
    Only covariates whose 'rate' value is truthy are included.  A
    covariate configured as 'Country Specific Value' takes the regional
    average of the derived covariate; any other configured value is
    converted to a float.
    """
    if key in covariate_hash:
        return covariate_hash[key]

    t, r, y, s = type_region_year_sex_from_key(key)
    d = {'gbd_region': r,
         'year_start': y,
         'year_end': y,
         'sex': s}

    for level in ['Study_level', 'Country_level']:
        level_covs = covariates_dict[level]
        for k in level_covs:
            if k == 'none':
                continue
            if not level_covs[k]['rate']['value']:
                continue

            name = clean(k)
            configured = level_covs[k]['value']['value']
            if configured == 'Country Specific Value':
                d[name] = regional_average(derived_covariate, k, r, y, s)
            else:
                d[name] = float(configured or 0.)

    covariate_hash[key] = covariates(d, covariates_dict)
    return covariate_hash[key]
def covariates(d, covariates_dict):
    """ extract the covariates from a data point as a vector;

    Xa represents region-level covariates:
      one indicator per entry of dismod3.gbd_regions, followed by
      .1*(year-1997), where year is the midpoint of year_start/year_end
      (0 when year_start is 'all'), and then
      .5 if sex == 'male', -.5 if sex == 'female', 0 otherwise
    Xb represents study-level covariates, according to the covariates_dict:
      one float per covariate (sorted by name within each level) whose
      'rate' value is 1; [0.] if there are none
    """
    regions = dismod3.gbd_regions
    Xa = pl.zeros(len(regions) + 2)

    for ii, r in enumerate(regions):
        if clean(d['gbd_region']) == clean(r):
            Xa[ii] = 1.

    # the year and sex columns follow the last region indicator
    year_col = ii + 1
    sex_col = ii + 2

    if d['year_start'] == 'all':
        Xa[year_col] = 0.
    else:
        midpoint = .5 * (float(d['year_start']) + float(d['year_end']))
        Xa[year_col] = .1 * (midpoint - 1997)

    sex = clean(d['sex'])
    if sex == 'male':
        Xa[sex_col] = .5
    elif sex == 'female':
        Xa[sex_col] = -.5
    else:
        Xa[sex_col] = 0.

    Xb = [float(d.get(clean(k)) or 0.)
          for level in ['Study_level', 'Country_level']
          for k in sorted(covariates_dict[level])
          if covariates_dict[level][k]['rate']['value'] == 1]

    if not Xb:
        Xb = [0.]
    return Xa, Xb
def regional_covariates(key, covariates_dict):
    """ form the covariates for a gbd key

    Parameters
    ----------
    key : str, gbd key that encodes type, region, year and sex
    covariates_dict : dict with 'Study_level' and 'Country_level' entries

    Results are memoized in ``covariate_hash``, keyed by ``key``.
    A covariate configured as 'Country Specific Value' takes the regional
    average of the covariate's 'defaults'; any other configured value is
    converted to a float (0 when empty).
    """
    if key not in covariate_hash:
        t, r, y, s = type_region_year_sex_from_key(key)

        d = {'parameter': t,
             'gbd_region': r,
             'year_start': y,
             'year_end': y,
             'sex': s}
        for level in ['Study_level', 'Country_level']:
            for k in covariates_dict[level]:
                if k == 'none':
                    continue

                name = clean(k)
                d[name] = covariates_dict[level][k]['value']['value']
                if d[name] == 'Country Specific Value':
                    # FIXME: this could be returning bogus answers
                    d[name] = regional_average(covariates_dict[level][k]['defaults'], r)
                else:
                    # assign the converted value; this used to be written
                    # with `==`, which compared and discarded the result
                    d[name] = float(d[name] or 0.)

        covariate_hash[key] = covariates(d, covariates_dict)
    return covariate_hash[key]
def relevant_to(self, d, t, r, y, s):
    """ Determine if data is relevant to specified type, region, year, and sex

    Parameters
    ----------
    d : data hash
    t : str, a data type name or 'all'; ' data' is appended to it before
      it is compared with the data's 'data_type'
    r : str, a GBD region name, or 'all' / 'world' to accept any region
    y : int, one of 1990, 1997, 2005 or 'all'
    s : sex, one of 'male', 'female' or 'all'

    Raises
    ------
    KeyError if y is not 'all' and not one of 1990, 1997, 2005
    """
    from dismod3.utils import clean

    # ignore data if requested
    if d.get('ignore') == 1:
        return False

    # check if data is of the correct type
    if t != 'all' and clean(d['data_type']) != clean(t + ' data'):
        return False

    # check if data is from correct region (data marked 'all' always fits)
    if r not in ('all', 'world'):
        data_region = clean(d['gbd_region'])
        if data_region != clean(r) and data_region != 'all':
            return False

    # check if data is from relevant year
    if y != 'all':
        y = int(y)
        if y not in [1990, 1997, 2005]:
            raise KeyError('GBD Year must be 1990 or 2005 (or 1997 for all years)')
        too_old = (y == 2005 and d['year_end'] < 1997)
        too_new = (y == 1990 and d['year_start'] > 1997)
        if too_old or too_new:
            return False

    # check if data is for relevant sex (data marked 'all' always fits)
    if s != 'all':
        data_sex = clean(d['sex'])
        if data_sex != clean(s) and data_sex != 'all':
            return False

    # if code makes it this far, the data is relevant
    return True
def relevant_to(d, t, r, y, s):
    """ Determine if data is relevant to specified type, region, year, and sex

    Parameters
    ----------
    d : data hash
    t : str, a data type or 'all'; the data matches when its cleaned
      'data_type' starts with the cleaned value of t
    r : str, a GBD region name, or 'all' / 'world' to accept any region
    y : int, one of 1990, 1997, 2005 or 'all'
    s : sex, one of 'male', 'female' or 'all'

    Raises
    ------
    KeyError if y is not 'all' and not one of 1990, 1997, 2005
    """
    # ignore data if requested
    if d.get('ignore') == 1:
        return False

    # check if data is of the correct type
    if t != 'all' and clean(d['data_type']).find(clean(t)) != 0:
        return False

    # check if data is from correct region (data marked 'all' always fits)
    if r not in ('all', 'world'):
        data_region = clean(d['gbd_region'])
        if data_region != clean(r) and data_region != 'all':
            return False

    # check if data is from relevant year
    if y != 'all':
        y = int(y)
        if y not in [1990, 1997, 2005]:
            raise KeyError('GBD Year must be 1990 or 2005 (or 1997 for all years)')
        too_old = (y == 2005 and d['year_end'] < 1997)
        too_new = (y == 1990 and d['year_start'] > 1997)
        if too_old or too_new:
            return False

    # check if data is for relevant sex (data marked 'all' always fits)
    if s != 'all':
        data_sex = clean(d['sex'])
        if data_sex != clean(s) and data_sex != 'all':
            return False

    # if code makes it this far, the data is relevant
    return True
def daemon_loop():
    """ Poll the job queue forever and launch the fits each job asks for

    For every queued job the disease model is loaded and, depending on
    the 'estimate_type' in its run status, either one posterior fit per
    region/sex/year or one empirical-prior fit per rate type is started
    through ``subprocess.call``.  Output goes to stdout/stderr
    directories under the job's working directory.  The loop sleeps
    ``dismod3.settings.SLEEP_SECS`` between polls and never returns.
    """
    on_sge = dismod3.settings.ON_SGE
    while True:
        try:
            job_queue = dismod3.get_job_queue()
        except Exception:
            # best effort: a failed poll is treated as an empty queue.
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # still stop the daemon.
            job_queue = []

        for param_id in job_queue:
            log('processing job %d' % param_id)
            job_params = dismod3.remove_from_job_queue(param_id)
            dm_id = int(job_params['dm_id'])
            dm = dismod3.get_disease_model(dm_id)

            # make a working directory for the id
            job_dir = dismod3.settings.JOB_WORKING_DIR % dm_id
            if not os.path.exists(job_dir):
                os.makedirs(job_dir)

            run_status = dm.params.get('run_status', {})
            estimate_type = run_status.get('estimate_type', 'fit all individually')

            if estimate_type.find('posterior') != -1:
                # fit each region/year/sex individually for this model
                regions_to_fit = run_status.get('regions_to_fit', [])
                # an empty list (the default) means nothing to fit; do not
                # index into it
                if regions_to_fit and regions_to_fit[0] == 'all_regions':
                    regions_to_fit = dismod3.gbd_regions

                d = '%s/posterior' % job_dir
                if os.path.exists(d):
                    rmtree(d)
                os.mkdir(d)
                os.mkdir('%s/stdout' % d)
                os.mkdir('%s/stderr' % d)

                dismod3.init_job_log(dm_id, 'posterior', param_id)
                for r in regions_to_fit:
                    for s in dismod3.gbd_sexes:
                        for y in dismod3.gbd_years:
                            k = '%s+%s+%s' % (clean(r), s, y)
                            o = '%s/stdout/%s' % (d, k)
                            e = '%s/stderr/%s' % (d, k)
                            fit_args = '-l -r %s -s %s -y %s' % (clean(r), s, y)
                            # the template takes its arguments in a
                            # different order on and off the cluster
                            if on_sge:
                                call_str = dismod3.settings.GBD_FIT_STR % (o, e, fit_args, dm_id)
                            else:
                                call_str = dismod3.settings.GBD_FIT_STR % (fit_args, dm_id, o, e)
                            subprocess.call(call_str, shell=True)
                            time.sleep(1.)

            elif estimate_type.find('empirical priors') != -1:
                # fit empirical priors (by pooling data from all regions)
                d = '%s/empirical_priors' % job_dir
                if os.path.exists(d):
                    rmtree(d)
                os.mkdir(d)
                os.mkdir('%s/stdout' % d)
                os.mkdir('%s/stderr' % d)

                dismod3.init_job_log(dm_id, 'empirical_priors', param_id)
                for t in ['excess-mortality', 'remission', 'incidence', 'prevalence']:
                    o = '%s/stdout/%s' % (d, t)
                    e = '%s/stderr/%s' % (d, t)
                    if on_sge:
                        call_str = dismod3.settings.GBD_FIT_STR % (o, e, '-l -t %s' % t, dm_id)
                    else:
                        call_str = dismod3.settings.GBD_FIT_STR % ('-l -t %s' % t, dm_id, o, e)
                    subprocess.call(call_str, shell=True)

            else:
                log('unrecognized estimate type: %s' % estimate_type)

        time.sleep(dismod3.settings.SLEEP_SECS)
def daemon_loop():
    """ Poll the job queue forever and launch the fits each job asks for

    For every queued job the disease model is loaded and a fresh working
    directory is made (an existing one is moved aside with
    ``random_rename``).  Depending on the 'estimate_type' in the run
    status, the job starts a continuous single parameter model fit, one
    posterior fit per region/sex/year, or one empirical-prior fit per
    rate type, all through ``subprocess.call``.  The loop sleeps
    ``dismod3.settings.SLEEP_SECS`` between polls and never returns.
    """
    on_sge = dismod3.settings.ON_SGE
    while True:
        try:
            job_queue = dismod3.get_job_queue()
        except Exception:
            # best effort: a failed poll is treated as an empty queue.
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # still stop the daemon.
            job_queue = []

        for param_id in job_queue:
            log('processing job %d' % param_id)
            job_params = dismod3.remove_from_job_queue(param_id)
            dm_id = int(job_params['dm_id'])
            dm = dismod3.get_disease_model(dm_id)

            # make a working directory for the id
            job_dir = dismod3.settings.JOB_WORKING_DIR % dm_id
            if os.path.exists(job_dir):
                dismod3.disease_json.random_rename(job_dir)
            os.makedirs(job_dir)

            run_status = dm.params.get('run_status', {})
            estimate_type = run_status.get('estimate_type', 'fit all individually')

            if estimate_type == 'Fit continuous single parameter model':
                o = '%s/continuous_spm.stdout' % job_dir
                e = '%s/continuous_spm.stderr' % job_dir
                if on_sge:
                    print(o)
                    print(e)
                    call_str = 'qsub -cwd -o %s -e %s ' % (o, e) \
                        + 'run_on_cluster.sh /home/OUTPOST/abie/gbd_dev/gbd/fit_continuous_spm.py %d' % dm_id
                else:
                    call_str = 'python -u /home/abie/gbd/fit_continuous_spm.py %d 2>%s |tee %s' % (dm_id, e, o)
                subprocess.call(call_str, shell=True)
                continue

            if estimate_type.find('posterior') != -1:
                # fit each region/year/sex individually for this model
                regions_to_fit = run_status.get('regions_to_fit', [])
                # an empty list (the default) means nothing to fit; do not
                # index into it
                if regions_to_fit and regions_to_fit[0] == 'all_regions':
                    regions_to_fit = dismod3.gbd_regions

                d = '%s/posterior' % job_dir
                if os.path.exists(d):
                    rmtree(d)
                os.mkdir(d)
                os.mkdir('%s/stdout' % d)
                os.mkdir('%s/stderr' % d)
                os.mkdir('%s/pickle' % d)

                dismod3.init_job_log(dm_id, 'posterior', param_id)
                for r in regions_to_fit:
                    for s in dismod3.gbd_sexes:
                        for y in dismod3.gbd_years:
                            k = '%s+%s+%s' % (clean(r), s, y)
                            o = '%s/stdout/%s' % (d, k)
                            e = '%s/stderr/%s' % (d, k)
                            fit_args = '-l -r %s -s %s -y %s' % (clean(r), s, y)
                            # the template takes its arguments in a
                            # different order on and off the cluster
                            if on_sge:
                                call_str = dismod3.settings.GBD_FIT_STR % (o, e, fit_args, dm_id)
                            else:
                                call_str = dismod3.settings.GBD_FIT_STR % (fit_args, dm_id, o, e)
                            subprocess.call(call_str, shell=True)

            elif estimate_type.find('empirical priors') != -1:
                # fit empirical priors (by pooling data from all regions)
                d = '%s/empirical_priors' % job_dir
                if os.path.exists(d):
                    rmtree(d)
                os.mkdir(d)
                os.mkdir('%s/stdout' % d)
                os.mkdir('%s/stderr' % d)
                os.mkdir('%s/pickle' % d)

                dismod3.init_job_log(dm_id, 'empirical_priors', param_id)
                for t in ['excess-mortality', 'remission', 'incidence', 'prevalence']:
                    o = '%s/stdout/%s' % (d, t)
                    e = '%s/stderr/%s' % (d, t)
                    if on_sge:
                        call_str = dismod3.settings.GBD_FIT_STR % (o, e, '-l -t %s' % t, dm_id)
                    else:
                        call_str = dismod3.settings.GBD_FIT_STR % ('-l -t %s' % t, dm_id, o, e)
                    subprocess.call(call_str, shell=True)

            else:
                log('unrecognized estimate type: %s' % estimate_type)

        time.sleep(dismod3.settings.SLEEP_SECS)
def fit_all(id):
    """ Enqueues all jobs necessary to fit specified model
    to the cluster

    Three groups of ``qsub`` jobs are submitted: the empirical prior
    fit, one posterior fit per region/sex/year held until the
    empirical prior job finishes, and an upload job held until all
    posterior jobs finish.

    Parameters
    ----------
    id : int
      The model id number for the job to fit

    Example
    -------
    >>> import fit_all
    >>> fit_all.fit_all(2552)
    """
    # TODO: store all disease information in this dir already, so fetching is not necessary
    # download the disease model json and store it in the working dir
    print('downloading disease model')
    dismod3.disease_json.create_disease_model_dir(id)
    dm = dismod3.fetch_disease_model(id)

    # get the all-cause mortality data, and merge it into the model
    mort = dismod3.fetch_disease_model('all-cause_mortality')
    dm.data += mort.data
    dm.save()

    job_dir = dismod3.settings.JOB_WORKING_DIR % id

    # fit empirical priors (by pooling data from all regions)
    # TODO: refactor into a function
    emp_names = []
    for t in ['prevalence']:
        out_path = '%s/empirical_priors/stdout/%s' % (job_dir, t)
        err_path = '%s/empirical_priors/stderr/%s' % (job_dir, t)
        name_str = '%s-%d' % (t[0], id)
        emp_names.append(name_str)
        call_str = ('qsub -cwd -o %s -e %s ' % (out_path, err_path)
                    + '-N %s ' % name_str
                    + 'run_on_cluster.sh fit_emp_prior.py %d -t %s' % (id, t))
        subprocess.call(call_str, shell=True)

    # directory to save the country level posterior csv files
    temp_dir = job_dir + '/posterior/country_level_posterior_dm-' + str(id) + '/'
    if os.path.exists(temp_dir):
        rmtree(temp_dir)
    os.makedirs(temp_dir)

    # fit each region/year/sex individually for this model,
    # once the empirical prior jobs are done
    hold_str = '-hold_jid %s ' % ','.join(emp_names)
    post_names = []
    for ii, r in enumerate(dismod3.gbd_regions):
        for s in dismod3.gbd_sexes:
            for y in dismod3.gbd_years:
                k = '%s+%s+%s' % (clean(r), s, y)
                out_path = '%s/posterior/stdout/%s' % (job_dir, k)
                err_path = '%s/posterior/stderr/%s' % (job_dir, k)
                name_str = '%s%d%s%s%d' % (r[0], ii+1, s[0], str(y)[-1], id)
                post_names.append(name_str)
                call_str = ('qsub -cwd -o %s -e %s ' % (out_path, err_path)
                            + hold_str
                            + '-N %s ' % name_str
                            + 'run_on_cluster.sh fit_posterior_prevonly.py %d -r %s -s %s -y %s' % (id, clean(r), s, y))
                subprocess.call(call_str, shell=True)

    # after all posteriors have finished running, upload disease model json
    hold_str = '-hold_jid %s ' % ','.join(post_names)
    out_path = '%s/upload.stdout' % job_dir
    err_path = '%s/upload.stderr' % job_dir
    call_str = ('qsub -cwd -o %s -e %s ' % (out_path, err_path)
                + hold_str
                + '-N upld-%s ' % id
                + 'run_on_cluster.sh upload_fits.py %d' % id)
    subprocess.call(call_str, shell=True)
def fit(dm, method='map'):
    """ Generate an estimate of the generic disease model parameters
    using maximum a posteriori likelihood (MAP) or Markov-chain Monte
    Carlo (MCMC)

    The model variables are built once and kept on ``dm.vars``; the MAP
    and MCMC objects are likewise kept on ``dm.map`` / ``dm.mcmc`` so a
    later call continues from the current values.

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)

    method : string, optional
      the parameter estimation method, either 'map' or 'mcmc'

    Example
    -------
    >>> import dismod3
    >>> import dismod3.generic_disease_model as model
    >>> dm = dismod3.get_disease_model(1)
    >>> model.fit(dm, method='map')
    >>> model.fit(dm, method='mcmc')
    """
    if not hasattr(dm, 'vars'):
        for param_type in ['incidence', 'remission', 'excess-mortality']:
            # find initial values for these rates
            data = [d for d in dm.data
                    if param_type in clean(d['data_type'])]

            # use a random subset of the data if there is a lot of it,
            # to speed things up
            if len(data) > 25:
                initial_data = random.sample(data, 25)
            else:
                initial_data = data
            dm.fit_initial_estimate(param_type, initial_data)

            dm.set_units(param_type, '(per person-year)')

        dm.set_units('prevalence', '(per person)')
        dm.set_units('duration', '(years)')

        dm.vars = setup(dm)

    if method == 'map':
        if not hasattr(dm, 'map'):
            dm.map = mc.MAP(dm.vars)

        try:
            dm.map.fit(method='fmin_powell', iterlim=500, tol=.001, verbose=1)
        except KeyboardInterrupt:
            # if user cancels with cntl-c, save current values for "warm-start"
            pass

        for t in dismod3.settings.output_data_types:
            t = clean(t)
            val = dm.vars[t]['rate_stoch'].value
            dm.set_map(t, val)
            # better initial value may save time in the future
            dm.set_initial_value(t, val)

    elif method == 'mcmc':
        if not hasattr(dm, 'mcmc'):
            dm.mcmc = mc.MCMC(dm.vars)
            # step every non-empty group of logit_p stochs jointly
            for key in dm.vars:
                stochs = dm.vars[key].get('logit_p_stochs', [])
                if len(stochs) > 0:
                    dm.mcmc.use_step_method(mc.AdaptiveMetropolis, stochs)

        try:
            dm.mcmc.sample(iter=60*1000, burn=10*1000, thin=50, verbose=1)
        except KeyboardInterrupt:
            # if user cancels with cntl-c, save current values for "warm-start"
            pass

        for t in dismod3.settings.output_data_types:
            t = clean(t)
            rate_model.store_mcmc_fit(dm, t, dm.vars[t])
def table_disease_model(dm, keys, ws, x, y, group_size):
    """Make a table representation of the disease model data and estimates provided

    Parameters
    ----------
    dm : disease model object
      its ``data`` and ``params['condition']`` are read here, and it is
      passed on to write_table_age_value / write_table_group_value
    keys : list
      the keys to include; a 'with-condition-death' key for the same
      region, year and sex as keys[0] is appended to this list in place
    ws : work sheet
    x : offset added to the first argument of every ws.write call
      (presumably the row -- confirm against the worksheet API)
    y : offset added to the second argument of every ws.write call
      (presumably the column)
    group_size : integer
      1 gives one table line per year of age, 0 gives the built-in
      age groups (0, 1-4, 5-9, ..., 85+), and a value above 1 gives
      equal-width age groups of that many years
    """
    MAX_AGE = dismod3.MAX_AGE

    group_sizes = [1, 4, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 16]
    if group_size > 1:
        group_sizes = [group_size] * (MAX_AGE // group_size)
        if MAX_AGE % group_size > 0:
            group_sizes.append(MAX_AGE % group_size)

    # (start, end) age bins with end exclusive, one per table line when
    # ages are grouped.  Building them from group_sizes keeps the age
    # column in step with the grouped prior/posterior columns and avoids
    # an empty trailing bin when group_size divides MAX_AGE evenly.
    bins = []
    if group_size == 0 or group_size > 1:
        start = 0
        for size in group_sizes:
            bins.append((start, start + size))
            start += size

    data_hash = GBDDataHash(dm.data)

    c = dismod3.utils.KEY_DELIM_CHAR
    rate_type, region, year, sex = keys[0].split(c)

    # add a key: with-condition-death = with-condition-mortality * prevalence * population
    keys.append('with-condition-death' + c + region + c + year + c + sex)

    ws.write(x + 2, y, "Condition: %s" % (dm.params['condition']))
    ws.write(x + 3, y, "Region: %s" % (region))
    ws.write(x + 4, y + 1, "%s %s" % (sex.capitalize(), year))

    # first header line: which section each column belongs to
    x += 5
    for i in range(1, 5):
        ws.write(x, y + i, "Data")
    for i in range(5, 17):
        ws.write(x, y + i, "Prior")
    for i in range(17, 45):
        ws.write(x, y + i, "Posterior")

    # second header line: the quantity in each column; estimates take
    # three columns each (value, lower ui, upper ui)
    x += 1
    rates = ["Prevalence", "Incidence", "Remission", "Excess Mortality"]
    posterior_only = ["Duration", "With-condition Mortality", "RR Mortality"]
    labels = ["Age"] + rates
    for name in rates + rates + posterior_only:
        labels += [name] * 3
    labels.append("Age of onset")
    for name in ["Incidence_x_duration", "With-condition Death"]:
        labels += [name] * 3
    for i, label in enumerate(labels):
        ws.write(x, y + i, label)

    # third header line: units and uncertainty interval markers
    x += 1
    ui = ["lower ui", "upper ui"]
    units = ["(years)"] + ["(rate)"] * 4
    units += (["(rate)"] + ui) * 8           # prior and posterior rates
    units += ["(years)"] + ui                # duration
    units += (["(rate)"] + ui) * 2           # with-condition and RR mortality
    units += ["(years)"]                     # age of onset
    units += ["(thousand person-years)"] + ui
    units += ["(thousands)"] + ui
    for i, unit in enumerate(units):
        ws.write(x, y + i, unit)

    # age labels, and the matching mid-interval age in the onset column
    x += 1
    y38 = y + 38
    if group_size == 1:
        for j in range(MAX_AGE):
            ws.write(x + j, y, j)
            ws.write(x + j, y38, j + .5)
    else:
        for j, (start, end) in enumerate(bins):
            if group_size == 0 and start == 0:
                label = "0"
            elif group_size == 0 and start == 85:
                label = "85+"
            else:
                label = "%s-%s" % (start, end - 1)
            ws.write(x + j, y, label)
            ws.write(x + j, y38, .5 * (start + end))

    data_columns = {'prevalence': 1, 'incidence': 2, 'remission': 3,
                    'excess-mortality': 4}
    prior_columns = {'prevalence': 5, 'incidence': 8, 'remission': 11,
                     'excess-mortality': 14}
    posterior_columns = {'prevalence': 17, 'incidence': 20, 'remission': 23,
                         'excess-mortality': 26, 'duration': 29,
                         'mortality': 32, 'relative-risk': 35,
                         'incidence_x_duration': 39,
                         'with-condition-death': 42}
    stats = ['mean', 'lower_ui', 'upper_ui']

    for k in keys:
        rate_type, region, year, sex = k.split(c)

        # ---- data columns: mean of the data that covers each age ----
        if rate_type in data_columns:
            column = y + data_columns[rate_type]
            data_type = clean(rate_type) + ' data'
            data = data_hash.get(data_type, region, year, sex) \
                + data_hash.get(data_type, region, year, 'total')

            # '' marks an age that no data point covers
            data_all = [''] * MAX_AGE
            data_weight_all = [0] * MAX_AGE
            for datum in data:
                start = datum['age_start']
                # age_end is inclusive, so the largest usable age is
                # MAX_AGE - 1; clamping to MAX_AGE indexed past the list
                end = min(datum['age_end'], MAX_AGE - 1)
                for j in range(start, end + 1):
                    p = datum['parameter_value'] / float(datum['units'])
                    # every data point is weighted equally
                    data_weight = 1
                    if data_all[j] == '':
                        data_all[j] = p * data_weight
                    else:
                        data_all[j] += p * data_weight
                    data_weight_all[j] += data_weight

            for j in range(MAX_AGE):
                if data_weight_all[j] != 0:
                    data_all[j] = data_all[j] / data_weight_all[j]

            if group_size == 1:
                for j in range(MAX_AGE):
                    ws.write(x + j, column, data_all[j])
            else:
                # average over the ages in each bin that have data
                for j, (start, end) in enumerate(bins):
                    vals = [data_all[i] for i in range(start, end)
                            if data_all[i] != '']
                    if vals:
                        ws.write(x + j, column, sum(vals) / len(vals))

        # ---- empirical prior columns ----
        if rate_type in prior_columns:
            column = y + prior_columns[rate_type]
            for offset, stat in enumerate(stats):
                if group_size == 1:
                    write_table_age_value(dm, k, 'emp_prior_' + stat, ws, x, column + offset)
                else:
                    write_table_group_value(dm, k, 'emp_prior_' + stat, ws, x, column + offset, group_sizes)

        # ---- posterior columns ----
        if rate_type in posterior_columns:
            column = y + posterior_columns[rate_type]
            for offset, stat in enumerate(stats):
                if group_size == 1:
                    write_table_age_value(dm, k, stat, ws, x, column + offset)
                else:
                    write_table_group_value(dm, k, stat, ws, x, column + offset, group_sizes)
def fit_emp_prior(dm, param_type, iter=30000, thin=20, burn=10000, dbname='/dev/null'):
    """ Generate an empirical prior distribution for a single disease parameter

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, (hyper)-priors, and additional
      information (like input and output age-mesh).

    param_type : str, one of 'incidence', 'prevalence', 'remission', 'excess-mortality'
      The disease parameter to work with

    iter, thin, burn : int, optional
      Passed unchanged to ``dm.mcmc.sample`` as its ``iter``, ``thin`` and
      ``burn`` arguments.

    dbname : str, optional
      Passed to ``mc.MCMC`` as ``dbname`` (the sampler is created with
      ``db='pickle'``).

    Returns
    -------
    None.  Returns early, right after the model variables are set up, when
    ``dm.vars['data']`` is empty; by then the previous empirical prior has
    already been cleared and the initial estimate refitted.

    Notes
    -----
    The results of this fit are stored in the disease model's params
    hash (via ``dm.set_empirical_prior``, ``dm.set_initial_value`` and
    ``dm.set_mcmc``) for use when fitting multiple parameter types together.

    Side effects visible in this function: ``dm.vars``, ``dm.map`` and
    ``dm.mcmc`` are (re)assigned, and ``mc.warnings.warn`` is replaced by
    ``sys.stdout.write`` for the rest of the process.

    Example
    -------
    $ python2.5 gbd_fit.py 231 -t incidence
    """
    data = [d for d in dm.data
            if clean(d['data_type']).find(param_type) != -1 and d.get('ignore') != -1]
    dm.calc_effective_sample_size(data)

    # cause-specific mortality data is only collected as lower-bound data
    # when fitting excess-mortality
    lower_bound_data = []
    if param_type == 'excess-mortality':
        lower_bound_data = [d for d in dm.data
                            if d['data_type'] == 'cause-specific mortality data']
        dm.calc_effective_sample_size(lower_bound_data)

    dm.clear_empirical_prior()
    dm.fit_initial_estimate(param_type, data)

    dm.vars = setup(dm, param_type, data, lower_bound_data=lower_bound_data)

    # don't do anything if there is no data for this parameter type
    if len(dm.vars['data']) == 0:
        return

    debug('i: %s' % ', '.join(['%.2f' % x for x in dm.vars['rate_stoch'].value[::10]]))
    sys.stdout.flush()

    # fit the model
    # remove the dispersion term while finding initial values for MCMC,
    # then put it back so the sampler sees it
    log_dispersion = dm.vars.pop('log_dispersion')
    dm.map = mc.MAP(dm.vars)
    dm.vars.update(log_dispersion=log_dispersion)
    try:
        dm.map.fit(method='fmin_powell', iterlim=500, verbose=1)
    except KeyboardInterrupt:
        debug('User halted optimization routine before optimal value found')
    sys.stdout.flush()

    # make pymc warnings go to stdout
    mc.warnings.warn = sys.stdout.write

    dm.mcmc = mc.MCMC(dm.vars, db='pickle', dbname=dbname)
    dm.mcmc.use_step_method(mc.Metropolis, dm.vars['log_dispersion'],
                            proposal_sd=dm.vars['dispersion_step_sd'])
    dm.mcmc.use_step_method(mc.AdaptiveMetropolis, dm.vars['age_coeffs_mesh'],
                            cov=dm.vars['age_coeffs_mesh_step_cov'], verbose=0)
    dm.mcmc.sample(iter=iter, burn=burn, thin=thin, verbose=1)
    dm.mcmc.db.commit()

    # set the stochastics to their posterior means
    dm.vars['region_coeffs'].value = dm.vars['region_coeffs'].stats()['mean']
    dm.vars['study_coeffs'].value = dm.vars['study_coeffs'].stats()['mean']
    dm.vars['age_coeffs_mesh'].value = dm.vars['age_coeffs_mesh'].stats()['mean']
    dm.vars['log_dispersion'].value = dm.vars['log_dispersion'].stats()['mean']

    alpha = dm.vars['region_coeffs'].stats()['mean']
    beta = dm.vars['study_coeffs'].stats()['mean']
    gamma_mesh = dm.vars['age_coeffs_mesh'].stats()['mean']
    debug('a: %s' % ', '.join(['%.2f' % x for x in alpha]))
    debug('b: %s' % ', '.join(['%.2f' % x for x in beta]))
    debug('g: %s' % ', '.join(['%.2f' % x for x in gamma_mesh]))
    debug('d: %.2f' % dm.vars['dispersion'].stats()['mean'])
    debug('m: %s' % ', '.join(['%.2f' % x for x in dm.vars['rate_stoch'].stats()['mean'][::10]]))

    covariates_dict = dm.get_covariates()
    X = covariates(data[0], covariates_dict)
    debug('p: %s' % ', '.join(['%.2f' % x for x in predict_rate(X, alpha, beta, gamma_mesh, dm.vars['bounds_func'], dm.get_param_age_mesh())]))

    # save the results in the param_hash
    prior_vals = dict(
        alpha=list(dm.vars['region_coeffs'].stats()['mean']),
        beta=list(dm.vars['study_coeffs'].stats()['mean']),
        gamma=list(dm.vars['age_coeffs'].stats()['mean']),
        delta=float(dm.vars['dispersion'].stats()['mean']))

    prior_vals.update(
        sigma_alpha=list(dm.vars['region_coeffs'].stats()['standard deviation']),
        sigma_beta=list(dm.vars['study_coeffs'].stats()['standard deviation']),
        sigma_gamma=list(dm.vars['age_coeffs'].stats()['standard deviation']),
        sigma_delta=float(dm.vars['dispersion'].stats()['standard deviation']))

    # save the goodness-of-fit statistics for the empirical prior
    prior_vals.update(
        aic=dm.map.AIC,
        bic=dm.map.BIC,
        dic=dm.mcmc.dic()
        )

    dm.set_empirical_prior(param_type, prior_vals)

    # the extra 1000 keeps the median defined even if the data list is short
    median_sample_size = np.median([values_from(dm, d)[3] for d in dm.vars['data']] + [1000])
    debug('median effective sample size: %.1f' % median_sample_size)

    param_mesh = dm.get_param_age_mesh()
    age_mesh = dm.get_estimate_age_mesh()

    # keep every 5th joint draw of the coefficients; list() makes the slice
    # valid whether or not zip() itself returns a list
    trace = list(zip(dm.vars['region_coeffs'].trace(),
                     dm.vars['study_coeffs'].trace(),
                     dm.vars['age_coeffs'].trace()))[::5]

    for r in dismod3.gbd_regions:
        print('predicting rates for %s' % r)
        for y in dismod3.gbd_years:
            for s in dismod3.gbd_sexes:
                key = dismod3.gbd_key_for(param_type, r, y, s)
                rate_trace = []
                for a, b, g in trace:
                    rate_trace.append(predict_region_rate(key,
                                                          alpha=a,
                                                          beta=b,
                                                          gamma=g,
                                                          covariates_dict=covariates_dict,
                                                          bounds_func=dm.vars['bounds_func'],
                                                          ages=dm.get_estimate_age_mesh()))
                mu = dismod3.utils.interpolate(param_mesh, np.mean(rate_trace, axis=0)[param_mesh], age_mesh)
                dm.set_initial_value(key, mu)
                dm.set_mcmc('emp_prior_mean', key, mu)

                # similar to saving upper_ui and lower_ui in function store_mcmc_fit below
                rate_trace = np.sort(rate_trace, axis=0)
                # array indices must be integers: truncate explicitly rather
                # than relying on implicit float-to-int index conversion
                upper_row = int(.975 * len(rate_trace))
                lower_row = int(.025 * len(rate_trace))
                dm.set_mcmc('emp_prior_upper_ui', key, dismod3.utils.interpolate(param_mesh, rate_trace[upper_row, :][param_mesh], age_mesh))
                dm.set_mcmc('emp_prior_lower_ui', key, dismod3.utils.interpolate(param_mesh, rate_trace[lower_row, :][param_mesh], age_mesh))
Xb = [] for level in ['Study_level', 'Country_level']: for k in sorted(covariates_dict[level]): if covariates_dict[level][k]['rate']['value'] == 1 and standardize_data_type[d['parameter']][:-5] in covariates_dict[level][k]['types']['value']: Xb.append(float(d.get(clean(k)) or 0.)) #debug('%s-%s-%s-%s: Xb = %s' % (d['sex'], d['year_start'], d['gbd_region'], d.get('country_iso3_code', 'none'), str(Xb))) if Xb == []: Xb = [0.] return Xa, Xb from dismod3.utils import clean import csv import settings countries_for = dict( [[clean(x[0]), x[1:]] for x in csv.reader(open(settings.CSV_PATH + 'country_region.csv'))] ) population_by_age = dict( [[(d['Country Code'], d['Year'], d['Sex']), [max(.001,float(d['Age %d Population' % i])) for i in range(MAX_AGE)]] for d in csv.DictReader(open(settings.CSV_PATH + 'population.csv')) if len(d['Country Code']) == 3] ) def regional_population(key): """ calculate regional population for a gbd key""" t,r,y,s = type_region_year_sex_from_key(key) pop = np.zeros(MAX_AGE) for c in countries_for[clean(r)]: pop += population_by_age[(c, y, s)] return pop
def fit(dm, method="map", param_type="prevalence", units="(per 1.0)", emp_prior=None):
    """ Generate an estimate of the beta binomial model parameters
    using maximum a posteriori likelihood (MAP) or Markov-chain Monte
    Carlo (MCMC).

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, priors, and additional
      information (like input and output age-mesh).

    method : string, optional
      The parameter estimation method, either 'map' or 'mcmc'.  Any
      other value leaves the model unfitted (only the setup step runs).

    param_type : str, optional
      Only data in dm.data with clean(d['data_type']).find(param_type) != -1
      will be included in the beta-binomial likelihood function.

    units : str, optional
      The units of this parameter, for pretty plotting, etc.

    emp_prior : dict, optional
      the empirical prior dictionary, retrieved from the disease model
      if appropriate by::

          >>> t, r, y, s = type_region_year_sex_from_key(key)
          >>> emp_prior = dm.get_empirical_prior(t)

      When omitted (None), a fresh empty dict is used for this call.

    Notes
    -----
    The model variables are built only once per ``dm``: if ``dm`` already
    has a ``vars`` attribute, ``param_type``, ``units`` and ``emp_prior``
    are not used to rebuild them.  Likewise ``dm.map`` and ``dm.mcmc`` are
    created on first use and reused afterwards.

    Example
    -------
    >>> import dismod3
    >>> import dismod3.beta_binomial_model as model
    >>> dm = dismod3.get_disease_model(1)
    >>> model.fit(dm, method='map', param_type='excess-mortality', units='(per person-year)')
    >>> model.fit(dm, method='mcmc', param_type='excess-mortality', units='(per person-year)')
    """
    # a literal {} default would be one dict shared by every call
    if emp_prior is None:
        emp_prior = {}

    # setup model variables, if they do not already exist
    if not hasattr(dm, "vars"):
        data = [d for d in dm.data if clean(d["data_type"]).find(param_type) != -1]

        # use a random subset of the data if there is a lot of it,
        # to speed things up
        if len(data) > 25:
            dm.fit_initial_estimate(param_type, random.sample(data, 25))
        else:
            dm.fit_initial_estimate(param_type, data)

        dm.set_units(param_type, units)

        dm.vars = setup(dm, param_type, data, emp_prior)

    # fit the model, with the selected method
    if method == "map":
        if not hasattr(dm, "map"):
            dm.map = mc.MAP(dm.vars)
        dm.map.fit(method="fmin_powell", iterlim=500, tol=0.001, verbose=1)
        dm.set_map(param_type, dm.vars["rate_stoch"].value)
    elif method == "mcmc":
        if not hasattr(dm, "mcmc"):
            dm.mcmc = mc.MCMC(dm.vars)
            # only register the adaptive step method when there are
            # latent variables to apply it to
            if len(dm.vars["latent_p"]) > 0:
                dm.mcmc.use_step_method(mc.AdaptiveMetropolis, dm.vars["latent_p"])
        dm.mcmc.sample(iter=40000, burn=10000, thin=30, verbose=1)
        store_mcmc_fit(dm, param_type, dm.vars["rate_stoch"])
Xb = [] for level in ['Study_level', 'Country_level']: for k in sorted(covariates_dict[level]): if covariates_dict[level][k]['rate']['value'] == 1: Xb.append(float(d.get(clean(k)) or 0.)) #debug('%s-%s-%s-%s: Xb = %s' % (d['sex'], d['year_start'], d['gbd_region'], d.get('country_iso3_code', 'none'), str(Xb))) if Xb == []: Xb = [0.] return Xa, Xb from dismod3.utils import clean import csv import settings countries_for = dict( [[clean(x[0]), x[1:]] for x in csv.reader(open(settings.CSV_PATH + 'country_region.csv'))] ) population_by_age = dict( [[(d['Country Code'], d['Year'], d['Sex']), [max(.001,float(d['Age %d Population' % i])) for i in range(dismod3.settings.MAX_AGE)]] for d in csv.DictReader(open(settings.CSV_PATH + 'population.csv')) if len(d['Country Code']) == 3] ) def regional_population(key): """ calculate regional population for a gbd key""" t,r,y,s = dismod3.utils.type_region_year_sex_from_key(key) pop = pl.zeros(dismod3.settings.MAX_AGE) for c in countries_for[clean(r)]: if y == 'all' and s == 'all': for yy in dismod3.settings.gbd_years: for ss in dismod3.settings.gbd_sexes:
def fit_emp_prior(dm, param_type):
    """ Fit an empirical prior for one disease parameter by MAP estimation

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, (hyper)-priors, and additional
      information (like input and output age-mesh).

    param_type : str, one of 'incidence', 'prevalence', 'remission', 'excess-mortality'
      The disease parameter to work with

    Notes
    -----
    Nothing is done (and None is returned) when dm.data holds no entry
    whose cleaned 'data_type' contains param_type.

    Otherwise the fitted coefficients are stored in the disease model's
    params hash with dm.set_empirical_prior, and a prior mean plus
    lower/upper bounds (mean -/+ 1.96 * dispersion on the logit scale)
    are stored for every region/year/sex key and for the
    'world'/1997/'total' key.

    Example
    -------
    $ python2.5 gbd_fit.py 175 -t incidence -p 'zero 0 4, zero 41 100, smooth 25' # takes 7m to run
    """
    matching = []
    for datum in dm.data:
        if clean(datum['data_type']).find(param_type) != -1:
            matching.append(datum)

    # no data for this parameter type: leave the model untouched
    if not matching:
        return

    dm.fit_initial_estimate(param_type, matching)
    dm.vars = setup(dm, param_type, matching)

    # MAP fit; an interrupt keeps whatever values the optimizer reached
    dm.map = mc.MAP(dm.vars)
    try:
        dm.map.fit(method='fmin_powell', iterlim=500, tol=.00001, verbose=1)
    except KeyboardInterrupt:
        print('User halted optimization routine before optimal value found')

    # record the fitted values in the param_hash
    dm.clear_empirical_prior()
    prior_vals = {
        'alpha': list(dm.vars['region_coeffs'].value),
        'beta': list(dm.vars['study_coeffs'].value),
        'gamma': list(dm.vars['age_coeffs'].value),
        'sigma': float(dm.vars['dispersion'].value),
    }
    dm.set_empirical_prior(param_type, prior_vals)

    half_width = 1.96 * prior_vals['sigma']

    def store_prior(key):
        # predict on the logit scale, then map mean and bounds back to rates
        logit_mu = predict_logit_rate(regional_covariates(key), **prior_vals)
        mu = mc.invlogit(logit_mu)
        dm.set_initial_value(key, mu)
        dm.set_mcmc('emp_prior_mean', key, mu)
        dm.set_mcmc('emp_prior_lower_ui', key, mc.invlogit(logit_mu - half_width))
        dm.set_mcmc('emp_prior_upper_ui', key, mc.invlogit(logit_mu + half_width))

    for region in dismod3.gbd_regions:
        for year in dismod3.gbd_years:
            for sex in dismod3.gbd_sexes:
                store_prior(dismod3.gbd_key_for(param_type, region, year, sex))

    # the aggregate world key gets the same treatment
    store_prior(dismod3.gbd_key_for(param_type, 'world', 1997, 'total'))
0.02044349, 0.02214463, 0.02396039, 0.02589065, 0.0279525 , 0.03017836, 0.03261135, 0.03530052, 0.03828981, 0.04160153, 0.04523777, 0.04918468, 0.05341633, 0.05790466, 0.06263516, 0.06760523, 0.07281963, 0.07828758, 0.08401736, 0.09000903, 0.09625542, 0.10274424, 0.10945923, 0.11638187, 0.1234935 , 0.13077522, 0.13820759, 0.14577067, 0.15344416, 0.16120755, 0.16904026, 0.17692176, 0.18483165, 0.19274966, 0.20065553, 0.20852876, 0.2163489 , 0.22409584, 0.23174999, 0.23929245, 0.2467051 ]) for region in dismod3.gbd_regions: for year in dismod3.gbd_years: for sex in dismod3.gbd_sexes: key = dismod3.gbd_key_for('%s', region, year, sex) if clean(region) == 'north_america_high_income': regional_offset = 0. else: regional_offset = -.5 time_offset = (int(year)-1997)/10. if clean(sex) == 'male': sex_offset = .1 else: sex_offset = 0. # incidence rate i = mc.invlogit(mc.logit(.012 * mc.invlogit((ages - 44) / 3)) + regional_offset + time_offset + sex_offset) truth[key % 'incidence'] = i