def SCpm(SC_0=SC_0, i=i, r=r, f=f, m_all_cause=m_all_cause, age_mesh=dm.get_param_age_mesh()):
    """ Integrate the two-compartment (Susceptible, Condition) system along
    the parameter age mesh and return a 4 x len(age_mesh) array whose rows
    are S, C, p (prevalence = C/(S+C)), and m (with-condition-free mortality).

    All parameters default to values captured from the enclosing scope at
    definition time (the usual PyMC closure idiom); i, r, f, and m_all_cause
    are indexed by age, so they must cover every age in age_mesh.
    """
    n_knots = len(age_mesh)
    comp = np.zeros([2, n_knots])   # rows: S, C
    prev = np.zeros(n_knots)        # prevalence at each knot
    mort = np.zeros(n_knots)        # background mortality at each knot

    comp[:, 0] = SC_0
    prev[0] = SC_0[1] / (SC_0[0] + SC_0[1])
    a0 = age_mesh[0]
    # trim the initial m to avoid numerical instability
    mort[0] = trim(m_all_cause[a0] - f[a0] * prev[0],
                   0.1 * m_all_cause[a0],
                   1 - NEARLY_ZERO)

    # step the linear ODE from knot to knot with a matrix exponential,
    # scaled by the width of each mesh interval
    for k in range(n_knots - 1):
        a = age_mesh[k]
        dt = age_mesh[k + 1] - age_mesh[k]
        M = np.array([[-i[a] - mort[k], r[a]],
                      [i[a], -r[a] - mort[k] - f[a]]]) * dt
        comp[:, k + 1] = np.dot(scipy.linalg.expm(M), comp[:, k])

        prev[k + 1] = trim(comp[1, k + 1] / (comp[0, k + 1] + comp[1, k + 1]),
                           NEARLY_ZERO, 1 - NEARLY_ZERO)
        a_next = age_mesh[k + 1]
        mort[k + 1] = trim(m_all_cause[a_next] - f[a_next] * prev[k + 1],
                           0.1 * m_all_cause[a_next],
                           1 - NEARLY_ZERO)

    out = np.zeros([4, n_knots])
    out[0:2, :] = comp
    out[2, :] = prev
    out[3, :] = mort
    return out
def setup(dm, key, data_list, rate_stoch=None, emp_prior={}):
    """ Generate the PyMC variables for a beta binomial model of
    a single rate function

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)

    key : str
      the name of the key for everything about this model (priors,
      initial values, estimations)

    data_list : list of data dicts
      the observed data to use in the beta-binomial liklihood function

    rate_stoch : pymc.Stochastic, optional
      a PyMC stochastic (or deterministic) object, with
      len(rate_stoch.value) == len(dm.get_estimation_age_mesh()).
      This is used to link beta-binomial stochs into a larger model,
      for example.

    emp_prior : dict, optional
      the empirical prior dictionary, retrieved from the disease model
      if appropriate by::

          >>> t, r, y, s = type_region_year_sex_from_key(key)
          >>> emp_prior = dm.get_empirical_prior(t)

    Results
    -------
    vars : dict
      Return a dictionary of all the relevant PyMC objects for the
      beta binomial model.  vars['rate_stoch'] is of particular
      relevance; this is what is used to link the beta-binomial model
      into more complicated models, like the generic disease model.

    Details
    -------
    The beta binomial model parameters are the following:
      * the mean age-specific rate function
      * dispersion of this mean
      * the p_i value for each data observation that has a standard
        error (data observations that do not have standard errors
        recorded are fit as observations of the beta r.v., while
        observations with standard errors recorded have a latent
        variable for the beta, and an observed binomial r.v.).
    """
    vars = {}
    est_mesh = dm.get_estimate_age_mesh()
    # the interpolation below assumes a dense (unit-spaced) estimation mesh
    if np.any(np.diff(est_mesh) != 1):
        raise ValueError, "ERROR: Gaps in estimation age mesh must all equal 1"

    # set up age-specific rate function, if it does not yet exist
    if not rate_stoch:
        param_mesh = dm.get_param_age_mesh()

        # seed from the empirical prior mean when available, otherwise
        # from the model's stored initial value for this key
        if emp_prior.has_key("mu"):
            initial_value = emp_prior["mu"]
        else:
            initial_value = dm.get_initial_value(key)

        # find the logit of the initial values, which is a little bit
        # of work because initial values are sampled from the est_mesh,
        # but the logit_initial_values are needed on the param_mesh
        logit_initial_value = mc.logit(interpolate(est_mesh, initial_value, param_mesh))

        # weakly-informative normal prior on the logit-rate knots
        logit_rate = mc.Normal(
            "logit(%s)" % key, mu=-5.0 * np.ones(len(param_mesh)), tau=1.0e-2, value=logit_initial_value
        )
        # logit_rate = [mc.Normal('logit(%s)_%d' % (key, a), mu=-5., tau=1.e-2) for a in param_mesh]
        vars["logit_rate"] = logit_rate

        # NOTE(review): this deterministic rebinds the name rate_stoch,
        # so the rest of the function uses it whether it was passed in or
        # built here
        @mc.deterministic(name=key)
        def rate_stoch(logit_rate=logit_rate):
            return interpolate(param_mesh, mc.invlogit(logit_rate), est_mesh)

    if emp_prior.has_key("mu"):
        # soft constraint tying the rate function to the empirical prior
        # (precision from the prior's standard error)
        @mc.potential(name="empirical_prior_%s" % key)
        def emp_prior_potential(f=rate_stoch, mu=emp_prior["mu"], tau=1.0 / np.array(emp_prior["se"]) ** 2):
            return mc.normal_like(f, mu, tau)

        vars["empirical_prior"] = emp_prior_potential

    vars["rate_stoch"] = rate_stoch

    # create stochastic variable for over-dispersion "random effect"
    mu_od = emp_prior.get("dispersion", 0.001)
    dispersion = mc.Gamma("dispersion_%s" % key, alpha=10.0, beta=10.0 / mu_od)
    vars["dispersion"] = dispersion

    # beta-distribution parameters derived from the rate mean and the
    # dispersion (mean/variance parameterization of the beta)
    @mc.deterministic(name="alpha_%s" % key)
    def alpha(rate=rate_stoch, dispersion=dispersion):
        return rate / dispersion ** 2

    @mc.deterministic(name="beta_%s" % key)
    def beta(rate=rate_stoch, dispersion=dispersion):
        return (1.0 - rate) / dispersion ** 2

    vars["alpha"] = alpha
    vars["beta"] = beta

    # create potentials for priors
    vars["priors"] = generate_prior_potentials(dm.get_priors(key), est_mesh, rate_stoch, dispersion)

    # create latent and observed stochastics for data
    vars["data"] = data_list
    vars["ab"] = []
    vars["latent_p"] = []
    vars["observations"] = []
    for d in data_list:
        # set up observed stochs for all relevant data
        id = d["id"]  # NOTE(review): shadows the builtin id() within this loop

        if d["value"] == MISSING:
            print "WARNING: data %d missing value" % id
            continue

        # ensure all rate data is valid
        d_val = dm.value_per_1(d)
        d_se = dm.se_per_1(d)
        if d_val < 0 or d_val > 1:
            print "WARNING: data %d not in range [0,1]" % id
            continue

        if d["age_start"] < est_mesh[0] or d["age_end"] > est_mesh[-1]:
            raise ValueError, "Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])" % (
                d["id"],
                d["age_start"],
                d["age_end"],
                est_mesh[0],
                est_mesh[-1],
            )

        age_indices = indices_for_range(est_mesh, d["age_start"], d["age_end"])
        age_weights = d["age_weights"]

        # average alpha/beta over the observation's age interval; default
        # args freeze the current loop values in each deterministic closure
        @mc.deterministic(name="a_%d^%s" % (id, key))
        def a_i(alpha=alpha, age_indices=age_indices, age_weights=age_weights):
            return rate_for_range(alpha, age_indices, age_weights)

        @mc.deterministic(name="b_%d^%s" % (id, key))
        def b_i(beta=beta, age_indices=age_indices, age_weights=age_weights):
            return rate_for_range(beta, age_indices, age_weights)

        vars["ab"] += [a_i, b_i]

        if d_se > 0:
            # if the data has a standard error, model it as a realization
            # of a beta binomial r.v.
            latent_p_i = mc.Beta(
                "latent_p_%d^%s" % (id, key), alpha=a_i, beta=b_i, value=trim(d_val, NEARLY_ZERO, 1 - NEARLY_ZERO)
            )
            vars["latent_p"].append(latent_p_i)

            # effective sample size implied by the reported standard error
            denominator = d_val * (1 - d_val) / d_se ** 2.0
            numerator = d_val * denominator
            obs_binomial = mc.Binomial(
                "data_%d^%s" % (id, key), value=numerator, n=denominator, p=latent_p_i, observed=True
            )
            vars["observations"].append(obs_binomial)
        else:
            # if the data is a point estimate with no uncertainty
            # recorded, model it as a realization of a beta r.v.
            obs_p_i = mc.Beta(
                "latent_p_%d" % id, value=trim(d_val, NEARLY_ZERO, 1 - NEARLY_ZERO), alpha=a_i, beta=b_i, observed=True
            )
            vars["observations"].append(obs_p_i)

    return vars
def generate_disease_data(condition, cov):
    """ Generate csv files with gold-standard disease data,
    and somewhat good, somewhat dense disease data, as might be expected from a
    condition that is carefully studied in the literature

    NOTE(review): a near-identical copy of this function appears later in this
    file; at import time the later definition shadows this one.
    """
    age_len = dismod3.MAX_AGE
    ages = np.arange(age_len, dtype='float')

    # incidence rate: logistic ramp centered near age 44
    i0 = .005 + .02 * mc.invlogit((ages - 44) / 3)
    #i0 = np.maximum(0., .001 * (-.125 + np.ones_like(ages) + (ages / age_len)**2.))

    # remission rate (constant across ages)
    #r = 0. * ages
    r = .1 * np.ones_like(ages)

    # excess-mortality rate via a standardized mortality ratio that
    # declines linearly with age
    #f_init = .085 * (ages / 100) ** 2.5
    SMR = 3. * np.ones_like(ages) - ages / age_len

    # all-cause mortality-rate
    mort = dismod3.get_disease_model('all-cause_mortality')

    #age_intervals = [[a, a+9] for a in range(0, dismod3.MAX_AGE-4, 10)] + [[0, 100] for ii in range(1)]
    age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)]

    # TODO: take age structure from real data
    # NOTE(review): these two dicts are built but never used below
    sparse_intervals = dict([[
        region, random.sample(age_intervals, (ii**3 * len(age_intervals)) / len(countries_for)**3 / 1)
    ] for ii, region in enumerate(countries_for)])
    dense_intervals = dict(
        [[region, random.sample(age_intervals, len(age_intervals) / 2)] for ii, region in enumerate(countries_for)])

    gold_data = []
    noisy_data = []

    for ii, region in enumerate(sorted(countries_for)):
        if region == 'world':
            continue

        print region
        sys.stdout.flush()

        # introduce unexplained regional variation
        #i = i0 * (1 + float(ii) / 21)

        # or not
        i = i0

        for year in [1990, 2005]:
            for sex in ['male', 'female']:

                param_type = 'all-cause_mortality'
                key = dismod3.gbd_key_for(param_type, region, year, sex)
                m_all_cause = mort.mortality(key, mort.data)

                # calculate excess-mortality rate from smr
                f = (SMR - 1.) * m_all_cause

                ## compartmental model (bins S, C, D, M)
                import scipy.linalg
                from dismod3 import NEARLY_ZERO
                from dismod3.utils import trim

                SCDM = np.zeros([4, age_len])
                p = np.zeros(age_len)   # prevalence C/(S+C) by age
                m = np.zeros(age_len)   # condition-free mortality by age

                # everyone starts susceptible
                SCDM[0, 0] = 1.
                SCDM[1, 0] = 0.
                SCDM[2, 0] = 0.
                SCDM[3, 0] = 0.

                p[0] = SCDM[1, 0] / (SCDM[0, 0] + SCDM[1, 0] + NEARLY_ZERO)
                m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1 - NEARLY_ZERO)

                # step the linear system one age at a time via matrix exponential
                for a in range(age_len - 1):
                    A = [[-i[a] - m[a], r[a], 0., 0.],
                         [i[a], -r[a] - m[a] - f[a], 0., 0.],
                         [m[a], m[a], 0., 0.],
                         [0., f[a], 0., 0.]]

                    SCDM[:, a + 1] = np.dot(scipy.linalg.expm(A), SCDM[:, a])

                    p[a + 1] = SCDM[1, a + 1] / (SCDM[0, a + 1] + SCDM[1, a + 1] + NEARLY_ZERO)
                    m[a + 1] = m_all_cause[a + 1] - f[a + 1] * p[a + 1]

                # duration = E[time in bin C]
                hazard = r + m + f
                pr_not_exit = np.exp(-hazard)
                X = np.empty(len(hazard))
                X[-1] = 1 / hazard[-1]
                # backward recursion for expected remaining time with-condition
                # NOTE(review): this inner loop reuses the name ii from the
                # outer enumerate; the outer ii is not read again afterwards,
                # but renaming would be safer
                for ii in reversed(range(len(X) - 1)):
                    X[ii] = (pr_not_exit[ii] * (X[ii + 1] + 1)) + (1 / hazard[ii] * (1 - pr_not_exit[ii]) - pr_not_exit[ii])

                country = countries_for[region][0]
                params = dict(age_intervals=age_intervals, condition=condition, gbd_region=region,
                              country=country, year=year, sex=sex, effective_sample_size=1000)

                # gold data is reported over the single wide interval [0, 99]
                params['age_intervals'] = [[0, 99]]
                generate_and_append_data(gold_data, 'prevalence data', p, **params)
                generate_and_append_data(gold_data, 'incidence data', i, **params)
                generate_and_append_data(gold_data, 'excess-mortality data', f, **params)
                generate_and_append_data(gold_data, 'remission data', r, **params)
                generate_and_append_data(gold_data, 'duration data', X, **params)

                # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum
                params['age_intervals'] = [[0, 99]]
                iX = i * X * (1 - p) * regional_population(key)
                generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params)

                # noisy data is reported per single-year age interval
                params['effective_sample_size'] = 1000
                params['cov'] = 0.
                params['age_intervals'] = age_intervals
                generate_and_append_data(noisy_data, 'prevalence data', p, **params)
                generate_and_append_data(noisy_data, 'excess-mortality data', f, **params)
                generate_and_append_data(noisy_data, 'remission data', r, **params)
                generate_and_append_data(noisy_data, 'incidence data', i, **params)

    col_names = sorted(data_dict_for_csv(gold_data[0]).keys())

    # write gold-standard data as a tab-separated file
    f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w')
    csv_f = csv.writer(f_file, dialect='excel-tab')
    csv_f.writerow(col_names)
    for d in gold_data:
        dd = data_dict_for_csv(d)
        csv_f.writerow([dd[c] for c in col_names])
    f_file.close()

    # write noisy data as a tab-separated file
    f_name = OUTPUT_PATH + '%s_data.tsv' % condition
    f_file = open(f_name, 'w')
    csv_f = csv.writer(f_file, dialect='excel-tab')
    csv_f.writerow(col_names)
    for d in noisy_data:
        dd = data_dict_for_csv(d)
        csv_f.writerow([dd[c] for c in col_names])
    f_file.close()

    # upload data file
    from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL
    dismod_server_login()
    twc.go(DISMOD_BASE_URL + 'dismod/data/upload/')
    twc.formvalue(1, 'tab_separated_values', open(f_name).read())

    # TODO: find or set the model number for this model, set the
    # expert priors and covariates, merge the covariate data into the
    # model, and add the "ground truth" to the disease json

    try:
        url = twc.submit()
    except Exception, e:
        print e
def generate_disease_data(condition, cov):
    """ Generate csv files with gold-standard disease data,
    and somewhat good, somewhat dense disease data, as might be expected from a
    condition that is carefully studied in the literature

    NOTE(review): this is a near-identical redefinition of the
    generate_disease_data that appears earlier in this file; this later
    definition is the one in effect after import.
    """
    age_len = dismod3.MAX_AGE
    ages = np.arange(age_len, dtype='float')

    # incidence rate: logistic ramp centered near age 44
    i0 = .005 + .02 * mc.invlogit((ages - 44) / 3)
    #i0 = np.maximum(0., .001 * (-.125 + np.ones_like(ages) + (ages / age_len)**2.))

    # remission rate (constant across ages)
    #r = 0. * ages
    r = .1 * np.ones_like(ages)

    # excess-mortality rate via a standardized mortality ratio that
    # declines linearly with age
    #f_init = .085 * (ages / 100) ** 2.5
    SMR = 3. * np.ones_like(ages) - ages / age_len

    # all-cause mortality-rate
    mort = dismod3.get_disease_model('all-cause_mortality')

    #age_intervals = [[a, a+9] for a in range(0, dismod3.MAX_AGE-4, 10)] + [[0, 100] for ii in range(1)]
    age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)]

    # TODO: take age structure from real data
    # NOTE(review): these two dicts are built but never used below
    sparse_intervals = dict([[region, random.sample(age_intervals, (ii**3 * len(age_intervals)) / len(countries_for)**3 / 1)] for ii, region in enumerate(countries_for)])
    dense_intervals = dict([[region, random.sample(age_intervals, len(age_intervals)/2)] for ii, region in enumerate(countries_for)])

    gold_data = []
    noisy_data = []

    for ii, region in enumerate(sorted(countries_for)):
        if region == 'world':
            continue

        print region
        sys.stdout.flush()

        # introduce unexplained regional variation
        #i = i0 * (1 + float(ii) / 21)

        # or not
        i = i0

        for year in [1990, 2005]:
            for sex in ['male', 'female']:

                param_type = 'all-cause_mortality'
                key = dismod3.gbd_key_for(param_type, region, year, sex)
                m_all_cause = mort.mortality(key, mort.data)

                # calculate excess-mortality rate from smr
                f = (SMR - 1.) * m_all_cause

                ## compartmental model (bins S, C, D, M)
                import scipy.linalg
                from dismod3 import NEARLY_ZERO
                from dismod3.utils import trim

                SCDM = np.zeros([4, age_len])
                p = np.zeros(age_len)   # prevalence C/(S+C) by age
                m = np.zeros(age_len)   # condition-free mortality by age

                # everyone starts susceptible
                SCDM[0,0] = 1.
                SCDM[1,0] = 0.
                SCDM[2,0] = 0.
                SCDM[3,0] = 0.

                p[0] = SCDM[1,0] / (SCDM[0,0] + SCDM[1,0] + NEARLY_ZERO)
                m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1-NEARLY_ZERO)

                # step the linear system one age at a time via matrix exponential
                for a in range(age_len - 1):
                    A = [[-i[a]-m[a], r[a]           , 0., 0.],
                         [ i[a]     , -r[a]-m[a]-f[a], 0., 0.],
                         [      m[a],             m[a], 0., 0.],
                         [        0.,             f[a], 0., 0.]]

                    SCDM[:,a+1] = np.dot(scipy.linalg.expm(A), SCDM[:,a])

                    p[a+1] = SCDM[1,a+1] / (SCDM[0,a+1] + SCDM[1,a+1] + NEARLY_ZERO)
                    m[a+1] = m_all_cause[a+1] - f[a+1] * p[a+1]

                # duration = E[time in bin C]
                hazard = r + m + f
                pr_not_exit = np.exp(-hazard)
                X = np.empty(len(hazard))
                X[-1] = 1 / hazard[-1]
                # backward recursion for expected remaining time with-condition
                # NOTE(review): this inner loop reuses the name ii from the
                # outer enumerate; the outer ii is not read again afterwards,
                # but renaming would be safer
                for ii in reversed(range(len(X)-1)):
                    X[ii] = (pr_not_exit[ii] * (X[ii+1] + 1)) + (1 / hazard[ii] * (1 - pr_not_exit[ii]) - pr_not_exit[ii])

                country = countries_for[region][0]
                params = dict(age_intervals=age_intervals, condition=condition, gbd_region=region,
                              country=country, year=year, sex=sex, effective_sample_size=1000)

                # gold data is reported over the single wide interval [0, 99]
                params['age_intervals'] = [[0,99]]
                generate_and_append_data(gold_data, 'prevalence data', p, **params)
                generate_and_append_data(gold_data, 'incidence data', i, **params)
                generate_and_append_data(gold_data, 'excess-mortality data', f, **params)
                generate_and_append_data(gold_data, 'remission data', r, **params)
                generate_and_append_data(gold_data, 'duration data', X, **params)

                # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum
                params['age_intervals'] = [[0,99]]
                iX = i * X * (1-p) * regional_population(key)
                generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params)

                # noisy data is reported per single-year age interval
                params['effective_sample_size'] = 1000
                params['cov'] = 0.
                params['age_intervals'] = age_intervals
                generate_and_append_data(noisy_data, 'prevalence data', p, **params)
                generate_and_append_data(noisy_data, 'excess-mortality data', f, **params)
                generate_and_append_data(noisy_data, 'remission data', r, **params)
                generate_and_append_data(noisy_data, 'incidence data', i, **params)

    col_names = sorted(data_dict_for_csv(gold_data[0]).keys())

    # write gold-standard data as a tab-separated file
    f_file = open(OUTPUT_PATH + '%s_gold.tsv' % condition, 'w')
    csv_f = csv.writer(f_file, dialect='excel-tab')
    csv_f.writerow(col_names)
    for d in gold_data:
        dd = data_dict_for_csv(d)
        csv_f.writerow([dd[c] for c in col_names])
    f_file.close()

    # write noisy data as a tab-separated file
    f_name = OUTPUT_PATH + '%s_data.tsv' % condition
    f_file = open(f_name, 'w')
    csv_f = csv.writer(f_file, dialect='excel-tab')
    csv_f.writerow(col_names)
    for d in noisy_data:
        dd = data_dict_for_csv(d)
        csv_f.writerow([dd[c] for c in col_names])
    f_file.close()

    # upload data file
    from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL
    dismod_server_login()
    twc.go(DISMOD_BASE_URL + 'dismod/data/upload/')
    twc.formvalue(1, 'tab_separated_values', open(f_name).read())

    # TODO: find or set the model number for this model, set the
    # expert priors and covariates, merge the covariate data into the
    # model, and add the "ground truth" to the disease json

    try:
        url = twc.submit()
    except Exception, e:
        print e
## compartmental model (bins S, C, D, M) import scipy.linalg from dismod3 import NEARLY_ZERO from dismod3.utils import trim SCDM = np.zeros([4, age_len]) p = np.zeros(age_len) m = np.zeros(age_len) SCDM[0, 0] = 1. SCDM[1, 0] = 0. SCDM[2, 0] = NEARLY_ZERO SCDM[3, 0] = NEARLY_ZERO p[0] = SCDM[1, 0] / (SCDM[0, 0] + SCDM[1, 0] + NEARLY_ZERO) m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1 - NEARLY_ZERO) for a in range(age_len - 1): A = [[-i[a] - m[a], r[a], 0., 0.], [i[a], -r[a] - m[a] - f[a], 0., 0.], [m[a], m[a], 0., 0.], [0., f[a], 0., 0.]] SCDM[:, a + 1] = np.dot(scipy.linalg.expm(A), SCDM[:, a]) p[a + 1] = SCDM[1, a + 1] / (SCDM[0, a + 1] + SCDM[1, a + 1] + NEARLY_ZERO) m[a + 1] = trim(m_all_cause[a + 1] - f[a + 1] * p[a + 1], .1 * m_all_cause[a + 1], 1 - NEARLY_ZERO) # duration = E[time in bin C] pr_exit = 1 - r - m - f
## compartmental model (bins S, C, D, M) import scipy.linalg from dismod3 import NEARLY_ZERO from dismod3.utils import trim SCDM = np.zeros([4, age_len]) p = np.zeros(age_len) m = np.zeros(age_len) SCDM[0,0] = 1. SCDM[1,0] = 0. SCDM[2,0] = NEARLY_ZERO SCDM[3,0] = NEARLY_ZERO p[0] = SCDM[1,0] / (SCDM[0,0] + SCDM[1,0] + NEARLY_ZERO) m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO, 1-NEARLY_ZERO) for a in range(age_len - 1): A = [[-i[a]-m[a], r[a] , 0., 0.], [ i[a] , -r[a]-m[a]-f[a], 0., 0.], [ m[a], m[a] , 0., 0.], [ 0., f[a], 0., 0.]] SCDM[:,a+1] = np.dot(scipy.linalg.expm(A), SCDM[:,a]) p[a+1] = SCDM[1,a+1] / (SCDM[0,a+1] + SCDM[1,a+1] + NEARLY_ZERO) m[a+1] = trim(m_all_cause[a+1] - f[a+1] * p[a+1], .1*m_all_cause[a+1], 1-NEARLY_ZERO) # duration = E[time in bin C] pr_exit = 1 - r - m - f
# excess-mortality rate f = .085 * (ages / 100) ** 2.5 truth[key % 'excess-mortality'] = f ## compartmental model (bins S, C, D, M) SCDM = np.zeros([4, age_len]) SCDM[0,0] = 1. for a in range(age_len - 1): A = [[-i[a]-m[a], r[a] , 0., 0.], [ i[a] , -r[a]-m[a]-f[a], 0., 0.], [ m[a], m[a] , 0., 0.], [ 0., f[a], 0., 0.]] SCDM[:,a+1] = trim(np.dot(scipy.linalg.expm2(A), SCDM[:,a]), 0, 1) S = SCDM[0,:] C = SCDM[1,:] # prevalence = # with condition / (# with condition + # without) p = C / (S + C + NEARLY_ZERO) truth[key % 'prevalence'] = p truth[key % 'relative-risk'] = (m + f) / m # duration = E[time in bin C] pr_exit = 1 - r - m - f X = np.empty(len(pr_exit)) t = 1. for a in xrange(len(X) - 1, -1, -1): X[a] = t * pr_exit[a]