def test_simulated_disease():
    """ Test fit for simulated disease data"""

    # load model to test fitting
    dm = DiseaseJson(file('tests/test_disease_1.json').read())

    # filter and noise up data
    cov = .5
    data = []
    for d in dm.data:
        d['truth'] = d['value']
        if dismod3.utils.clean(d['gbd_region']) == 'north_america_high_income':
            if d['data_type'] == 'all-cause mortality data':
                data.append(d)
            else:
                se = cov * d['value']
                d['value'] = mc.rtruncnorm(d['truth'], se**-2, 0, np.inf)
                d['age_start'] -= 5
                d['age_end'] = d['age_start'] + 9
                d['age_weights'] = np.ones(d['age_end'] - d['age_start'] + 1)
                d['age_weights'] /= float(len(d['age_weights']))
                d['standard_error'] = se
                data.append(d)
    dm.data = data

    # fit empirical priors and compare fit to data
    from dismod3 import neg_binom_model
    for rate_type in 'prevalence incidence remission excess-mortality'.split():
        neg_binom_model.fit_emp_prior(dm, rate_type, '/dev/null')
    check_emp_prior_fits(dm)

    # fit posterior
    delattr(dm, 'vars')  # remove vars so that gbd_disease_model creates its own version
    from dismod3 import gbd_disease_model
    keys = dismod3.utils.gbd_keys(region_list=['north_america_high_income'],
                                  year_list=[1990], sex_list=['male'])
    gbd_disease_model.fit(dm, method='map', keys=keys, verbose=1)  ## first generate decent initial conditions
    gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=1000, thin=5, burn=5000,
                          verbose=1, dbname='/dev/null')  ## then sample the posterior via MCMC

    print 'error compared to the noisy data (coefficient of variation = %.2f)' % cov
    check_posterior_fits(dm)

    for d in dm.data:
        d['value'] = d['truth']
        d['age_start'] += 5
        d['age_end'] = d['age_start']
        d['age_weights'] = np.ones(d['age_end'] - d['age_start'] + 1)
        d['age_weights'] /= float(len(d['age_weights']))

    print 'error compared to the truth'
    check_posterior_fits(dm)

    return dm
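# A minimal sketch of the noise model used in test_simulated_disease, shown in
# isolation: each observation is replaced by a draw from a normal centered at
# the truth, truncated to stay non-negative, with standard deviation
# cov * truth (so the precision is se**-2). This assumes the same PyMC2-style
# mc.rtruncnorm(mu, tau, a, b) call used throughout this module; note it may
# return a length-1 array rather than a scalar.
import numpy as np
import pymc as mc

def noise_up(truth, cov=.5):
    """ Return a noisy non-negative observation of truth, and its s.e."""
    se = cov * truth
    return mc.rtruncnorm(truth, se**-2, 0, np.inf), se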
def step(self):
    # The right-hand sides for the linear constraints
    self.rhs = dict(zip(self.constraint_offdiags,
                        [np.asarray(np.dot(pm.utils.value(od), self.g.value)).squeeze()
                         for od in self.constraint_offdiags]))

    for i in xrange(self.n):
        try:
            lb, ub, rhs = self.get_bounds(i)
        except ConstraintError:
            warnings.warn('Bounds could not be set, this element is very highly constrained')
            continue

        newgs = np.hstack((self.g.value[i], pm.rtruncnorm(0, 1, lb, ub, size=self.n_draws)))
        lpls = np.hstack((self.get_likelihood_only(), np.empty(self.n_draws)))
        for j, newg in enumerate(newgs[1:]):
            self.set_g_value(newg, i)
            # The newgs are drawn from the prior, taking the constraints into account,
            # so accept them based on the 'likelihood children' only.
            try:
                lpls[j+1] = self.get_likelihood_only()
            except pm.ZeroProbability:
                lpls[j+1] = -np.inf

        lpls -= pm.flib.logsum(lpls)
        newg = newgs[pm.rcategorical(np.exp(lpls))]
        self.set_g_value(newg, i)

        for od in self.constraint_offdiags:
            rhs[od] += np.asarray(pm.utils.value(od))[:, i].squeeze() * newg
        self.rhs = rhs
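# The resampling idiom at the heart of step() above, shown on its own:
# candidate values drawn from the constrained prior are scored by
# log-likelihood, normalized in log space with flib.logsum to avoid underflow,
# and one candidate is then chosen categorically. A sketch assuming PyMC2's
# pm.flib.logsum and pm.rcategorical, exactly as they are used above.
import numpy as np
import pymc as pm

def resample_by_likelihood(candidates, log_likes):
    """ Pick one candidate with probability proportional to its likelihood"""
    lpls = np.array(log_likes, dtype=float)
    lpls -= pm.flib.logsum(lpls)  # after this, exp(lpls) sums to 1
    return candidates[pm.rcategorical(np.exp(lpls))]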
def propose(self):
    tau = 1. / (self.adaptive_scale_factor * self.proposal_sd)**2
    self.stochastic.value = pm.rtruncnorm(self.stochastic.value, tau,
                                          self.low_bound, self.up_bound)
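# The truncated-normal proposal above is not symmetric near the bounds, so a
# Metropolis subclass built on it needs a Hastings correction. A sketch of the
# companion method, following the same pm.truncnorm_like convention used in
# the step() method below: the correction is log q(current | proposed) minus
# log q(proposed | current).
def hastings_factor(self):
    tau = 1. / (self.adaptive_scale_factor * self.proposal_sd)**2
    cur_val = self.stochastic.value
    last_val = self.stochastic.last_value
    lp_for = pm.truncnorm_like(cur_val, last_val, tau, self.low_bound, self.up_bound)
    lp_bak = pm.truncnorm_like(last_val, cur_val, tau, self.low_bound, self.up_bound)
    return lp_bak - lp_for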
def step(self):
    # TODO: Propose from something other than the prior, and tune using the asf's.

    # The right-hand sides for the linear constraints
    self.rhs = dict(zip(self.constraint_offdiags,
                        [np.asarray(np.dot(pm.utils.value(od), self.g.value)).squeeze()
                         for od in self.constraint_offdiags]))

    this_round = np.zeros(self.n, dtype='int')
    for i in xrange(self.n):
        self.check_constraints()

        # Jump an element of g.
        lb, ub, rhs = self.get_bounds(i)

        # Propose a new value
        curg = self.g.value[i]
        tau = 1. / self.adaptive_scale_factor[i]**2
        newg = pm.rtruncnorm(curg, tau, lb, ub)[0]

        # The Hastings factor
        hf = (pm.truncnorm_like(curg, newg, tau, lb, ub)
              - pm.truncnorm_like(newg, curg, tau, lb, ub))

        # The difference in prior log-probabilities of g
        dpri = .5 * (curg**2 - newg**2)

        # Get the current log-likelihood of the non-constraint children.
        lpl = self.get_likelihood_only()

        cv = {}
        for od in self.all_offdiags:
            for c in od.children:
                cv[c] = c.value.copy()

        # Enter the proposed value and get the proposed log-likelihood.
        self.set_g_value(newg, i)
        try:
            lpl_p = self.get_likelihood_only()
        except pm.ZeroProbability:
            self.reject(i, cv)
            self.check_constraints()
            this_round[i] = -1
            continue

        # M-H acceptance
        if np.log(np.random.random()) < lpl_p - lpl + hf + dpri:
            self.accepted[i] += 1
            this_round[i] = 1
            for od in self.constraint_offdiags:
                rhs[od] += np.asarray(pm.utils.value(od))[:, i].squeeze() * newg
            self.rhs = rhs
            self.check_constraints()
        else:
            self.reject(i, cv)
            self.check_constraints()
            this_round[i] = -1
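# The dpri term in step() above is the difference of standard-normal log-prior
# densities: log N(g; 0, 1) = -g**2/2 + const, so
# dpri = .5 * (curg**2 - newg**2). A toy scalar version of the same
# Metropolis-Hastings update, assuming the pm.rtruncnorm / pm.truncnorm_like
# calls used above and a user-supplied log_like function:
import numpy as np
import pymc as pm

def toy_mh_step(curg, lpl_cur, log_like, tau=1., lb=0., ub=np.inf):
    """ One M-H update for a scalar with N(0,1) prior, constrained to [lb, ub]"""
    newg = pm.rtruncnorm(curg, tau, lb, ub)[0]
    hf = (pm.truncnorm_like(curg, newg, tau, lb, ub)
          - pm.truncnorm_like(newg, curg, tau, lb, ub))
    dpri = .5 * (curg**2 - newg**2)
    lpl_new = log_like(newg)
    if np.log(np.random.random()) < lpl_new - lpl_cur + hf + dpri:
        return newg, lpl_new  # accept
    return curg, lpl_cur  # reject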
def generate_and_append_data(data, data_type, truth, age_intervals, condition,
                             gbd_region, country, year, sex,
                             effective_sample_size, cov=0.):
    """ create simulated data"""
    for a0, a1 in age_intervals:
        d = {'condition': condition,
             'data_type': data_type,
             'gbd_region': gbd_region,
             'region': country,
             'year_start': year,
             'year_end': year,
             'sex': sex,
             'age_start': a0,
             'age_end': a1,
             'id': len(data)}

        holdout = 0
        d['ignore'] = holdout
        d['test_set'] = holdout

        ages = range(a0, a1 + 1)
        if data_type == 'incidence_x_duration':
            pop = 1. * np.ones_like(ages)
        else:
            pop = np.array([population_by_age[(country, str(year), sex)][a] for a in ages])
        if np.sum(pop) > 0:
            pop /= float(np.sum(pop))  # normalize the pop weights to sum to 1
        else:
            pop = np.ones_like(ages) / float(len(ages))  # for countries where pop is zero, fill in constant structure
        d['age_weights'] = list(pop)

        p0 = dismod3.utils.rate_for_range(truth, ages, pop)
        d['truth'] = p0

        if p0 == 0 or cov == 0:
            p1 = p0
            d['value'] = p1
            d['effective_sample_size'] = effective_sample_size
        else:
            p1 = mc.rtruncnorm(p0, (cov/100. * p0)**-2, 0, np.inf)
            assert not np.isnan(p1)
            d['value'] = p1
            d['standard_error'] = p0 * cov/100.

        data.append(d)
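# A hypothetical call to generate_and_append_data, for illustration only:
# population_by_age is assumed to be populated elsewhere in this module, and
# the condition name and flat truth curve here are made up.
data = []
truth = .01 * np.ones(101)  # a constant true rate over ages 0-100
generate_and_append_data(data, 'prevalence data', truth,
                         age_intervals=[(0, 9), (10, 19)],
                         condition='my_condition',
                         gbd_region='north_america_high_income',
                         country='USA', year=1990, sex='male',
                         effective_sample_size=1000., cov=5.)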
def fit_simulated_disease(n=300, cv=2.):
    """ Test fit for simulated disease data with noise and missingness"""

    # load model to test fitting
    dm = DiseaseJson(file('tests/simulation_gold_standard.json').read())

    # adjust any priors and covariates as desired
    dm.set_param_age_mesh(arange(0, 101, 2))
    for type in 'incidence prevalence remission excess_mortality'.split():
        dm.params['global_priors']['heterogeneity'][type] = 'Very'
    dm.params['covariates']['Country_level']['LDI_id']['rate']['value'] = 0

    # filter and noise up data
    mort_data = []
    all_data = []
    for d in dm.data:
        d['truth'] = d['value']
        d['age_weights'] = array([1.])
        if d['data_type'] == 'all-cause mortality data':
            mort_data.append(d)
        else:
            if d['value'] > 0:
                se = (cv / 100.) * d['value']
                Y_i = mc.rtruncnorm(d['truth'], se**-2, 0, np.inf)
                d['value'] = Y_i
                d['standard_error'] = se
                d['effective_sample_size'] = Y_i * (1 - Y_i) / se**2
            all_data.append(d)
    sampled_data = random.sample(all_data, n) + mort_data
    dm.data = sampled_data

    # fit empirical priors and compare fit to data
    from dismod3 import neg_binom_model
    for rate_type in 'prevalence incidence remission excess-mortality'.split():
        #neg_binom_model.fit_emp_prior(dm, rate_type, iter=1000, thin=1, burn=0, dbname='/dev/null')
        neg_binom_model.fit_emp_prior(dm, rate_type, iter=30000, thin=15, burn=15000, dbname='/dev/null')
    check_emp_prior_fits(dm)

    # fit posterior
    delattr(dm, 'vars')  # remove vars so that gbd_disease_model creates its own version
    from dismod3 import gbd_disease_model
    keys = dismod3.utils.gbd_keys(region_list=['north_america_high_income'],
                                  year_list=[1990], sex_list=['male'])
    gbd_disease_model.fit(dm, method='map', keys=keys, verbose=1)  ## first generate decent initial conditions
    gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=30000, thin=15, burn=15000,
                          verbose=1, dbname='/dev/null')  ## then sample the posterior via MCMC
    #gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=1000, thin=1, burn=0, verbose=1, dbname='/dev/null')  ## fast for dev

    print 'error compared to the noisy data (coefficient of variation = %.2f)' % cv
    check_posterior_fits(dm)

    dm.data = all_data
    for d in dm.data:
        if d['data_type'] != 'all-cause mortality data':
            d['noisy_data'] = d['value']
            d['value'] = d['truth']

    print 'error compared to the truth'
    are, coverage = check_posterior_fits(dm)
    print
    print 'Median Absolute Relative Error of Posterior Predictions:', median(are)
    print 'Pct coverage:', 100 * mean(coverage)

    f = open('score_%d_%f.txt' % (n, cv), 'a')
    f.write('%10.10f,%10.10f\n' % (median(are), mean(coverage)))
    f.close()

    dm.all_data = all_data
    dm.data = sampled_data
    for d in dm.data:
        if d['data_type'] != 'all-cause mortality data':
            d['value'] = d['noisy_data']
    generate_figure(dm, n, cv)

    return dm
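# The effective-sample-size back-calculation in fit_simulated_disease comes
# from the binomial variance: if se**2 = p * (1 - p) / n, then
# n = p * (1 - p) / se**2. A minimal sketch:
def effective_sample_size(p, se):
    """ Back-calculate the binomial sample size implied by rate p and s.e. se"""
    return p * (1 - p) / se**2

# e.g. effective_sample_size(.1, .01) == 900.0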