def test_pred_samps(self):
    "A dry run in Kenya with only one sample point. This test should not work with N>1."
    N = 1
    lat_pred = np.atleast_1d(pm.runiform(-5., 5., size=N) * deg_to_rad)
    # lat_pred = array([8.89, 9.5, 1.17, 1.39])
    lon_pred = np.atleast_1d(pm.runiform(33., 40., size=N) * deg_to_rad)
    # lon_pred = array([-1.54, .08, 39.44, 38.12])
    t_pred = np.atleast_1d(array([2007] * N) - 2009)

    pred_mesh = vstack((lon_pred, lat_pred, t_pred)).T
    age_lims = [(lo_age, up_age)] * len(lon_pred)
    N_exam = ones(len(lat_pred)) * 1000

    input_pts = [{'lon': lon_pred[i], 'lat': lat_pred[i], 'month': 1, 'year': 2009,
                  'lo_age': 2, 'up_age': 10, 'n': N_exam[i]}
                 for i in range(len(lat_pred))]
    output_pts = [{'lon': lon_pred[i], 'lat': lat_pred[i], 'year': 2009, 'month': 1,
                   'lo_age': 2, 'up_age': 10, 'nmonths': 2}
                  for i in range(len(lat_pred))]
    correction_factor_array = mbgw.correction_factors.known_age_corr_factors(arange(0, 27), 1000)

    ind_outer, ind_inner, Ms, Cs, Vs, likelihood_means, likelihood_variances, model_posteriors = \
        mbgw.EP.pred_samps(pred_mesh*deg_to_rad, pred_mesh*deg_to_rad, N_exam, tracefile,
                           trace_thin, trace_burn, N_param_vals, N_per_param, N_nearest,
                           age_lims, correction_factor_array, debug=True)
def step(self):
    """
    Slice step method
    """
    y = self.loglike - rexponential(1)

    # Stepping out procedure
    L = self.stochastic.value - self.w*runiform(0, 1)
    R = L + self.w
    J = floor(self.m*runiform(0, 1))
    K = (self.m - 1) - J

    while J > 0 and y < self.fll(L):
        L = L - self.w
        J = J - 1

    while K > 0 and y < self.fll(R):
        R = R + self.w
        K = K - 1

    #self.stochastic.last_value = self.stochastic.value
    self.stochastic.value = runiform(L, R)

    try:
        y_new = self.loglike
    except ZeroProbability:
        y_new = -infty

    while y_new < y:
        if self.stochastic.value < self.stochastic.last_value:
            L = float(self.stochastic.value)
        else:
            R = float(self.stochastic.value)

        self.stochastic.revert()
        self.stochastic.value = runiform(L, R)

        try:
            y_new = self.loglike
        except ZeroProbability:
            y_new = -infty
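# A minimal sketch (not from the original source) of how a slice step method like the
# one above is typically wired into a PyMC 2 sampler. The class name SliceStep, its
# constructor arguments w (initial bracket width) and m (maximum number of step-out
# widths), and the toy model are illustrative assumptions.
import numpy as np
import pymc as pm

class SliceStep(pm.StepMethod):
    """Hypothetical wrapper class assumed to contain the step() method shown above."""
    def __init__(self, stochastic, w=1., m=10):
        pm.StepMethod.__init__(self, [stochastic])
        self.stochastic = stochastic
        self.w = w   # initial width of the slice bracket
        self.m = m   # maximum number of widths used when stepping out

    # fll(value) is assumed to return the log-likelihood with the stochastic held at
    # `value`, and step() is the method defined above.

# toy usage: attach the custom step method to one stochastic
mu = pm.Uniform('mu', -10., 10.)
y = pm.Normal('y', mu=mu, tau=1., value=[1.2, 0.8, 1.1], observed=True)
M = pm.MCMC([mu, y])
M.use_step_method(SliceStep, mu, w=0.5, m=20)
# M.sample(10000, 5000) would then draw posterior samples using the slice updates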
def simulate_age_group_data(N=50, delta_true=150, pi_true=true_rate_function):
    """ generate simulated data """
    # start with a simple model with N rows of data
    model = data_simulation.simple_model(N)

    # record the true age-specific rates
    model.ages = pl.arange(0, 101, 1)
    model.pi_age_true = pi_true(model.ages)

    # choose age groups randomly
    age_width = mc.runiform(1, 100, size=N)
    age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N)
    age_width[:10] = 10
    age_mid[:10] = pl.arange(5, 105, 10)
    #age_width[10:20] = 10
    #age_mid[10:20] = pl.arange(5, 105, 10)

    age_start = pl.array(age_mid - age_width/2, dtype=int)
    age_end = pl.array(age_mid + age_width/2, dtype=int)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end

    # choose effective sample size uniformly at random
    n = mc.runiform(100, 10000, size=N)
    model.input_data['effective_sample_size'] = n

    # integrate true age-specific rate across age groups to find true group rate
    model.input_data['true'] = pl.nan
    model.input_data['age_weights'] = ''

    for i in range(N):
        beta = mc.rnormal(0., .025**-2)

        # TODO: clean this up, it is computing more than is necessary
        age_weights = pl.exp(beta*model.ages)
        sum_pi_wt = pl.cumsum(model.pi_age_true*age_weights)
        sum_wt = pl.cumsum(age_weights)
        p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        model.input_data.ix[i, 'true'] = p[i]
        model.input_data.ix[i, 'age_weights'] = ';'.join(['%.4f' % w for w in age_weights[age_start[i]:(age_end[i]+1)]])

    # sample observed rate values from negative binomial distribution
    model.input_data['value'] = mc.rnegative_binomial(n*model.input_data['true'], delta_true) / n

    print model.input_data.drop(['standard_error', 'upper_ci', 'lower_ci'], axis=1)
    return model
def multipoly_sample(n, mp):
    """
    Returns uniformly-distributed points on the earth's surface
    conditioned to be inside a multipolygon.

    Not particularly fast.
    """
    # b = basemap.Basemap(-180,-90,180,90)
    if isinstance(mp, geometry.MultiPolygon):
        print 'Breaking down multipolygon'
        areas = [shapely_poly_area(p) for p in mp.geoms]
        areas = np.array(areas)/np.sum(areas)
        # ns = pm.rmultinomial(n, areas)
        stair = np.array(np.concatenate(([0], np.cumsum(areas*n))), dtype='int')
        ns = np.diff(stair)
        locs = [multipoly_sample(ns[i], mp.geoms[i]) for i in np.where(ns>0)[0]]
        return np.concatenate([loc[0] for loc in locs]), np.concatenate([loc[1] for loc in locs])

    lons = np.empty(n)
    lats = np.empty(n)

    done = 0
    xmin = mp.bounds[0]*np.pi/180
    ymin = mp.bounds[1]*np.pi/180
    xmax = mp.bounds[2]*np.pi/180
    ymax = mp.bounds[3]*np.pi/180

    print 'Starting: n=%i'%n
    while done < n:
        x = np.atleast_1d(pm.runiform(xmin, xmax, size=n))
        y = np.atleast_1d(np.arcsin(pm.runiform(np.sin(ymin), np.sin(ymax), size=n)))
        points = [geom.Point([x[i]*180./np.pi, y[i]*180./np.pi]) for i in xrange(len(x))]
        good = list(iterops.contains(mp, points, True))
        n_good = min(n, len(good))
        lons[done:done+n_good] = [p.coords[0][0] for p in good][:n-done]
        lats[done:done+n_good] = [p.coords[0][1] for p in good][:n-done]
        done += n_good
        print '\tDid %i, %i remaining.'%(n_good, n-done)
        # plot_unit(b,mp)
        # b.plot(x*180./np.pi,y*180./np.pi,'r.')
        #
        # from IPython.Debugger import Pdb
        # Pdb(color_scheme='Linux').set_trace()
    print 'Filled'
    return lons, lats
def simulated_age_intervals(data_type, n, a, pi_age_true, sigma_true):
    # choose age intervals to measure
    age_start = np.array(mc.runiform(0, 100, n), dtype=int)
    age_start.sort()  # sort to make it easy to discard the edges when testing
    age_end = np.array(mc.runiform(age_start+1, np.minimum(age_start+10, 100)), dtype=int)

    # find truth for the integral across the age intervals
    import scipy.integrate
    pi_interval_true = [scipy.integrate.trapz(pi_age_true[a_0i:(a_1i+1)]) / (a_1i - a_0i)
                        for a_0i, a_1i in zip(age_start, age_end)]

    # generate covariates that add explained variation
    X = mc.rnormal(0., 1.**2, size=(n, 3))
    beta_true = [-.1, .1, .2]
    beta_true = [0, 0, 0]
    Y_true = np.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true * np.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    p = np.maximum(0., mc.rnormal(pi_true, 1./sigma_true**2.))

    # store the simulated data in a pandas DataFrame
    data = pandas.DataFrame(dict(value=p, age_start=age_start, age_end=age_end,
                                 x_0=X[:, 0], x_1=X[:, 1], x_2=X[:, 2]))
    data['effective_sample_size'] = np.maximum(p*(1-p)/sigma_true**2, 1.)

    data['standard_error'] = np.nan
    data['upper_ci'] = np.nan
    data['lower_ci'] = np.nan

    data['year_start'] = 2005.  # TODO: make these vary
    data['year_end'] = 2005.
    data['sex'] = 'total'
    data['area'] = 'all'
    data['data_type'] = data_type

    return data
def generate_data(N, delta_true, pi_true, heterogeneity, bias, sigma_prior):
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount='Moderately')
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(10000, 100000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n * pl.exp(bias)

    emp_priors = {}
    emp_priors['p', 'mu'] = pi_age_true
    emp_priors['p', 'sigma'] = sigma_prior*pi_age_true
    model.emp_priors = emp_priors

    model.a = a
    model.pi_age_true = pi_age_true
    model.delta_true = delta_true

    return model
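# A sketch (not from the original source) of how generate_data might be invoked; the
# quadratic pi_true below and all argument values are illustrative assumptions.
pi_quadratic = lambda a: .0001 * (a * (100. - a) + 100.)
model = generate_data(N=100, delta_true=5., pi_true=pi_quadratic, heterogeneity='Slightly', bias=0., sigma_prior=.5)
print model.input_data[['age_start', 'age_end', 'effective_sample_size', 'value']].head()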
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = np.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = np.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = np.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = np.exp([alpha_true[a] for a in area])
    sigma_true = .05 * np.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model = dismod_mr.data.ModelData()
    model.input_data = pd.DataFrame(dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.covariates.mean_covariate_model('test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(dismod_mr.model.likelihood.normal('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = pl.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = pl.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = pl.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = pl.exp([alpha_true[a] for a in area])
    sigma_true = .05*pl.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model = data.ModelData()
    model.input_data = pandas.DataFrame(dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(rate_model.normal_model('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def sim_data_for_validation(N, true_cf=[[0.1, 0.3, 0.6], [0.2, 0.3, 0.5]],
                            true_std=[[.2, .05, .05], [.3, 0.1, 0.1]],
                            std_bias=[1., 1., 1.]):
    """
    Input
    -----
    true_cf - a list of lists of true cause fractions (each must sum to one).
    true_std - a list of lists of the standard deviations corresponding to the true
        csmf's for each time point. Can either be a list of length J inside a list of
        length 1 (in this case, the same standard deviation is used for all time points)
        or can be T lists of length J (in this case, a separate standard deviation is
        specified and used for each time point). This is meant to capture how variable
        estimates of the true cause fraction will be (i.e. causes that are more
        difficult to estimate will be more variable and therefore will have greater
        uncertainty).
    std_bias - a list of length J giving the bias for the standard deviations for each
        cause (as a multiplier: i.e. 0.9 would imply that we will underestimate the
        standard deviation by 10% on average while 1.1 would imply that we will
        overestimate the standard deviation by 10% on average).

    Output
    ------
    N JxT draws from an 'estimated' distribution for the specified causes
    """
    if len(true_std) == 1 and len(true_cf) > 1:
        true_std = [true_std[0] for i in range(len(true_cf))]

    est_cf = sim_data(1, true_cf, true_std)[0]
    est_error = est_cf - true_cf
    est_std = true_std * mc.runiform(pl.array(std_bias)*0.9, pl.array(std_bias)*1.1)
    sims = sim_data(N, est_cf, est_std, sum_to_one=False)
    return sims
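# Illustrative call (not from the original source): 100 draws for 3 causes at 2 time
# points, reusing the default cause fractions, with a single shared set of true
# standard deviations and the first cause's standard deviation overestimated by ~20%.
# sim_data is assumed to be defined elsewhere in the same module.
draws = sim_data_for_validation(100,
                                true_cf=[[0.1, 0.3, 0.6], [0.2, 0.3, 0.5]],
                                true_std=[[.2, .05, .05]],
                                std_bias=[1.2, 1., 1.])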
if __name__ == '__main__':
    from tables import openFile
    from pylab import *

    N = 10000
    pop = 10000*np.ones(N)
    nyr = 10
    pop[::10] = 0

    p = BurdenPredictor('traces/Africa+_scale_0.6_model_exp.hdf5', pop, nyr, 0)
    pr_max = .6
    # p = BurdenPredictor('traces/CSE_Asia_and_Americas_scale_0.6_model_exp.hdf5', np.ones(N)*pop, nyr)
    # pr_max = .5

    # for i in xrange(10000):
    pr = pm.runiform(0, pr_max, size=N)

    # for i in xrange(p.n):
    #     clf()
    #     plot(xplot, p.cols.fplot[i],'r.',markersize=2)
    #     plot(pr, p.f[i](pr), 'b.',markersize=1)
    #     a=raw_input()

    b = p(pr)

    clf()
    plot(pr, b, 'k.', markersize=1)
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)

    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats()['standard deviation']
    add_quality_metrics(model.zeta)

    print '\nzeta'
    print model.zeta

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.zeta['abs_err'].dropna())), model.zeta.dropna()['covered?'].mean())

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    return model
where p(x) stands for the prior for the true input and p(a,b) the prior for the
regression parameters.
"""

from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos, array
import pylab

# ------------------------------------------------------------------------------
# Synthetic values
# Replace by real data
# ------------------------------------------------------------------------------
slope = 1.5
intercept = 4
N = 30
true_x = runiform(0, 50, N)
true_y = slope * true_x + intercept
data_y = rnormal(true_y, 2)
data_x = rnormal(true_x, 2)

# ------------------------------------------------------------------------------
# Calibration of straight line parameters from data
# ------------------------------------------------------------------------------
@stochastic
def theta(value=array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = uniform_like(intercept, -10, 10)
import numpy as np
import pymc as pm
from matplotlib import pyplot as plt

N = 20

# create some artificial data.
lifetime = pm.rweibull(2, 5, size=N)
birth = pm.runiform(0, 10, N)

censor = (birth + lifetime) > 10  # an individual is right-censored if this is True
lifetime_ = np.ma.masked_array(lifetime, censor)  # create the censorship event.
lifetime_.set_fill_value(10)  # good for computations later.

# this begins the model
alpha = pm.Uniform("alpha", 0, 20)  # lets just use uninformative priors
beta = pm.Uniform("beta", 0, 20)
obs = pm.Weibull('obs', alpha, beta, value=lifetime_, observed=True)

@pm.potential
def censor_factor(obs=obs):
    if np.any((obs + birth < 10)[lifetime_.mask]):
        return -np.inf
    else:
        return 0

# perform Markov Chain Monte Carlo - see chapter 3 of BMH
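# A minimal sketch of the sampling step the final comment refers to, using the PyMC 2
# objects defined above; iteration and burn-in counts are illustrative assumptions.
mcmc = pm.MCMC([alpha, beta, obs, censor_factor])
mcmc.sample(iter=25000, burn=10000)

# posterior means of the Weibull parameters
print mcmc.trace('alpha')[:].mean(), mcmc.trace('beta')[:].mean()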
import pymc as pm
import numpy as np
import pylab

slp = 1.5
intc = 4.0
N = 30
true_x = pm.runiform(0, 50, N)
true_y = slp*true_x + intc
data_y = pm.rnormal(true_y, 2)
data_x = pm.rnormal(true_x, 2)

# theta is a stochastic whose log-probability encodes the priors on slope and intercept
@pm.stochastic
def theta(value=np.array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = pm.uniform_like(intercept, -10, 10)
    prob_slope = np.log(1./np.cos(slope)**2)
    return prob_intercept + prob_slope

init_x = data_x.clip(min=0, max=50)

x = pm.Uniform('x', lower=0, upper=50, value=init_x)
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan
    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
import numpy as np
import matplotlib.pyplot as plt
import pymc as mc
import scipy.stats as stats
import math

# http://blog.yhathq.com/posts/estimating-user-lifetimes-with-pymc.html

# artificial data
N = 20
true_alpha = 2
true_beta = 5
lifetime = mc.rweibull(true_alpha, true_beta, size=N)
birth = mc.runiform(0, 10, N)

# an individual is right censored if this is true
censor = (birth + lifetime) > 10
lifetime_ = np.ma.masked_array(lifetime, censor)
lifetime_.set_fill_value(10)

plt.clf()
y = np.arange(0, N)
for b, l, yy in zip(birth, lifetime, y):
    plt.plot([b, b + l], [yy, yy])
plt.plot(birth + lifetime, y, linestyle="", marker="o")
plt.draw()
plt.show(block=False)

# begin the model
# just use uniform priors
alpha = mc.Uniform("alpha", 0, 20)
def validate_covariate_model_fe(N=100, delta_true=3, pi_true=.01, beta_true=[.5, -.5, 0.], replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    X = mc.rnormal(0., 1.**-2, size=(N, len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i in range(len(beta_true)):
        model.input_data['x_%d' % i] = X[:, i]
    model.input_data['true'] = pi_true * pl.exp(Y_true)
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)

    model.beta = pandas.DataFrame(index=model.vars['p']['X'].columns)
    model.beta['true'] = 0.
    for i in range(len(beta_true)):
        model.beta['true']['x_%d' % i] = beta_true[i]

    model.beta['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['beta']]
    model.beta['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['beta']]
    add_quality_metrics(model.beta)

    print '\nbeta'
    print model.beta

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.beta['abs_err'].dropna())), model.beta.dropna()['covered?'].mean())

    add_to_results(model, 'input_data')
    add_to_results(model, 'beta')

    model.results = pandas.DataFrame(model.results)

    return model
def validate_age_pattern_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)

    age_list = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    p = pi_age_true[age_list]
    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_list
    model.input_data['age_end'] = age_list
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())
    print model.results

    return model
def propose(self):
    x_other = [X_i.value for X_i in self.others]
    max_val = pl.sqrt(1. - pl.dot(x_other, x_other))
    self.stochastic.value = mc.runiform(-max_val, max_val)
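# For context, a sketch (not from the original source) of the kind of Metropolis
# subclass this propose() presumably belongs to: one coordinate of a vector constrained
# to the unit ball is proposed uniformly within the range allowed by the other
# coordinates. The class name and constructor are illustrative assumptions.
import pylab as pl
import pymc as mc

class UnitBallMetropolis(mc.Metropolis):
    def __init__(self, stochastic, others, *args, **kwargs):
        mc.Metropolis.__init__(self, stochastic, *args, **kwargs)
        self.others = others  # the remaining coordinates, as PyMC stochastics

    # propose() as defined above replaces the default random-walk proposal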
delta_true = 50.
N = 30

# start with a simple model with N rows of data
model = data_simulation.simple_model(N)

# set covariate to 0/1 values randomly
model.input_data['x_cov'] = 1. * mc.rcategorical([.5, .5], size=N)

# record the true age-specific rates
model.ages = pl.arange(0, 101, 1)
model.pi_age_true = pi_true(model.ages)

# choose age groups randomly
age_width = pl.zeros(N)
age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N)
age_start = pl.array(age_mid - age_width/2, dtype=int)
age_end = pl.array(age_mid + age_width/2, dtype=int)

model.input_data['age_start'] = age_start
model.input_data['age_end'] = age_end

# choose effective sample size uniformly at random
n = mc.runiform(100, 10000, size=N)
model.input_data['effective_sample_size'] = n

# find true rate, with covariate
p = model.pi_age_true[age_start] * pl.exp(model.input_data['x_cov']*beta_true)

# sample observed rate values from negative binomial distribution
def test_data_model_sim():
    # generate simulated data
    n = 50
    sigma_true = .025

    # start with truth
    a = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (a * (100. - a) + 100.)

    # choose age intervals to measure
    age_start = pl.array(mc.runiform(0, 100, n), dtype=int)
    age_start.sort()  # sort to make it easy to discard the edges when testing
    age_end = pl.array(mc.runiform(age_start+1, pl.minimum(age_start+10, 100)), dtype=int)

    # find truth for the integral across the age intervals
    import scipy.integrate
    pi_interval_true = [scipy.integrate.trapz(pi_age_true[a_0i:(a_1i+1)]) / (a_1i - a_0i)
                        for a_0i, a_1i in zip(age_start, age_end)]

    # generate covariates that add explained variation
    X = mc.rnormal(0., 1.**2, size=(n, 3))
    beta_true = [-.1, .1, .2]
    Y_true = pl.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true * pl.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    # store the simulated data in a pandas DataFrame
    data = pandas.DataFrame(dict(value=p, age_start=age_start, age_end=age_end,
                                 x_0=X[:, 0], x_1=X[:, 1], x_2=X[:, 2]))
    data['effective_sample_size'] = pl.maximum(p*(1-p)/sigma_true**2, 1.)

    data['standard_error'] = pl.nan
    data['upper_ci'] = pl.nan
    data['lower_ci'] = pl.nan

    data['year_start'] = 2005.  # TODO: make these vary
    data['year_end'] = 2005.
    data['sex'] = 'total'
    data['area'] = 'all'

    # generate a moderately complicated hierarchy graph for the model
    hierarchy = nx.DiGraph()
    hierarchy.add_node('all')
    hierarchy.add_edge('all', 'super-region-1', weight=.1)
    hierarchy.add_edge('super-region-1', 'NAHI', weight=.1)
    hierarchy.add_edge('NAHI', 'CAN', weight=.1)
    hierarchy.add_edge('NAHI', 'USA', weight=.1)

    output_template = pandas.DataFrame(dict(year=[1990, 1990, 2005, 2005, 2010, 2010]*2,
                                            sex=['male', 'female']*3*2,
                                            x_0=[.5]*6*2,
                                            x_1=[0.]*6*2,
                                            x_2=[.5]*6*2,
                                            pop=[50.]*6*2,
                                            area=['CAN']*6 + ['USA']*6))

    # create model and priors
    vars = data_model.data_model('test', data, hierarchy, 'all')

    # fit model
    mc.MAP(vars).fit(method='fmin_powell', verbose=1)
    m = mc.MCMC(vars)
    m.use_step_method(mc.AdaptiveMetropolis, [m.gamma_bar, m.gamma, m.beta])
    m.sample(30000, 15000, 15)

    # check estimates
    pi_usa = data_model.predict_for(output_template, hierarchy, 'all', 'USA', 'male', 1990, vars)
    assert pl.allclose(pi_usa.mean(), (m.mu_age.trace()*pl.exp(.05)).mean(), rtol=.1)

    # check convergence
    print 'gamma mc error:', m.gamma_bar.stats()['mc error'].round(2), m.gamma.stats()['mc error'].round(2)

    # plot results
    for a_0i, a_1i, p_i in zip(age_start, age_end, p):
        pl.plot([a_0i, a_1i], [p_i, p_i], 'rs-', mew=1, mec='w', ms=4)
    pl.plot(a, pi_age_true, 'g-', linewidth=2)
    pl.plot(pl.arange(101), m.mu_age.stats()['mean'], 'k-', drawstyle='steps-post', linewidth=3)
    pl.plot(pl.arange(101), m.mu_age.stats()['95% HPD interval'], 'k', linestyle='steps-post:')
    pl.plot(pl.arange(101), pi_usa.mean(0), 'r-', linewidth=2, drawstyle='steps-post')
    pl.savefig('age_integrating_sim.png')

    # compare estimate to ground truth (skip endpoints, because they are extra hard to get right)
    assert pl.allclose(m.pi.stats()['mean'][10:-10], pi_true[10:-10], rtol=.2)
    lb, ub = m.pi.stats()['95% HPD interval'].T
    assert pl.mean((lb <= pi_true)[10:-10] & (pi_true <= ub)[10:-10]) > .75
def multipoly_sample(n, mp, test=None, verbose=0):
    """
    Returns uniformly-distributed points on the earth's surface
    conditioned to be inside a multipolygon.

    Not particularly fast.
    """
    # b = basemap.Basemap(-180,-90,180,90)
    if isinstance(mp, geometry.MultiPolygon):
        if verbose > 0:
            print 'Breaking down multipolygon'
        areas = [shapely_poly_area(p) for p in mp.geoms]
        areas = np.array(areas)
        areas /= np.sum(areas)
        # ns = pm.rmultinomial(n, areas)
        stair = np.round(np.concatenate(([0], np.cumsum(areas*n)))).astype('int')
        ns = np.diff(stair)
        locs = [multipoly_sample(ns[i], mp.geoms[i], test) for i in np.where(ns>0)[0]]
        lons = np.concatenate([loc[0] for loc in locs])
        lats = np.concatenate([loc[1] for loc in locs])
        if len(lons) != n or len(lats) != n:
            raise ValueError
        return lons, lats

    lons = np.empty(n)
    lats = np.empty(n)

    done = 0
    xmin = mp.bounds[0]*np.pi/180
    ymin = mp.bounds[1]*np.pi/180
    xmax = mp.bounds[2]*np.pi/180
    ymax = mp.bounds[3]*np.pi/180

    if verbose > 0:
        print 'Starting: n=%i'%n
    while done < n:
        x = np.atleast_1d(pm.runiform(xmin, xmax, size=n))
        y = np.atleast_1d(np.arcsin(pm.runiform(np.sin(ymin), np.sin(ymax), size=n)))
        points = [geom.Point([x[i]*180./np.pi, y[i]*180./np.pi]) for i in xrange(len(x))]
        good = list(iterops.contains(mp, points, True))
        if test:
            good = filter(lambda p: test(p.coords[0][0], p.coords[0][1]), good)
        n_good = min(n, len(good))
        lons[done:done+n_good] = [p.coords[0][0] for p in good][:n-done]
        lats[done:done+n_good] = [p.coords[0][1] for p in good][:n-done]
        done += n_good
        if verbose > 0:
            print '\tDid %i, %i remaining.'%(n_good, n-done)
    if verbose > 0:
        print 'Filled'

    if test:
        if not np.all([test(lon, lat) for lon, lat in zip(lons, lats)]):
            raise ValueError, 'Test failed at some outputs'

    if len(lons) != n or len(lats) != n:
        raise ValueError

    return lons, lats
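# Illustrative usage (not from the original source): sample 100 points inside a simple
# 10-degree-square polygon. Assumes shapely's geometry module, as used by the function
# above, is importable; coordinates are in degrees of longitude and latitude.
from shapely import geometry

box = geometry.Polygon([(0., 0.), (10., 0.), (10., 10.), (0., 10.)])
lons, lats = multipoly_sample(100, box, verbose=1)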
# <codecell>

print '$m_{\all}$ is as low as %.0f per 10,000 PY at age %d, but rises to %.0f per 10,000 PY at age %d.' % (min_mx*10000, age_min_mx, max_mx*10000, age_max_mx)

# <codecell>

mc.np.random.seed(1234567)

ages = pl.arange(101)
knots = [0, 15, 60, 100]
import scipy.interpolate
Y_true = pl.exp(scipy.interpolate.interp1d(knots, pl.log([1.2, .3, .6, 1.5]), kind='linear')(ages))

N = 50
tau = .1**-2
X = pl.array(mc.runiform(pl.arange(0., 100., 100./N), 100./N + pl.arange(0., 100., 100./N), size=N), dtype=int)
Y = mc.rnormal(Y_true[X], tau)

# <codecell>

def decorate_figure():
    pl.legend(loc='lower right', fancybox=True, shadow=True)  #, prop={'size':'x-large'})
    #pl.xticks(fontsize='x-large')
    pl.yticks([0., .5, 1., 1.5])  #, fontsize='x-large')
    pl.ylabel('$h(a)$', rotation=0)  #, fontsize='xx-large')
    pl.subplots_adjust(.1, .175, .98, .875, .275)
    pl.axis([-5, 105, 0., 1.7])

# <codecell>
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.
    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)
    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t*age_weights)
        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type == t] = p_t[data_type == t]

    # add covariate shifts
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.hierarchy = gbd_model.hierarchy

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = {}
    for t in types:
        alpha[t] = alpha_true_sim(model, area_list, sigma_true)
    print json.dumps(alpha, indent=2)

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        t = data_type[i]
        p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i+1)
        pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)
    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame()
    model.sigma = pandas.DataFrame()
    for t in types:
        alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
        alpha_t['true'] = pandas.Series(dict(alpha[t]))
        alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['type'] = t
        model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True)

        sigma_t = pandas.DataFrame(dict(true=sigma_true))
        sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']]
        sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']]
        model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True)

    data_simulation.add_quality_metrics(model.alpha)
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value, mu_pred=model.vars[t]['mu_age'].stats()['mean'], sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])), ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(), pl.median(pl.absolute(model.mu['rel_err'].dropna())), model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_age_integrating_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    #model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    #model.parameters['p']['smoothness'] = dict(amount='Very')

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())
    print model.results

    return model
import numpy as np import matplotlib.pyplot as plt import pymc as mc import scipy.stats as stats import math # http://blog.yhathq.com/posts/estimating-user-lifetimes-with-pymc.html # artificial data N = 20 true_alpha = 2 true_beta = 5 lifetime = mc.rweibull(true_alpha, true_beta, size=N) birth = mc.runiform(0, 10, N) # an individual is right censored if this is true censor = (birth + lifetime) > 10 lifetime_ = np.ma.masked_array(lifetime, censor) lifetime_.set_fill_value(10) plt.clf() y = np.arange(0, N) for b, l, yy in zip(birth, lifetime, y): plt.plot([b, b + l], [yy, yy]) plt.plot(birth + lifetime, y, linestyle="", marker="o") plt.draw() plt.show(block=False) # begin the model # just use uniform priors alpha = mc.Uniform("alpha", 0, 20)
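# The model above stops right after the prior on alpha.  Below is a hedged
# sketch of one way to finish the right-censored Weibull lifetime model in the
# spirit of the blog post linked above; the prior bounds, censoring-cutoff
# handling, and sampler settings are assumptions for illustration.
beta = mc.Uniform("beta", 0, 20)

# observed time: the event time if uncensored, otherwise the study cutoff of 10
t_obs = np.where(censor, 10., lifetime)

@mc.observed
def survival(value=t_obs, alpha=alpha, beta=beta):
    """Weibull log-likelihood with right censoring: uncensored points contribute
    the log-density, censored points contribute only the log-survival term."""
    return np.sum((1 - censor) * (np.log(alpha / beta) + (alpha - 1) * np.log(value / beta))
                  - (value / beta) ** alpha)

m = mc.MCMC([alpha, beta, survival])
m.sample(20000, burn=10000)
print(m.trace('alpha')[:].mean())
print(m.trace('beta')[:].mean())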
### @export 'initialize' df = pandas.read_csv('ssas_mx.csv', index_col=None) ages = pl.arange(101) knots = [0, 15, 60, 100] import scipy.interpolate Y_true = pl.exp( scipy.interpolate.interp1d(knots, pl.log([1.2, .3, .6, 1.5]), kind='linear')(ages)) N = 50 tau = .1**-2 X = pl.array(mc.runiform(pl.arange(0., 100., 100. / N), 100. / N + pl.arange(0., 100., 100. / N), size=N), dtype=int) Y = mc.rnormal(Y_true[X], tau) ### @export 'initial-rates' pl.figure(figsize=(17., 11), dpi=72) dismod3.graphics.plot_data_bars(df, 'talk') pl.semilogy([0], [.1], '-') pl.title( 'All-cause mortality rate\nin 1990 for females\nin sub-Saharan Africa, Southern.', size=55) pl.ylabel('Rate (Per PY)', size=48) pl.xlabel('Age (Years)', size=48)
def validate_consistent_model_sim(N=500, delta_true=.5, true=dict(i=quadratic, f=constant, r=constant)): types = pl.array(['i', 'r', 'f', 'p']) ## generate simulated data model = data_simulation.simple_model(N) model.input_data['effective_sample_size'] = 1. model.input_data['value'] = 0. for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) for t in 'irf': for i, k_i in enumerate(sim[t]['knots']): sim[t]['gamma'][i].value = pl.log(true[t](k_i)) age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)] a = pl.arange(101) age_weights = pl.ones_like(a) sum_wt = pl.cumsum(age_weights) p = pl.zeros(N) for t in types: mu_t = sim[t]['mu_age'].value sum_mu_wt = pl.cumsum(mu_t * age_weights) p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p_t[i] = mu_t[age_start[i]] # copy part into p p[data_type == t] = p_t[data_type == t] n = mc.runiform(100, 10000, size=N) model.input_data['data_type'] = data_type model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = n model.input_data['true'] = p model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n # coarse knot spacing for fast testing for t in types: model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20) ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {}) model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_convergence_diag(model.vars) graphics.plot_fit(model, model.vars, {}, {}) for i, t in enumerate('i r f p rr pf'.split()): pl.subplot(2, 3, i + 1) pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2) pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1) #graphics.plot_one_type(model, model.vars['p'], {}, 'p') #pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = 0. model.input_data['sigma_pred'] = 0. 
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(
            dict(true=sim[t]['mu_age'].value,
                 mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                 sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
            ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)

    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.mu['abs_err'].mean(),
        pl.median(pl.absolute(model.mu['rel_err'].dropna())),
        model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]): ## generate simulated data a = pl.arange(0, 100, 1) pi_age_true = pi_true * pl.ones_like(a) model = data.ModelData() model.parameters['p']['parameter_age_mesh'] = [0, 100] model.input_data = pandas.DataFrame(index=range(N)) initialize_input_data(model.input_data) Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0 delta = delta_true * pl.exp(pl.dot(Z, zeta_true)) for i in range(len(zeta_true)): model.input_data['z_%d'%i] = Z[:,i] model.input_data['true'] = pi_true model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N) n = model.input_data['effective_sample_size'] p = model.input_data['true'] model.input_data['value'] = mc.rnegative_binomial(n*p, delta*n*p) / n ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None) model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100) graphics.plot_one_ppc(model.vars['p'], 'p') graphics.plot_convergence_diag(model.vars) pl.show() model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean'] model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation'] add_quality_metrics(model.input_data) model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns) model.zeta['true'] = zeta_true model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean'] model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats()['standard deviation'] add_quality_metrics(model.zeta) print '\nzeta' print model.zeta model.delta = pandas.DataFrame(dict(true=[delta_true])) model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean() model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std() add_quality_metrics(model.delta) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.zeta['abs_err'].dropna())), model.zeta.dropna()['covered?'].mean()) model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[]) add_to_results(model, 'delta') add_to_results(model, 'input_data') add_to_results(model, 'zeta') model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split()) return model
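# Quick standalone check of the dispersion-covariate relationship used above
# (values mirror the defaults; nothing here is dismod API): the covariates act
# multiplicatively on the negative-binomial over-dispersion,
# delta_i = delta_true * exp(Z_i . zeta).
import numpy as np

zeta_true = np.array([.5, -.5, 0.])
delta_true = .15

Z_examples = np.array([[0, 0, 0],    # baseline row: delta stays at .15
                       [1, 0, 0],    # first covariate on:  delta * exp(.5)  ~ .247
                       [0, 1, 0],    # second covariate on: delta * exp(-.5) ~ .091
                       [1, 1, 1]])   # effects add in the exponent; the third has no effect

print(delta_true * np.exp(Z_examples.dot(zeta_true)))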
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'): ## generate simulated data a = pl.arange(0, 101, 1) pi_age_true = pi_true(a) import dismod3 import simplejson as json model = data.ModelData.from_gbd_jsons( json.loads(dismod3.disease_json.DiseaseJson().to_json())) gbd_hierarchy = model.hierarchy model = data_simulation.simple_model(N) model.hierarchy = gbd_hierarchy model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10) model.parameters['p']['smoothness'] = dict(amount=smoothness) model.parameters['p']['heterogeneity'] = heterogeneity age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) age_weights = pl.ones_like(a) sum_pi_wt = pl.cumsum(pi_age_true * age_weights) sum_wt = pl.cumsum(age_weights * 1.) p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p[i] = pi_age_true[age_start[i]] model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N) from validate_covariates import alpha_true_sim area_list = pl.array([ 'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR' ]) alpha = alpha_true_sim(model, area_list, sigma_true) print alpha model.input_data['true'] = pl.nan model.input_data['area'] = area_list[mc.rcategorical( pl.ones(len(area_list)) / float(len(area_list)), N)] for i, a in model.input_data['area'].iteritems(): model.input_data['true'][i] = p[i] * pl.exp( pl.sum([ alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha ])) p = model.input_data['true'] n = model.input_data['effective_sample_size'] model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None) #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100) model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_one_ppc(model.vars['p'], 'p') graphics.plot_convergence_diag(model.vars) graphics.plot_one_type(model, model.vars['p'], {}, 'p') pl.plot(range(101), pi_age_true, 'r:', label='Truth') pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean'] model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats( )['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame(dict(true=[delta_true])) model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean() model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std() data_simulation.add_quality_metrics(model.delta) model.alpha = pandas.DataFrame( index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)]) model.alpha['true'] = pandas.Series(dict(alpha)) model.alpha['mu_pred'] = pandas.Series( [n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) model.alpha['sigma_pred'] = pandas.Series( [n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) 
model.alpha = model.alpha.dropna() data_simulation.add_quality_metrics(model.alpha) model.sigma = pandas.DataFrame(dict(true=sigma_true)) model.sigma['mu_pred'] = [ n.stats()['mean'] for n in model.vars['p']['sigma_alpha'] ] model.sigma['sigma_pred'] = [ n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha'] ] data_simulation.add_quality_metrics(model.sigma) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame( dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats() ['standard deviation'])) data_simulation.add_quality_metrics(model.mu) data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.add_to_results(model, 'alpha') data_simulation.add_to_results(model, 'sigma') data_simulation.finalize_results(model) print model.results return model
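# Note on the random-effects structure in validate_ai_re above: each data row's
# true rate is the age-integrated rate multiplied by exp of the summed area
# effects along the row's path through the GBD hierarchy.  A minimal sketch of
# that scaling with a made-up three-level hierarchy (effect sizes invented for
# illustration; only the area names are reused from area_list above):
import networkx as nx
import numpy as np

hierarchy = nx.DiGraph()
hierarchy.add_edges_from([('all', 'super-region_3'),
                          ('super-region_3', 'north_africa_middle_east'),
                          ('north_africa_middle_east', 'EGY')])

alpha = {'super-region_3': .1, 'north_africa_middle_east': -.2, 'EGY': .05}

p_i = .02                                  # age-integrated rate for one row (illustrative)
path = nx.shortest_path(hierarchy, 'all', 'EGY')
effect = np.exp(sum(alpha[n] for n in path if n in alpha))
print(p_i * effect)                        # .02 * exp(.1 - .2 + .05)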