import numpy as np
import pymc as mc
import dismod3


def generate_and_append_data(data, data_type, truth, age_intervals,
                             gbd_region='Asia, Southeast', country='Thailand',
                             year=2005, sex='male'):
    """ Create simulated data and append it to the data list.

    Relies on the module-level globals `dispersion` (beta over-dispersion)
    and `n` (study sample size).
    """
    for a0, a1 in age_intervals:
        d = {'condition': 'type_2_diabetes',
             'data_type': data_type,
             'gbd_region': gbd_region,
             'region': country,
             'year_start': year,
             'year_end': year,
             'sex': sex,
             'age_start': a0,
             'age_end': a1,
             'age_weights': list(np.ones(a1 + 1 - a0)),
             'id': len(data)}

        # true rate for this age range, then beta-binomial noise around it
        p0 = dismod3.utils.rate_for_range(truth, range(a0, a1 + 1),
                                          np.ones(a1 + 1 - a0))
        p1 = mc.rbeta(p0 * dispersion, (1 - p0) * dispersion)
        p2 = mc.rbinomial(n, p1) / n

        d['value'] = p2
        d['standard_error'] = np.sqrt(p2 * (1 - p2) / n)

        data.append(d)
def simdata_postproc(sp_sub, survey_plan):
    """ Take a value for the Gaussian random field in the submodel
    sp_sub, evaluated at the survey plan locations, and return a
    simulated dataset.
    """
    p = pm.invlogit(sp_sub)
    n = survey_plan.n
    return pm.rbinomial(n, p)
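# A minimal usage sketch for simdata_postproc. The SurveyPlan stand-in and
# the field values below are illustrative assumptions, not part of the
# original model.
import numpy as np
import pymc as pm

class SurveyPlan(object):
    n = np.array([100, 250, 50])   # hypothetical sample size per location

sp_sub = np.random.normal(size=3)  # stand-in field values at the 3 locations
k = simdata_postproc(sp_sub, SurveyPlan())  # simulated positive counts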
def plot_beta_binomial_funnel(alpha, beta):
    pi_true = alpha / (alpha + beta)
    pi = mc.rbeta(alpha, beta, size=10000)

    n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
    k = mc.rbinomial(pl.array(n, dtype=int), pi)
    r = k / n

    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=2, linestyle='-', color='w', zorder=9)
    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=1, linestyle='--', color='black', zorder=10)
    pl.plot(r, n, 'ko', mew=0, alpha=.25)

    pl.semilogy(schiz['r'], schiz['n'], 'ks', mew=1, mec='white', ms=4,
                label='Observed values')

    pl.xlabel('Rate (per PY)')
    pl.ylabel('Study size (PY)')
    pl.xticks([0, .005, .01])
    pl.axis([-.0001, .0101, 50., 1500000])
    pl.title(r'$\alpha=%d$, $\beta=%d$' % (alpha, beta))
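# A minimal driver sketch for the funnel plot above. It assumes a
# module-level `schiz` dataset with rate and sample-size columns; the toy
# values and the (alpha, beta) pair here are placeholders, not the values
# used with the real schizophrenia data.
import pylab as pl
import pymc as mc

schiz = {'r': pl.array([.002, .004, .006]),      # hypothetical observed rates
         'n': pl.array([1000., 5000., 20000.])}  # hypothetical study sizes

plot_beta_binomial_funnel(alpha=4, beta=996)  # pi_true = .004
pl.show()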
def pred(alpha=alpha, beta=beta):
    return mc.rbinomial(n_pred, mc.rbeta(alpha, beta)) / float(n_pred)
def pred(pi=pi):
    return mc.rbinomial(n_pred, pi) / float(n_pred)
def pred(pi=pi):
    return mc.rbinomial(n, pi)
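# The `pred` functions above are written as PyMC2 deterministic bodies: the
# keyword defaults bind parent stochastics, and a decorator turns the
# function into a posterior-predictive node. A minimal sketch of the
# pattern, assuming a flat Beta prior and a prediction size of 1000 (both
# placeholders):
import pymc as mc

n_pred = 1000
pi = mc.Beta('pi', alpha=1., beta=1.)

@mc.deterministic
def pred(pi=pi):
    return mc.rbinomial(n_pred, pi) / float(n_pred)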
def simdata_postproc(sp_sub, survey_plan, a1, a2):
    p = pm.stukel_invlogit(sp_sub, a1, a2)
    n = survey_plan.n
    return pm.rbinomial(n, p)
def p_pred(pi=pi, n=n_nonzero):
    # the small offset keeps the binomial probability strictly positive
    return mc.rbinomial(n, pi + 1.0e-9) / (1.0 * n)
def f(sp_sub, n=n):
    return pm.rbinomial(n=n, p=pm.invlogit(sp_sub))
# Simulate a binomial dataset over a space-time field; `gencirc`, `lon`,
# `lat`, `t`, `r1`, `r2`, `c1`, `c2`, and `N` are defined earlier in the
# script.
x = np.vstack((lon, lat, t)).T
cov1 = gencirc(x, r1)
cov2 = gencirc(x, r2)
M = c1*cov1 + c2*cov2

S = pm.gp.FullRankCovariance(pm.gp.cov_funs.exponential.aniso_geo_rad,
                             amp=.5, scale=.08, inc=.5, ecc=.5).cholesky(x[:, :2])
y = pm.rmv_normal_chol(M, S.T) + np.random.normal(size=N)*.1
z = pm.flib.invlogit(y)

lo_age = np.ones(N)*2
up_age = np.ones(N)*10
n = np.random.randint(10, 500, size=N)
pos = pm.rbinomial(n, z)
neg = n - pos

data_file = np.rec.fromarrays([pos, neg, lo_age, up_age, lon, lat, t, cov1, cov2],
                              names='pos,neg,lo_age,up_age,lon,lat,t,cov1,cov2')
import numpy
import pymc
from pymc import rbinomial, Binomial, Normal, Gamma
import pylab
import scipy.stats

#numpy.random.seed(15)

Nsubj = 4
Ntrls = 100

# the data
signal_resp = rbinomial(n=Ntrls, p=0.80, size=Nsubj)
noise_resp = rbinomial(n=Ntrls, p=0.10, size=Nsubj)

# the model
prior_md = Normal('prior_md', mu=0.0, tau=0.001, value=0.0)
prior_mc = Normal('prior_mc', mu=0.0, tau=0.001, value=0.0)
prior_taud = Gamma('prior_taud', alpha=0.001, beta=0.001, value=0.01)
prior_tauc = Gamma('prior_tauc', alpha=0.001, beta=0.001, value=0.01)

# per-subject discriminability and bias
dprm = Normal('dprm', mu=prior_md, tau=prior_taud, size=Nsubj, value=[0, 0, 0, 0])
bias = Normal('bias', mu=prior_mc, tau=prior_tauc, size=Nsubj, value=[0, 0, 0, 0])

Phi = scipy.stats.norm.cdf

@pymc.deterministic
def hi(d=dprm, c=bias):
    # hit rate under the equal-variance signal-detection model
    return Phi(+0.5*d - c)

@pymc.deterministic
def fa(d=dprm, c=bias):
    # false-alarm rate, the mirror image of the hit rate
    return Phi(-0.5*d - c)
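# A minimal sketch of how the likelihood and sampler would plug into the
# signal-detection model above: Binomial likelihoods on the observed hit and
# false-alarm counts. The node names and iteration counts are assumptions,
# not part of the original script.
hits = Binomial('hits', n=Ntrls, p=hi, value=signal_resp, observed=True)
fas = Binomial('fas', n=Ntrls, p=fa, value=noise_resp, observed=True)

M = pymc.MCMC([prior_md, prior_mc, prior_taud, prior_tauc,
               dprm, bias, hi, fa, hits, fas])
M.sample(iter=20000, burn=10000, thin=10)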
import pylab as pl
import pymc as mc

import dismod3
import book_graphics
reload(book_graphics)

# set font
book_graphics.set_font()

### @export 'binomial-model-funnel'
pi_binomial_funnel = .004

n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
k = mc.rbinomial(pl.array(n.round(), dtype=int), pi_binomial_funnel)
r = k / n

pl.figure(**book_graphics.half_page_params)
pl.vlines([pi_binomial_funnel], .1*n.min(), 10*n.max(),
          linewidth=2, linestyle='-', color='w', zorder=9,
          label='_nolegend_')
pl.vlines([pi_binomial_funnel], .1*n.min(), 10*n.max(),
          linewidth=1, linestyle='--', color='k', zorder=10,
          label=r'$\pi$')
def pred(alpha=alpha, beta=beta, phi=phi):
    if pl.rand() < phi:
        return 0
    else:
        return mc.rbinomial(n_pred, mc.rbeta(alpha, beta)) / float(n_pred)
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')

    data = model.get_data('p')
    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass
    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')
    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N
    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N*mu, size=len(data.index)) / N
    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125*mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))
    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma*mu
        data['value'] = pl.exp(mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))
    else:
        raise TypeError('Unknown data type "%s"' % data_type)

    # sample prevalence data
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    data['standard_error'] = pl.sqrt(data['value']*(1-data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict
    # then have simple method to create age specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p',
        'all', 'total', 'all',
        None, None, None,
        rate_type=rate_type,
        interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']
    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
def f(sp_sub, a, b, n=n):
    p = pm.invlogit(sp_sub)
    h = pm.rbeta(a, b, size=len(sp_sub))
    p_def = g6pd.p_fem_def(p, h)
    # p_def is computed but unused; the binomial draw is taken at p
    return pm.rbinomial(n=n, p=p)
# Simulate covariate-driven binomial data and write train/predict CSVs;
# `names`, `cv`, `vals`, `on`, `t`, `lon`, `lat`, `n_data`, `n_pred`, `V`,
# and `my_st` are defined earlier in the script.
if len(names) > 2:
    for name in names[:-2]:
        cv[name] = np.random.normal(size=n_data + n_pred)*on  # np.ones(n_data)
cv['m'] = np.ones(n_data + n_pred)*on
cv['t'] = t*on

C = pm.gp.FullRankCovariance(my_st, amp=1, scale=1, inc=np.pi/4, ecc=.3,
                             st=.1, sd=.5, tlc=.2, sf=.1)
dm = np.vstack((lon, lat, t)).T
C_eval = C(dm, dm)

f = pm.rmv_normal_cov(np.sum([cv[name]*vals[name] for name in names], axis=0),
                      C_eval) + np.random.normal(size=n_data + n_pred)*np.sqrt(V)
p = pm.flib.invlogit(f)
ns = 100
pos = pm.rbinomial(ns, p)
neg = ns - pos
print p

ra_data = np.rec.fromarrays((pos[:n_data], neg[:n_data], lon[:n_data], lat[:n_data])
                            + tuple([cv[name][:n_data] for name in names]),
                            names=['pos', 'neg', 'lon', 'lat'] + names)
pl.rec2csv(ra_data, 'test_data.csv')

ra_pred = np.rec.fromarrays((pos[n_data:], neg[n_data:], lon[n_data:], lat[n_data:])
                            + tuple([cv[name][n_data:] for name in names]),
                            names=['pos', 'neg', 'lon', 'lat'] + names)
pl.rec2csv(ra_pred, 'test_pred.csv')

os.system('infer cov_test test_db test_data.csv -t 10 -n 8 -i 100000')
# os.system('cov-test-predict test test_pred.csv 1000 100')

# ra_data = pl.csv2rec('test_data.csv')
# ra_pred = pl.csv2rec('test_pred.csv')
def p_pred(pi=pi_latent, n=n_nonzero):
    return mc.rbinomial(n, pi) / (1. * n)
###############################
#  Simulate data.
###############################

# How many datapoints?
n = 250

# Put down a random scattering of data locations on the unit sphere.
X = spherical.well_spaced_mesh(n)

# Generate some binomial data. Prevalence is going to be high at the
# equator, low at the poles.
p_true = np.exp(-X[:, 2]**2*5)

# Number sampled and number positive.
N = 100
k = pm.rbinomial(N, p_true)

################################
#  Fit the model.
################################

M = pm.MCMC(make_model(N, k, X, cholmod, spherical), db='hdf5')
scalar_variables = filter(lambda x: not x.observed, [M.m, M.amp, M.kappa])
if len(scalar_variables) > 0:
    M.use_step_method(pm.AdaptiveMetropolis, scalar_variables)

# Comment to use the default AdaptiveMetropolis step method.
# GMRFMetropolis kind of scales better to high dimensions, but may mix
# worse in low.
M.use_step_method(pymc_objects.GMRFMetropolis, M.S, M.likelihood_string,
                  M.M, M.Q, M.likelihood_variables, n_sweeps=100)

M.isample(1000, 0, 10)

################################
def f(sp_sub, a, n=n):
    return pm.rbinomial(n=n, p=pm.stukel_invlogit(sp_sub, *a))
def deaths_sim(n=n, p=theta):
    """deaths_sim = rbinomial(n, p)"""
    return pm.rbinomial(n, p)
def create_test_rates(rate_function_str='(age/100.0)**2', rate_type='prevalence data',
                      age_list=None, num_subjects=1000):
    import dismod3.models as models

    if not age_list:
        #age_list = range(0, 101, 10)
        age_list = np.random.random_integers(0, 90, 20)

    params = {}
    params['disease'], flag = models.Disease.objects.get_or_create(name='Test Disease')
    params['region'], flag = models.Region.objects.get_or_create(name='World')
    params['rate_type'] = rate_type
    params['sex'] = 'total'
    params['country'] = 'Canada'
    params['epoch_start'] = 2000
    params['epoch_end'] = 2000

    rate_list = []

    # TODO: make this safe and robust
    if isinstance(rate_function_str, str):
        rate_function = eval('lambda age: %s' % rate_function_str)
    else:
        from scipy.interpolate import interp1d
        rf_vals = np.array(rate_function_str)  # it is actually an Nx2 array
        rate_function = interp1d(rf_vals[:, 0], rf_vals[:, 1], kind='cubic')

    rate_vec = np.array([rate_function(a) for a in range(101)])

    for a in age_list:
        #params['age_start'] = a - 5
        params['age_start'] = a
        #params['age_end'] = params['age_start']
        #params['age_end'] = a + 5
        params['age_end'] = np.random.random_integers(a, 100)

        params['denominator'] = num_subjects
        params['numerator'] = 0

        new_rate = models.Rate(**params)
        new_rate.params['Notes'] = 'Simulated data, created using function %s' % rate_function_str
        new_rate.params['Urbanicity'] = 'Urban' if np.random.randn() > 0 else 'Rural'
        new_rate.save()

        p = probabilistic_utils.rate_for_range(rate_vec, new_rate.age_start,
                                               new_rate.age_end, new_rate.population())

        # adjust p to make data heterogeneous according to 'Urbanicity' covariate
        if new_rate.params['Urbanicity'] == 'Urban':
            p *= 1.5

        #multiplicative_noise = 1.
        #multiplicative_noise = 1 + 0.1*np.random.randn()
        #new_rate.numerator = multiplicative_noise * new_rate.denominator * \
        #    probabilistic_utils.rate_for_range(rate_vec, new_rate.age_start,
        #                                       new_rate.age_end, new_rate.population())
        new_rate.numerator = mc.rbinomial(new_rate.denominator, p)
        new_rate.save()

        rate_list.append(new_rate)

    return rate_list, rate_function