def step(self):
    self.compute_y_logp()

    t_last = 0
    for stoch in self.state_seq:
        t_end = t_last + np.alen(stoch.value)
        time_range = xrange(t_last, t_end)

        trans_mat = stoch.parents['trans_mat']
        trans_mat = getattr(trans_mat, 'value', trans_mat)
        P = np.column_stack((trans_mat, 1. - trans_mat.sum(axis=1)))

        p0 = stoch.parents['p0']
        p0 = getattr(p0, 'value', p0)
        if p0 is None:
            p0 = compute_steady_state(trans_mat)

        p_run = p0
        # Very inefficient forward pass:
        for t in time_range:
            logp_k_t = self.logp_filtered[t]
            for k in xrange(stoch.K):
                # This is the forward step (in log scale):
                # p(S_t=k \mid y_{1:t}) \propto p(y_t \mid S_t=k) *
                #     p(S_t=k \mid y_{1:t-1})
                logp_k_t[k] = self.y_logp_vals[k, t] +\
                    pymc.categorical_like(k, p_run)

            # Here we normalize across k
            logp_k_t -= reduce(np.logaddexp, logp_k_t)

            # This computes p(S_{t+1} \mid y_{1:t})
            p_run = np.dot(np.exp(logp_k_t), P)

        np.exp(self.logp_filtered, out=self.p_filtered)

        # An inefficient backward pass:
        # Sample p(S_T \mid y_{1:T})
        new_values = np.empty_like(stoch.value, dtype=stoch.value.dtype)
        new_values[t_end-1] = pymc.rcategorical(self.p_filtered[t_end-1][:-1])
        for t in xrange(t_end-2, t_last-1, -1):
            # Now, sample p(S_t \mid S_{t+1}, y_{1:T}) via the relation
            # p(S_t=j \mid S_{t+1}=k, y_{1:T}) \propto
            #     p(S_t=j \mid S_{t+1}=k, y_{1:t}) \propto
            #     p(S_{t+1}=k \mid S_t=j, y_{1:t}) * p(S_t=j \mid y_{1:t})
            p_back = P[:, int(new_values[t + 1])] * self.p_filtered[t]
            p_back /= p_back.sum()

            new_values[t-t_last] = pymc.rcategorical(p_back[:-1])

        stoch.value = new_values
        t_last += np.alen(stoch.value)
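# Minimal standalone sketch of one forward-filtering step from the loop above,
# for K=2 states and made-up numbers: combine the emission log-likelihoods with
# the one-step-ahead state probabilities, normalize in log space, then propagate
# through the transition matrix to get p(S_{t+1} | y_{1:t}).
import numpy as np
from functools import reduce  # already built in on Python 2, needed on Python 3

y_logp_t = np.array([-1.2, -3.4])            # log p(y_t | S_t=k), hypothetical values
p_run = np.array([0.7, 0.3])                 # p(S_t=k | y_{1:t-1})
P = np.array([[0.9, 0.1],                    # full K x K transition matrix
              [0.2, 0.8]])

logp_k_t = y_logp_t + np.log(p_run)          # unnormalized log p(S_t=k | y_{1:t})
logp_k_t -= reduce(np.logaddexp, logp_k_t)   # normalize across k via log-sum-exp
p_run_next = np.dot(np.exp(logp_k_t), P)     # p(S_{t+1} | y_{1:t})
print(np.exp(logp_k_t), p_run_next)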
def states_random(Ptrans=Ptrans, N_chain=N_chain):
    P = np.column_stack((Ptrans, 1. - Ptrans.sum(axis=1)))
    Pinit = unconditionalProbability(Ptrans)

    states = np.empty(N_chain, dtype=np.uint8)
    states[0] = pymc.rcategorical(Pinit)
    for i in range(1, N_chain):
        states[i] = pymc.rcategorical(P[states[i - 1]])

    return states
def ages_and_data(N_exam, f_samp, correction_factor_array, age_lims):
    """Called by pred_samps. Simulates ages of survey participants and data given f."""

    N_samp = len(f_samp)
    N_age_samps = correction_factor_array.shape[1]

    # Get samples for the age distribution at the observation points.
    age_distribution = []
    for i in xrange(N_samp):
        l = age_lims[i]
        age_distribution.append(S_trace[np.random.randint(S_trace.shape[0]),
                                        0, l[0]:l[1] + 1])
        age_distribution[-1] /= np.sum(age_distribution[-1])

    # Draw age for each individual, draw an age-correction profile for each location,
    # compute probability of positive for each individual, see how many individuals are
    # positive.
    A = []
    pos = []
    for s in xrange(N_samp):
        A.append(np.array(pm.rcategorical(age_distribution[s], size=N_exam[s]),
                          dtype=int) + age_lims[s][0])
        P_samp = pm.invlogit(f_samp[s].ravel()) \
            * correction_factor_array[:, np.random.randint(N_age_samps)][A[-1]]
        pos.append(pm.rbernoulli(P_samp))

    return A, pos, age_distribution
def test_fixed_effect_priors():
    model = data.ModelData()

    # set prior on sex
    parameters = dict(fixed_effects={'x_sex': dict(dist='TruncatedNormal', mu=1., sigma=.5, lower=-10, upper=10)})

    # simulate normal data
    n = 32.
    sex_list = pl.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]
    beta_true = dict(male=-1., total=0., female=1.)
    pi_true = pl.exp([beta_true[s] for s in sex])
    sigma_true = .05

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model.input_data = pandas.DataFrame(dict(value=p, sex=sex))
    model.input_data['area'] = 'all'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010  # the original assigned 'year_start' twice; 'year_end' is clearly intended

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print vars['beta']
    assert vars['beta'][0].parents['mu'] == 1.
def test_covariate_model_dispersion():
    # simulate normal data
    n = 100

    model = data.ModelData()
    model.hierarchy, model.output_template = data_simulation.small_output()

    Z = mc.rcategorical([.5, .5], n)  # 0/1 covariate; the original's [.5, 5.] does not sum to one
    zeta_true = -.2

    pi_true = .1
    ess = 10000.*pl.ones(n)
    eta_true = pl.log(50)
    delta_true = 50 + pl.exp(eta_true)

    p = mc.rnegative_binomial(pi_true*ess, delta_true*pl.exp(Z*zeta_true)) / ess

    model.input_data = pandas.DataFrame(dict(value=p, z_0=Z))
    model.input_data['area'] = 'all'
    model.input_data['sex'] = 'total'
    model.input_data['year_start'] = 2000
    model.input_data['year_end'] = 2000

    # create model and priors
    vars = dict(mu=mc.Uninformative('mu_test', value=pi_true))
    vars.update(covariate_model.mean_covariate_model('test', vars['mu'], model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(covariate_model.dispersion_covariate_model('test', model.input_data, .1, 10.))
    vars.update(rate_model.neg_binom_model('test', vars['pi'], vars['delta'], p, ess))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)
def step(self):
    # The right-hand sides for the linear constraints
    self.rhs = dict(zip(self.constraint_offdiags,
                        [np.asarray(np.dot(pm.utils.value(od), self.g.value)).squeeze()
                         for od in self.constraint_offdiags]))

    for i in xrange(self.n):
        try:
            lb, ub, rhs = self.get_bounds(i)
        except ConstraintError:
            warnings.warn('Bounds could not be set, this element is very highly constrained')
            continue

        newgs = np.hstack((self.g.value[i], pm.rtruncnorm(0, 1, lb, ub, size=self.n_draws)))
        lpls = np.hstack((self.get_likelihood_only(), np.empty(self.n_draws)))
        for j, newg in enumerate(newgs[1:]):
            self.set_g_value(newg, i)
            # The newgs are drawn from the prior, taking the constraints into account, so
            # accept them based on the 'likelihood children' only.
            try:
                lpls[j+1] = self.get_likelihood_only()
            except pm.ZeroProbability:
                lpls[j+1] = -np.inf

        lpls -= pm.flib.logsum(lpls)
        newg = newgs[pm.rcategorical(np.exp(lpls))]
        self.set_g_value(newg, i)

        for od in self.constraint_offdiags:
            rhs[od] += np.asarray(pm.utils.value(od))[:, i].squeeze() * newg

        self.rhs = rhs
def test_fixed_effect_priors():
    model = dismod_mr.data.ModelData()

    # set prior on sex
    parameters = dict(
        fixed_effects={
            'x_sex': dict(dist='TruncatedNormal', mu=1., sigma=.5, lower=-10, upper=10)
        })

    # simulate normal data
    n = 32
    sex_list = np.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]
    beta_true = dict(male=-1., total=0., female=1.)
    pi_true = np.exp([beta_true[s] for s in sex])
    sigma_true = .05
    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model.input_data = pd.DataFrame(dict(value=p, sex=sex))
    model.input_data['area'] = 'all'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010  # the original assigned 'year_start' twice; 'year_end' is clearly intended

    # create model and priors
    vars = {}
    vars.update(
        dismod_mr.model.covariates.mean_covariate_model(
            'test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print(vars['beta'])
    assert vars['beta'][0].parents['mu'] == 1.
def step(self):
    x0 = np.copy(self.stochastic.value)
    dx = pymc.rnormal(np.zeros(np.shape(x0)), self.proposal_tau)

    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction * np.exp(.1 * i) * dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pymc.ZeroProbability:
                self.stochastic.value = x0

    i = pymc.rcategorical(np.exp(np.array(logp) - pymc.flib.logsum(logp)))
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
        if self.verbose > 2:
            print self._id + ' rejecting'
    else:
        self.accepted += 1
        if self.verbose > 2:
            print self._id + ' accepting'
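# Standalone sketch of the proposal-selection rule used at the end of this step
# method: candidates are kept with probability proportional to exp(logp), i.e. a
# categorical draw from softmax(logp).  The logp values below are made up for
# illustration and only NumPy is used.
import numpy as np

logp = np.array([-3.2, -1.0, -2.5, -0.7])          # log posterior at x0 and at the proposals
m = logp.max()
log_norm = m + np.log(np.sum(np.exp(logp - m)))    # log-sum-exp, like pymc.flib.logsum
probs = np.exp(logp - log_norm)                    # normalized selection probabilities
i = np.random.choice(len(probs), p=probs)          # the rcategorical draw
print(i, probs)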
def test_random_effect_priors():
    model = dismod_mr.data.ModelData()

    # set prior on sex
    parameters = dict(random_effects={
        'USA': dict(dist='TruncatedNormal', mu=.1, sigma=.5, lower=-10, upper=10)
    })

    # simulate normal data
    n = 32
    area_list = np.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]
    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    pi_true = np.exp([alpha_true[a] for a in area])
    sigma_true = .05
    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model.input_data = pd.DataFrame(dict(value=p, area=area))
    model.input_data['sex'] = 'male'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010

    model.hierarchy.add_edge('all', 'USA')
    model.hierarchy.add_edge('all', 'CAN')

    # create model and priors
    vars = {}
    vars.update(
        dismod_mr.model.covariates.mean_covariate_model(
            'test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print(vars['alpha'])
    print(vars['alpha'][1].parents['mu'])
def step(self):
    x0 = self.value[self.n]
    u = pymc.rnormal(np.zeros(self.N), 1.)
    dx = np.dot(u, self.value)

    self.stochastic.value = x0
    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction * np.exp(.1 * i) * dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pymc.ZeroProbability:
                self.stochastic.value = x0

    i = pymc.rcategorical(np.exp(np.array(logp) - pymc.flib.logsum(logp)))
    self.value[self.n] = x_prime[i]
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
        if self.verbose > 2:
            print self._id + ' rejecting'
    else:
        self.accepted += 1
        if self.verbose > 2:
            print self._id + ' accepting'

    self.n += 1
    if self.n == self.N:
        self.n = 0
def test_random_effect_priors():
    model = data.ModelData()

    # set prior on sex
    parameters = dict(random_effects={'USA': dict(dist='TruncatedNormal', mu=.1, sigma=.5, lower=-10, upper=10)})

    # simulate normal data
    n = 32.
    area_list = pl.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]
    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    pi_true = pl.exp([alpha_true[a] for a in area])
    sigma_true = .05

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model.input_data = pandas.DataFrame(dict(value=p, area=area))
    model.input_data['sex'] = 'male'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010

    model.hierarchy.add_edge('all', 'USA')
    model.hierarchy.add_edge('all', 'CAN')

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print vars['alpha']
    print vars['alpha'][1].parents['mu']
    assert vars['alpha'][1].parents['mu'] == .1
def step(self):
    x0 = self.value[self.n]
    u = pm.rnormal(np.zeros(self.N), 1.)
    dx = np.dot(u, self.value)

    self.stochastic.value = x0
    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction*np.exp(.1*i)*dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pm.ZeroProbability:
                self.stochastic.value = x0

    i = pm.rcategorical(np.exp(np.array(logp) - pm.flib.logsum(logp)))
    self.value[self.n] = x_prime[i]
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
    else:
        self.accepted += 1

    self.n += 1
    if self.n == self.N:
        self.n = 0
def test_covariate_model_dispersion():
    # simulate normal data
    n = 100

    model = dismod_mr.data.ModelData()
    model.hierarchy, model.output_template = dismod_mr.testing.data_simulation.small_output()

    Z = mc.rcategorical([.5, .5], n)  # 0/1 covariate; the original's [.5, 5.] does not sum to one
    zeta_true = -.2

    pi_true = .1
    ess = 10000.*np.ones(n)
    eta_true = np.log(50)
    delta_true = 50 + np.exp(eta_true)

    p = mc.rnegative_binomial(pi_true*ess, delta_true*np.exp(Z*zeta_true)) / ess

    model.input_data = pd.DataFrame(dict(value=p, z_0=Z))
    model.input_data['area'] = 'all'
    model.input_data['sex'] = 'total'
    model.input_data['year_start'] = 2000
    model.input_data['year_end'] = 2000

    # create model and priors
    variables = dict(mu=mc.Uninformative('mu_test', value=pi_true))
    variables.update(dismod_mr.model.covariates.mean_covariate_model('test', variables['mu'], model.input_data,
                                                                     {}, model, 'all', 'total', 'all'))
    variables.update(dismod_mr.model.covariates.dispersion_covariate_model('test', model.input_data, .1, 10.))
    variables.update(dismod_mr.model.likelihood.neg_binom('test', variables['pi'], variables['delta'], p, ess))

    # fit model
    m = mc.MCMC(variables)
    m.sample(2)
def sim_ordinal(I, J, K, alpha=None, beta=None):
    # test input params here

    Is = range(I)
    Js = range(J)
    Ks = range(K)
    N = I * J
    Ns = range(N)

    if alpha is None:
        alpha = alloc_mat(K, K)
        for k1 in Ks:
            for k2 in Ks:
                alpha[k1][k2] = max(1, (K + (0.5 if k1 == k2 else 0)
                                        - abs(k1 - k2))**4)
    if beta is None:
        beta = alloc_vec(K, 2.0)

    # simulated params
    prevalence = pymc.rdirichlet(beta).tolist()
    prevalence.append(1.0 - sum(prevalence))  # complete

    category = []
    for i in Is:
        category.append(pymc.rcategorical(prevalence).tolist())

    accuracy = alloc_tens(J, K, K)
    for j in Js:
        for k in Ks:
            accuracy[j][k] = pymc.rdirichlet(alpha[k]).tolist()
            accuracy[j][k].append(1.0 - sum(accuracy[j][k]))

    # simulated data
    item = []
    anno = []
    label = []
    for i in Is:
        for j in Js:
            item.append(i)
            anno.append(j)
            label.append(pymc.rcategorical(accuracy[j][category[i]]).tolist())
    N = len(item)

    return (prevalence, category, accuracy, item, anno, label)
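# Hypothetical usage sketch for sim_ordinal: simulate I=20 items annotated by
# J=5 annotators with K=4 ordinal labels, relying on the default alpha/beta
# built inside the function.  alloc_mat / alloc_vec / alloc_tens are assumed to
# be the list-allocation helpers from the same module.
prevalence, category, accuracy, item, anno, label = sim_ordinal(I=20, J=5, K=4)

print(len(item), len(anno), len(label))  # I * J simulated annotations
print(prevalence)                        # simulated K-simplex of label prevalences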
def step(self):
    self._index += 1
    if self._index % self.sleep_interval == 0:

        v = pm.value(self.v)
        m = pm.value(self.m)
        val = self.stochastic.value

        lp = pm.logp_of_set(self.other_children)

        # Choose a direction along which to step.
        dirvec = np.random.normal(size=self.n)
        dirvec /= np.sqrt(np.sum(dirvec**2))

        # Orthogonalize
        orthoproj = gramschmidt(dirvec)

        scaled_orthoproj = v*orthoproj.T
        pck = np.dot(dirvec, scaled_orthoproj.T)
        kck = np.linalg.inv(np.dot(scaled_orthoproj, orthoproj))
        pckkck = np.dot(pck, kck)

        # Figure out conditional variance
        condvar = np.dot(dirvec, dirvec*v) - np.dot(pck, pckkck)
        # condmean = np.dot(dirvec, m) + np.dot(pckkck, np.dot(orthoproj.T, (val-m)))

        # Compute slice of log-probability surface
        tries = np.linspace(-4*np.sqrt(condvar), 4*np.sqrt(condvar), 501)
        lps = 0*tries
        for i in xrange(len(tries)):
            new_val = val + tries[i]*dirvec
            self.stochastic.value = new_val
            try:
                lps[i] = self.f_fr.logp + self.stochastic.logp
            except:
                lps[i] = -np.inf

        if np.all(np.isinf(lps)):
            raise ValueError, 'All -inf.'

        lps -= pm.flib.logsum(lps[True - np.isinf(lps)])
        ps = np.exp(lps)

        index = pm.rcategorical(ps)
        new_val = val + tries[index]*dirvec
        self.stochastic.value = new_val

        try:
            lpp = pm.logp_of_set(self.other_children)
            if np.log(np.random.random()) < lpp - lp:
                self.accepted += 1
            else:
                self.stochastic.value = val
                self.rejected += 1
        except pm.ZeroProbability:
            self.stochastic.value = val
            self.rejected += 1

    self.logp_plus_loglike
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = np.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = np.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = np.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = np.exp([alpha_true[a] for a in area])
    sigma_true = .05 * np.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model = dismod_mr.data.ModelData()
    model.input_data = pd.DataFrame(
        dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(
        dismod_mr.model.covariates.mean_covariate_model(
            'test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(
        dismod_mr.model.likelihood.normal('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def step(self):
    direction = self.choose_direction(norm=False)

    current_value = self.get_current_value()
    x_prime = np.vstack((current_value,
                         np.outer(np.linspace(-self.xprime_sds, self.xprime_sds, self.xprime_n),
                                  direction) + current_value))

    lps = np.empty(self.xprime_n+1)
    lps[0] = self.logp_plus_loglike
    for i in xrange(self.xprime_n):
        self.set_current_value(x_prime[i+1])
        lps[i+1] = self.logp_plus_loglike

    next_value = x_prime[pm.rcategorical(np.exp(lps-pm.flib.logsum(lps)))]
    self.set_current_value(next_value)
    self.store(next_value)
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = pl.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = pl.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = pl.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = pl.exp([alpha_true[a] for a in area])
    sigma_true = .05*pl.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model = data.ModelData()
    model.input_data = pandas.DataFrame(dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(rate_model.normal_model('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def states_random(trans_mat, N_obs, p0, size=None):
    """ Samples states from an HMM.

    Parameters
    ==========
    trans_mat: ndarray
        A transition probability matrix for K-many states with shape
        (K, K-1).
    N_obs: int
        Number of observations.
    p0: ndarray
        Initial state probabilities.  If `None`, the steady state is
        computed and used.
    size: int
        Not used.

    Returns
    =======
    A ndarray of length N_obs containing sampled state
    numbers/indices/labels.
    """
    P = np.column_stack((trans_mat, 1. - trans_mat.sum(axis=1)))

    p = p0
    if p is None:
        p = compute_steady_state(trans_mat)

    states = np.empty(N_obs, dtype=np.uint8)
    states[0] = pymc.rcategorical(p)
    for i in range(1, N_obs):
        states[i] = pymc.rcategorical(P[int(states[i - 1])])

    return states
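# Hypothetical usage sketch for states_random, using the (K, K-1) convention
# described in the docstring: each row holds the first K-1 transition
# probabilities and the last column is implied.  The numbers are made up.
import numpy as np

trans_mat = np.array([[0.9],    # P(0 -> 0) = 0.9, so P(0 -> 1) = 0.1
                      [0.2]])   # P(1 -> 0) = 0.2, so P(1 -> 1) = 0.8
p0 = np.array([0.5, 0.5])       # explicit initial distribution, skipping the steady-state computation

states = states_random(trans_mat, N_obs=100, p0=p0)
print(states[:10])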
def SIR_simplex_sample(mu, tau, cutoff, sum_val, N, N_proposals=1000, N_samps=1000):
    """
    Returns raw log-weights, indices chosen and SIR samples for sets of N
    draws from a truncated lognormal distribution, conditioned so that their
    sum is equal to sum_val.

    This SIR algorithm will fail miserably unless sum_val is relatively
    likely given N and the parameters of the lognormal distribution.

    :Parameters:
      - mu : float
        The mean parameter.
      - tau : float
        The precision parameter.
      - cutoff : float
        The truncation value.
      - sum_val : float
        The sum that is being conditioned on.
      - N : integer
        The number of variates in each vector
      - N_proposals : integer
        The number of vectors to propose.
      - N_samps : integer
        The number of vectors to return.
    """
    # Draw samples, compute missing values, evaluate log-weights.
    samps = np.exp(geto_truncnorm(mu, tau, log(cutoff), (N-1, N_proposals)))
    last_vals = sum_val - np.sum(samps, axis=0)
    weights = np.array([pm.lognormal_like(last_val_now, mu, tau) for last_val_now in last_vals])

    # Check that there are at least some positive weights.
    where_pos = np.where(weights > -1e308)
    if len(where_pos[0]) == 0:
        raise RuntimeError, 'No weights are positive. You have used a shitty value for N.'

    # Normalize and exponentiate log-weights.
    weights[where(last_vals > cutoff)] = -np.Inf
    weights -= log_sum(weights)

    # Append missing values to samples.
    samps = np.vstack((samps, last_vals))

    # Slice and return.
    ind = np.array(pm.rcategorical(p=np.exp(weights), size=N_samps), dtype=int)
    return weights, ind, samps[:, ind]
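# Hypothetical call of SIR_simplex_sample: condition 5 truncated-lognormal
# settlement sizes on summing to 2000, proposing 1000 vectors and keeping 10.
# The mu/tau/cutoff/sum_val values are made up, and geto_truncnorm / log_sum
# are assumed to come from the same module.
weights, ind, samples = SIR_simplex_sample(mu=5., tau=1., cutoff=5000.,
                                           sum_val=2000., N=5,
                                           N_proposals=1000, N_samps=10)

print(samples.shape)        # (N, N_samps): each retained column sums to sum_val
print(samples.sum(axis=0))  # all (approximately) equal to 2000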
def gmm_model(data, K, mu_0=0.0, alpha_0=0.1, beta_0=0.1, alpha=1.0):
    """
    K: number of components
    n_samples: number of samples
    n_features: number of features

    mu_0: prior mean of mu_k
    alpha_0: alpha of Inverse Gamma tau_k
    beta_0: beta of Inverse Gamma tau_k
    alpha: prior of dirichlet distribution phi_0

    latent variables:
    phi_0: shape = (K-1, ), dirichlet distribution
    phi: shape = (K, ), add K-th value back to phi_0
    z: shape = (n_samples, ), Categorical distribution, z[k] is component indicator
    mu_k: shape = (K, n_features), normal distribution, mu_k[k] is mean of k-th component
    tau_k: shape = (K, n_features), inverse-gamma distribution, tau_k[k] is variance of k-th component
    """
    n_samples, n_features = data.shape

    # latent variables
    tau_k = pm.InverseGamma('tau_k', alpha_0 * np.ones((K, n_features)),
                            beta_0 * np.ones((K, n_features)),
                            value=beta_0 * np.ones((K, n_features)))
    mu_k = pm.Normal('mu_k', np.ones((K, n_features)) * mu_0, tau_k,
                     value=np.ones((K, n_features)) * mu_0)
    phi_0 = pm.Dirichlet('phi_0', theta=np.ones(K) * alpha)

    @pm.deterministic(dtype=float)
    def phi(value=np.ones(K) / K, phi_0=phi_0):
        val = np.hstack((phi_0, (1 - np.sum(phi_0))))
        return val

    z = pm.Categorical('z', p=phi,
                       value=pm.rcategorical(np.ones(K) / K, size=n_samples))

    # observed variables
    x = pm.Normal('x', mu=mu_k[z], tau=tau_k[z], value=data, observed=True)

    return pm.Model([mu_k, tau_k, phi_0, phi, z, x])
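# Toy usage sketch for gmm_model (PyMC2-style API): two well-separated 1-D
# clusters, K=2 components, and a short MCMC run just to exercise the model.
import numpy as np
import pymc as pm

np.random.seed(0)
data = np.vstack((np.random.normal(-3., 1., size=(100, 1)),
                  np.random.normal(3., 1., size=(100, 1))))

model = gmm_model(data, K=2)

mcmc = pm.MCMC(model)
mcmc.sample(iter=2000, burn=1000, thin=2)

print(mcmc.trace('mu_k')[:].mean(axis=0))  # posterior mean of the component means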
reload(dismod3)

# set font
book_graphics.set_font()

pi_true = scipy.interpolate.interp1d([0, 20, 40, 60, 100], [.4, .425, .6, .5, .4])
beta_true = .3
delta_true = 50.

N = 30

# start with a simple model with N rows of data
model = data_simulation.simple_model(N)

# set covariate to 0/1 values randomly
model.input_data['x_cov'] = 1. * mc.rcategorical([.5, .5], size=N)

# record the true age-specific rates
model.ages = pl.arange(0, 101, 1)
model.pi_age_true = pi_true(model.ages)

# choose age groups randomly
age_width = pl.zeros(N)
age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N)
age_start = pl.array(age_mid - age_width/2, dtype=int)
age_end = pl.array(age_mid + age_width/2, dtype=int)

model.input_data['age_start'] = age_start
model.input_data['age_end'] = age_end
def validate_covariate_model_re(N=500, delta_true=.15, pi_true=.01, sigma_true=[.1,.1,.1,.1,.1], ess=1000):
    ## set simulation parameters
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Slightly'  # ensure heterogeneity is slightly

    area_list = []
    for sr in sorted(model.hierarchy.successors('all')):
        area_list.append(sr)
        for r in sorted(model.hierarchy.successors(sr)):
            area_list.append(r)
            area_list += sorted(model.hierarchy.successors(r))[:5]
    area_list = pl.array(area_list)

    ## generate simulation data
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    alpha = alpha_true_sim(model, area_list, sigma_true)

    # choose observed prevalence values
    model.input_data['effective_sample_size'] = ess

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    model.input_data['true'] = pl.nan
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = pi_true * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=20000, burn=10000, thin=10, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)

    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    add_quality_metrics(model.alpha)

    print '\nalpha'
    print model.alpha.dropna()

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    add_quality_metrics(model.sigma)

    print 'sigma_alpha'
    print model.sigma

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'sigma')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.alpha['abs_err'].dropna())),
                                                           model.alpha.dropna()['covered?'].mean())
    add_to_results(model, 'input_data')
    add_to_results(model, 'alpha')

    model.results = pandas.DataFrame(model.results)

    return model
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan
    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    p = model.input_data['true']
    n = model.input_data['effective_sample_size']

    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_consistent_model_sim(N=500, delta_true=.5, true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t * age_weights)

        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type == t] = p_t[data_type == t]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i + 1)
        pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)

    #graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    #pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)

    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print model.results

    return model
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t*age_weights)

        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type==t] = p_t[data_type==t]

    # add covariate shifts
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.hierarchy = gbd_model.hierarchy

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = {}
    for t in types:
        alpha[t] = alpha_true_sim(model, area_list, sigma_true)
    print json.dumps(alpha, indent=2)

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        t = data_type[i]
        p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i+1)
        pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)
    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame()
    model.sigma = pandas.DataFrame()
    for t in types:
        alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
        alpha_t['true'] = pandas.Series(dict(alpha[t]))
        alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['type'] = t
        model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True)

        sigma_t = pandas.DataFrame(dict(true=sigma_true))
        sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']]
        sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']]
        model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True)

    data_simulation.add_quality_metrics(model.alpha)
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)

    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
def _untilt_sample(x, i, s, a):
    if isinstance(i, int):
        # make random sample here (this might be a bad idea)
        i = pm.rcategorical(np.ones(x.shape[0])/x.shape[0], size=i)
    return ba.untilt(x[i, :], s, a)
def bic_norm_hmm_init_params(y, X_matrices):
    """ Initialize a normal HMM regression mixture with a GMM mixture of a BIC
    determined number of states.  Starting with an initial set of design
    matrices, this function searches for the best number of additional
    constant states to add to the model.

    Parameters
    ==========
    y: pandas.DataFrame or pandas.Series
        Time-indexed vector of observations.
    X_matrices: list of pandas.DataFrame
        Collection of design matrices for each initial state.

    Returns
    =======
    init_params: A `NormalHMMInitialParams` object.
    """
    N_states = len(X_matrices)

    from sklearn import mixture
    lowest_bic = np.infty
    bic = []
    for n_components in range(N_states, 10):
        gmm = mixture.GMM(n_components=n_components, covariance_type="diag")
        _ = gmm.fit(y.dropna())
        bic.append(gmm.bic(y.dropna()))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

    from operator import itemgetter
    gmm_order = sorted(enumerate(best_gmm.means_), key=itemgetter(1))
    gmm_order = np.asarray(map(itemgetter(0), gmm_order))
    gmm_order_map = dict(zip(gmm_order, range(len(gmm_order))))

    gmm_states = pd.DataFrame(None, index=y.index, columns=['state'], dtype=np.int)
    gmm_raw_predicted = best_gmm.predict(y.dropna()).astype(np.int)
    gmm_states[~y.isnull().values.ravel()] = gmm_raw_predicted[:, None]

    from functools import partial
    gmm_lam = partial(lambda x: gmm_order_map.get(x, np.nan))
    states_ordered = gmm_states['state'].map(gmm_lam)

    # When best_gmm.n_components > N_states we need to map multiple
    # GMM estimated states to a single state (the last, really) in
    # the model.  Below we create the map that says which states
    # in GMM map to which states in the model.
    from itertools import izip_longest
    from collections import defaultdict
    gmm_to_state_map = dict(
        izip_longest(range(best_gmm.n_components), range(N_states),
                     fillvalue=N_states - 1))
    state_to_gmm_map = defaultdict(list)
    for i, v in zip(gmm_to_state_map.values(), gmm_to_state_map.keys()):
        state_to_gmm_map[i].append(v)

    gmm_to_state_lam = partial(lambda x: gmm_to_state_map.get(x, np.nan))
    states_initial = states_ordered.map(gmm_to_state_lam)

    trans_freqs = compute_trans_freqs(states_initial, N_states)
    alpha_trans_0 = calc_alpha_prior(states_initial, N_states,
                                     trans_freqs=trans_freqs)

    if any(y.isnull()):
        # Now, let's sample values for the missing observations.
        # TODO: Would be better if we did this according to the
        # initial transition probabilities, no?
        for t in np.arange(y.size)[y.isnull().values.ravel()]:
            if t == 0:
                p0 = compute_steady_state(trans_freqs[:, :-1])
                state = pymc.rcategorical(p0)
            else:
                state = pymc.rcategorical(trans_freqs[int(states_initial[t - 1])])
            states_initial[t] = state

    beta_prior_means = []
    beta_prior_covars = []
    obs_prior_vars = np.empty(N_states)
    for i, gmm_states in state_to_gmm_map.items():
        this_order = gmm_order[gmm_states]
        these_weights = best_gmm.weights_[this_order]
        these_weights /= these_weights.sum()
        these_means = best_gmm.means_[this_order]
        these_covars = best_gmm.covars_[this_order]

        # Use the exact mixture variance when we have two
        # states to combine; otherwise, use a crude estimate.
        # TODO: We can get an expression for len(gmm_states) > 2.
        if len(gmm_states) == 2:
            pi_, pi_n = these_weights
            sigma_1, sigma_2 = these_covars
            mu_diff = np.ediff1d(these_means)
            this_cov = pi_ * (sigma_1**2 + mu_diff**2 * pi_n**2) +\
                pi_n * (sigma_2**2 + mu_diff**2 * pi_**2)
            this_cov = float(this_cov)
        else:
            this_cov = these_covars.sum()

        # TODO: How should/could we use this?
        # this_mean = np.dot(these_weights, best_gmm.means_[this_order])

        # Get the data conditional on this [estimated] state.
        states_cond = np.asarray(
            map(lambda x: True if x in gmm_states else False, states_ordered))

        from sklearn import linear_model
        reg_model = linear_model.ElasticNetCV(fit_intercept=False)

        N_beta = X_matrices[i].shape[1]
        X_cond = X_matrices[i][states_cond]
        y_cond = y[states_cond].get_values().ravel()

        if not X_cond.empty:
            # TODO: Could ask how this compares to the intercept-only model above.
            reg_model_fit = reg_model.fit(X_cond, y_cond)
            reg_model_err = np.atleast_1d(
                np.var(reg_model_fit.predict(X_cond) - y_cond))

            beta_prior_means += [np.atleast_1d(reg_model_fit.coef_)]
            beta_prior_covars += [np.repeat(reg_model_err, N_beta)]
        else:
            beta_prior_means += [np.zeros(N_beta)]
            # TODO: A better default for an "uninformed" initial value?
            # This is really a job for a prior distribution.
            beta_prior_covars += [100. * np.ones(N_beta)]

        obs_prior_vars[i] = this_cov

    init_params = NormalHMMInitialParams(alpha_trans_0, None, states_initial,
                                         beta_prior_means, beta_prior_covars,
                                         obs_prior_vars, None)
    return init_params
def gmm_norm_hmm_init_params(y, X_matrices):
    """ Generates initial parameters for the univariate normal-emissions HMM
    with normal mean priors.

    Parameters
    ==========
    y: pandas.DataFrame or pandas.Series
        Time-indexed vector of observations.
    X_matrices: list of pandas.DataFrame
        Collection of design matrices for each hidden state's mean.

    Returns
    =======
    init_params: A `NormalHMMInitialParams` object.
    """
    # initialize with simple gaussian mixture
    from sklearn.mixture import GMM
    N_states = len(X_matrices)
    gmm_model = GMM(N_states, covariance_type='diag')
    gmm_model_fit = gmm_model.fit(y.dropna())

    from operator import itemgetter
    gmm_order = sorted(enumerate(gmm_model_fit.means_), key=itemgetter(1))
    gmm_order = map(itemgetter(0), gmm_order)
    gmm_order_map = dict(zip(gmm_order, range(len(gmm_order))))

    # gmm_ord_weights = np.asarray([gmm_model_fit.weights_[x] for x in
    #                               gmm_order])

    # TODO: attempt conditional regression when X matrices tell us
    # that we'll be fitting regression terms?
    # For now we just set those terms to zero.
    gmm_ord_means = np.asarray([
        np.append(gmm_model_fit.means_[x], [0.] * (X_matrices[i].shape[1] - 1))
        for i, x in enumerate(gmm_order)
    ])
    gmm_ord_obs_covars = np.asarray(
        [gmm_model_fit.covars_[x, 0] for x in gmm_order])

    gmm_states = pd.DataFrame(None, index=y.index, columns=['state'], dtype=np.int)
    gmm_raw_predicted = gmm_model_fit.predict(y.dropna()).astype(np.int)
    gmm_states[~y.isnull().values] = gmm_raw_predicted[:, None]

    from functools import partial
    gmm_lam = partial(lambda x: gmm_order_map.get(x, np.nan))
    gmm_ord_states = gmm_states['state'].map(gmm_lam)

    beta_prior_covars = [
        np.ones(X_matrices[i].shape[1]) * 10 for i in range(len(X_matrices))
    ]

    trans_freqs = compute_trans_freqs(gmm_ord_states, N_states)
    alpha_trans_0 = calc_alpha_prior(gmm_ord_states, N_states,
                                     trans_freqs=trans_freqs)

    if any(y.isnull()):
        # Now, let's sample values for the missing observations.
        # TODO: Would be better if we did this according to the
        # initial transition probabilities, no?
        for t in np.arange(y.size)[y.isnull().values.ravel()]:
            if t == 0:
                p0 = compute_steady_state(trans_freqs[:, :-1])
                state = pymc.rcategorical(p0)
            else:
                state = pymc.rcategorical(trans_freqs[int(gmm_ord_states[t - 1])])
            gmm_ord_states[t] = state

    init_params = NormalHMMInitialParams(alpha_trans_0, None, gmm_ord_states,
                                         gmm_ord_means, beta_prior_covars,
                                         gmm_ord_obs_covars, None)
    return init_params
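# Hypothetical usage sketch for gmm_norm_hmm_init_params: two hidden states,
# intercept-only design matrices, and one missing value so the missing-state
# sampling branch runs.  compute_trans_freqs, calc_alpha_prior and
# NormalHMMInitialParams are assumed to be provided by the same module, and the
# old (deprecated) sklearn GMM API is assumed to be installed.
import numpy as np
import pandas as pd

index = pd.date_range('2015-01-01', periods=200, freq='D')
y = pd.DataFrame(np.r_[np.random.normal(0., 1., 100),
                       np.random.normal(5., 1., 100)], index=index)
y.iloc[50] = np.nan

X_matrices = [pd.DataFrame(np.ones((len(y), 1)), index=index) for _ in range(2)]

init_params = gmm_norm_hmm_init_params(y, X_matrices)
print(init_params)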
def settlement_size_samples(mu, tau, cutoff, sum_mu, sum_tau, pop_accounted, N):
    """
    Returns N samples from the distribution of unsampled settlement sizes.
    Settlement sizes are drawn from a truncated lognormal distribution
    conditional on their sum being equal to sum_val.  At the SIR stage, 100
    samples are proposed and 10 are retained.

    :Parameters:
      - mu : float
        The mean parameter.
      - tau : float
        The precision parameter.
      - cutoff : float
        The truncation value.
      - sum_mu : float
        The mean of the lognormal distribution of total population.
      - sum_tau : float
        The precision parameter of the lognormal distribution of total population.
      - pop_accounted : integer
        The total population accounted for by the GRUMP urban extents.
      - N : integer
        The number of samples to return.
    """
    N_sum_vals = N/10

    # Compute moments and characteristic function for single population size,
    # to be used in all posterior evaluations.
    lnorms = np.exp(geto_truncnorm(mu, tau, log(cutoff), 10000))
    sum_moments = np.mean(lnorms), np.var(lnorms)
    oFT = robust_CF(mu, tau, cutoff)

    # Generate values for total population in region not accounted for by
    # GRUMP urban extents.
    sum_vals = pm.rlognormal(sum_mu, sum_tau, size=N_sum_vals) - pop_accounted
    where_not_OK = np.where(sum_vals < 0)
    while len(where_not_OK[0]) > 0:
        sum_vals[where_not_OK] = pm.rlognormal(sum_mu, sum_tau, size=len(where_not_OK[0])) - pop_accounted
        where_not_OK = np.where(sum_vals < 0)

    # Create 10 samples using SIR for each sum.
    samps = []
    for sum_val in sum_vals:
        tries = 0
        while tries < 10:
            try:
                # Find posterior of N given this sum, and draw a single sample from it.
                Nmesh, p = robust_posterior(mu, tau, cutoff, sum_val, prior_fun=None,
                                            sum_moments=sum_moments, oFT=oFT)
                N = Nmesh[int(pm.rcategorical(p))]

                # Draw 10 samples for the sizes of the constituent populations given their number and
                # the total population size.
                w, i, s = SIR_simplex_sample(mu, tau, cutoff, sum_val, N, N_proposals=1000, N_samps=10)
                break
            except RuntimeError:
                print 'Failed at N=%f, Nmesh=%s, p=%s. Trying again' % (N, Nmesh, p)
                tries += 1
        if tries == 10:
            import sys
            a, b, c = sys.exc_info()
            raise a, b, c

        samps.extend(list(s.T))

    # Return, you're done!
    return samps
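# Hypothetical call of settlement_size_samples: draw 100 unsampled-settlement-size
# vectors for a region.  All numeric values are made up; geto_truncnorm,
# robust_CF, robust_posterior and SIR_simplex_sample are assumed to come from
# the same module.
samples = settlement_size_samples(mu=5., tau=1., cutoff=5000.,
                                  sum_mu=13., sum_tau=4.,
                                  pop_accounted=200000, N=100)

print(len(samples))   # N/10 drawn totals, each contributing 10 sampled size vectors
print(samples[0])     # one vector of settlement sizes whose sum matches its drawn total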