def logit_normal_draw(cf_mean, std, N, J):
    std = pl.array(std)
    if mc.__version__ == '2.0rc2':  # version on Omak
        X = [mc.invlogit(mc.rnormal(mu=cf_mean, tau=std**-2)) for n in range(N)]
        Y = pl.array(X)
    else:
        X = mc.rnormal(mu=cf_mean, tau=std**-2, size=(N, J))
        Y = mc.invlogit(X)
    return Y
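# A minimal usage sketch for logit_normal_draw above, assuming `pl` is pylab and
# `mc` is PyMC 2; the logit-space means and standard deviations are made-up
# illustration values, not taken from the original code.
import pylab as pl
import pymc as mc

example_cf_mean = mc.logit(pl.array([.3, .6, .1]))   # J = 3 causes, in logit space
example_draws = logit_normal_draw(example_cf_mean, std=[.2, .1, .1], N=1000, J=3)
print(example_draws.shape)                           # (1000, 3)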
def main():
    x_t = pm.rnormal(0, 1, 200)
    x_t[0] = 0
    y_t = np.zeros(200)
    for i in range(1, 200):
        y_t[i] = pm.rnormal(y_t[i - 1], 1)

    plt.plot(y_t, label="$y_t$", lw=3)
    plt.plot(x_t, label="$x_t$", lw=3)
    plt.xlabel("time, $t$")
    plt.legend()
    plt.show()

    colors = ["#348ABD", "#A60628", "#7A68A6"]

    x = np.arange(1, 200)
    plt.bar(x, autocorr(y_t)[1:], width=1, label="$y_t$",
            edgecolor=colors[0], color=colors[0])
    plt.bar(x, autocorr(x_t)[1:], width=1, label="$x_t$",
            color=colors[1], edgecolor=colors[1])

    plt.legend(title="Autocorrelation")
    plt.ylabel("measured correlation \nbetween $y_t$ and $y_{t-k}$.")
    plt.xlabel("k (lag)")
    plt.title("Autocorrelation plot of $y_t$ and $x_t$ for differing $k$ lags.")
    plt.show()
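# The main() demos in this collection call an autocorr() helper that is not
# included in these snippets.  A minimal sketch of one common definition
# (normalized autocorrelation via np.correlate) -- an assumption about what the
# original helper does, not the original code:
import numpy as np

def autocorr(x):
    # normalized autocorrelation of a 1-d series for lags 0 .. len(x)-1
    x = np.asarray(x) - np.mean(x)
    acf = np.correlate(x, x, mode='full')[len(x) - 1:]
    return acf / acf[0]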
def simple_hierarchical_data(n):
    """ Generate data based on the simple one-way hierarchical model
    given in section 3.1.1::

        y[i,j] | alpha[j], sigma^2 ~ N(alpha[j], sigma^2)  i = 1, ..., n_j, j = 1, ..., J;
        alpha[j] | mu, tau^2 ~ N(mu, tau^2)  j = 1, ..., J.

        sigma^2 ~ Inv-Chi^2(5, 20)
        mu ~ N(5, 5^2)
        tau^2 ~ Inv-Chi^2(2, 10)

    Parameters
    ----------
    n : list, len(n) = J, n[j] = num observations in group j
    """
    inv_sigma_sq = mc.rgamma(alpha=2.5, beta=50.0)
    mu = mc.rnormal(mu=5.0, tau=5.0**-2.0)
    inv_tau_sq = mc.rgamma(alpha=1.0, beta=10.0)

    J = len(n)
    alpha = mc.rnormal(mu=mu, tau=inv_tau_sq, size=J)
    y = [mc.rnormal(mu=alpha[j], tau=inv_sigma_sq, size=n[j]) for j in range(J)]

    mu_by_tau = mu * pl.sqrt(inv_tau_sq)
    alpha_by_sigma = alpha * pl.sqrt(inv_sigma_sq)
    alpha_bar = alpha.sum()
    alpha_bar_by_sigma = alpha_bar * pl.sqrt(inv_sigma_sq)

    return vars()
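# A quick usage sketch for simple_hierarchical_data, assuming `mc` is PyMC 2 and
# `pl` is pylab; the group sizes are illustrative only.
hier_data = simple_hierarchical_data([10, 10, 20, 30])          # J = 4 groups
print(len(hier_data['alpha']))                                  # 4 group means
print([len(y_j) for y_j in hier_data['y']])                     # [10, 10, 20, 30]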
def test_covariate_model_sim_no_hierarchy():
    # simulate normal data
    model = data.ModelData()
    model.hierarchy, model.output_template = data_simulation.small_output()

    X = mc.rnormal(0., 1.**2, size=(128, 3))
    beta_true = [-.1, .1, .2]
    Y_true = pl.dot(X, beta_true)

    pi_true = pl.exp(Y_true)
    sigma_true = .01*pl.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model.input_data = pandas.DataFrame(dict(value=p, x_0=X[:,0], x_1=X[:,1], x_2=X[:,2]))
    model.input_data['area'] = 'all'
    model.input_data['sex'] = 'total'
    model.input_data['year_start'] = 2000
    model.input_data['year_end'] = 2000

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, {}, model,
                                                     'all', 'total', 'all'))
    vars.update(rate_model.normal_model('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)
def test_covariate_model_sim_no_hierarchy():
    # simulate normal data
    model = dismod_mr.data.ModelData()
    model.hierarchy, model.output_template = data_simulation.small_output()

    X = mc.rnormal(0., 1.**2, size=(128, 3))
    beta_true = [-.1, .1, .2]
    Y_true = np.dot(X, beta_true)

    pi_true = np.exp(Y_true)
    sigma_true = .01 * np.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model.input_data = pd.DataFrame(dict(value=p, x_0=X[:, 0], x_1=X[:, 1], x_2=X[:, 2]))
    model.input_data['area'] = 'all'
    model.input_data['sex'] = 'total'
    model.input_data['year_start'] = 2000
    model.input_data['year_end'] = 2000

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.covariates.mean_covariate_model(
        'test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(dismod_mr.model.likelihood.normal('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)
def sim_data(N, true_cf=[[.3, .6, .1], [.3, .5, .2]],
             true_std=[[.2, .05, .05], [.3, 0.1, 0.1]],
             sum_to_one=True):
    """
    Create an NxTxJ matrix of simulated data (T is determined by the length
    of true_cf, J by the length of the elements of true_cf).

    true_cf - a list of lists of true cause fractions (each must sum to one)

    true_std - a list of lists of the standard deviations corresponding to the
    true csmf's for each time point.  Can either be a list of length J inside a
    list of length 1 (in this case, the same standard deviation is used for all
    time points) or can be T lists of length J (in this case, a separate standard
    deviation is specified and used for each time point).
    """
    if sum_to_one:
        assert pl.allclose(pl.sum(true_cf, 1), 1), 'The sum of elements of true_cf must equal 1'

    T = len(true_cf)
    J = len(true_cf[0])

    ## if only one std provided, duplicate for all time points
    if len(true_std) == 1 and len(true_cf) > 1:
        true_std = [true_std[0] for i in range(len(true_cf))]

    ## transform the mean and std to logit space
    transformed_std = []
    for t in range(T):
        pi_i = pl.array(true_cf[t])
        sigma_pi_i = pl.array(true_std[t])
        transformed_std.append(((1 / (pi_i * (pi_i - 1)))**2 * sigma_pi_i**2)**0.5)

    ## find minimum standard deviation (by cause across time) and draw from this
    min_std = pl.array(transformed_std).min(0)  # renamed from `min` to avoid shadowing the builtin
    common_perturbation = [pl.ones([T, J]) * mc.rnormal(mu=0, tau=min_std**-2) for n in range(N)]

    ## draw from remaining variation
    tau = pl.array(transformed_std)**2 - min_std**2
    tau[tau == 0] = 0.000001
    additional_perturbation = [[mc.rnormal(mu=0, tau=tau[t]**-1) for t in range(T)]
                               for n in range(N)]

    result = pl.zeros([N, T, J])
    for n in range(N):
        result[n, :, :] = [mc.invlogit(mc.logit(true_cf[t]) + common_perturbation[n][t]
                                       + additional_perturbation[n][t])
                           for t in range(T)]

    return result
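# A hedged usage sketch for sim_data, using the default true cause fractions in
# its signature (assumes `pl` is pylab and `mc` is PyMC 2).
example_sim = sim_data(100)            # default: T=2 time points, J=3 causes
print(example_sim.shape)               # (100, 2, 3)
print(example_sim.sum(axis=2)[:3])     # rows are close to, but not exactly, 1,
                                       # since perturbation happens per cause in logit space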
def pred(a1=alpha1, mu_int=mu_int, tau_int=tau_int, mu_slope=mu_slope,
         tau_slope=tau_slope, tau_iq=tau_iq, values=(70, 75, 80, 85)):
    """Estimate the probability that IQ falls below each cutoff in `values`,
    for different covariate values"""
    b0 = rnormal(mu_int, tau_int, size=len(phe_pred))
    a0 = rnormal(mu_slope, tau_slope, size=len(phe_pred))
    b1 = a0 + a1 * crit_pred
    iq = rnormal(b0 + b1 * phe_pred, tau_iq)
    return [iq < v for v in values]
def propose(self):
    tau = 1. / (self.adaptive_scale_factor * self.proposal_sd)**2
    time = pymc.rnormal(self.stochastic.value.time, tau)
    # rnormal returns a float; the event count must be a non-negative integer
    n = int(round(pymc.rnormal(len(self.stochastic.value), tau)))
    if n <= 0:
        n = 0

    times = [rand.random() for _ in range(n)]
    total = float(sum(times))
    times = [item * time / total for item in times]

    events = [Event(time=item, censored=False) for item in times]
    self.stochastic.value = MultiEvent(events)
def simulated_age_intervals(data_type, n, a, pi_age_true, sigma_true):
    # choose age intervals to measure
    age_start = np.array(mc.runiform(0, 100, n), dtype=int)
    age_start.sort()  # sort to make it easy to discard the edges when testing
    age_end = np.array(mc.runiform(age_start + 1, np.minimum(age_start + 10, 100)), dtype=int)

    # find truth for the integral across the age intervals
    import scipy.integrate
    pi_interval_true = [scipy.integrate.trapz(pi_age_true[a_0i:(a_1i + 1)]) / (a_1i - a_0i)
                        for a_0i, a_1i in zip(age_start, age_end)]

    # generate covariates that add explained variation
    X = mc.rnormal(0., 1.**2, size=(n, 3))
    beta_true = [-.1, .1, .2]
    beta_true = [0, 0, 0]  # covariate effects switched off for this simulation

    Y_true = np.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true * np.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    p = np.maximum(0., mc.rnormal(pi_true, 1. / sigma_true**2.))

    # store the simulated data in a pandas DataFrame
    data = pandas.DataFrame(dict(value=p, age_start=age_start, age_end=age_end,
                                 x_0=X[:, 0], x_1=X[:, 1], x_2=X[:, 2]))
    data['effective_sample_size'] = np.maximum(p * (1 - p) / sigma_true**2, 1.)

    data['standard_error'] = np.nan
    data['upper_ci'] = np.nan
    data['lower_ci'] = np.nan

    data['year_start'] = 2005.  # TODO: make these vary
    data['year_end'] = 2005.
    data['sex'] = 'total'
    data['area'] = 'all'
    data['data_type'] = data_type

    return data
def test_fixed_effect_priors():
    model = data.ModelData()

    # set prior on sex
    parameters = dict(fixed_effects={'x_sex': dict(dist='TruncatedNormal', mu=1., sigma=.5,
                                                   lower=-10, upper=10)})

    # simulate normal data
    n = 32.
    sex_list = pl.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    beta_true = dict(male=-1., total=0., female=1.)
    pi_true = pl.exp([beta_true[s] for s in sex])

    sigma_true = .05
    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model.input_data = pandas.DataFrame(dict(value=p, sex=sex))
    model.input_data['area'] = 'all'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010  # was mistakenly set to year_start twice

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, parameters, model,
                                                     'all', 'total', 'all'))

    print vars['beta']
    assert vars['beta'][0].parents['mu'] == 1.
def main():
    """
    Demonstrate thinning of two autocorrelated inputs (representing posterior
    probabilities).  The key point is that the thinned series -- keeping every
    2nd / 3rd point -- approach zero autocorrelation more quickly.  More
    thinning is better (but expensive).
    """
    # x_t = pm.rnormal(0, 1, 200)
    # x_t[0] = 0
    y_t = np.zeros(200)
    for i in range(1, 200):
        y_t[i] = pm.rnormal(y_t[i - 1], 1)

    max_x = 200 // 3 + 1
    x = np.arange(1, max_x)
    colors = ["#348ABD", "#A60628", "#7A68A6"]

    plt.bar(x, autocorr(y_t)[1:max_x], edgecolor=colors[0],
            label="no thinning", color=colors[0], width=1)
    plt.bar(x, autocorr(y_t[::2])[1:max_x], edgecolor=colors[1],
            label="keeping every 2nd sample", color=colors[1], width=1)
    plt.bar(x, autocorr(y_t[::3])[1:max_x], width=1, edgecolor=colors[2],
            label="keeping every 3rd sample", color=colors[2])

    plt.autoscale(tight=True)
    plt.legend(title="Autocorrelation plot for $y_t$", loc="lower left")
    plt.ylabel("measured correlation \nbetween $y_t$ and $y_{t-k}$.")
    plt.xlabel("k (lag)")
    plt.title("Autocorrelation of $y_t$ (no thinning vs. thinning) "
              "at differing $k$ lags.")
    plt.show()
def test_random_effect_priors():
    model = data.ModelData()

    # set prior on the USA random effect
    parameters = dict(random_effects={'USA': dict(dist='TruncatedNormal', mu=.1, sigma=.5,
                                                  lower=-10, upper=10)})

    # simulate normal data
    n = 32.
    area_list = pl.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    pi_true = pl.exp([alpha_true[a] for a in area])

    sigma_true = .05
    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model.input_data = pandas.DataFrame(dict(value=p, area=area))
    model.input_data['sex'] = 'male'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010

    model.hierarchy.add_edge('all', 'USA')
    model.hierarchy.add_edge('all', 'CAN')

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, parameters, model,
                                                     'all', 'total', 'all'))

    print vars['alpha']
    print vars['alpha'][1].parents['mu']
    assert vars['alpha'][1].parents['mu'] == .1
def plot_funnel(pi_true, delta_str):
    delta = float(delta_str)
    n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
    p = pi_true * pl.ones_like(n)

    # old way:
    #delta = delta * p * n

    nb = rate_model.neg_binom_model('funnel', pi_true, delta, p, n)
    r = nb['p_pred'].value

    pl.vlines([pi_true], .1 * n.min(), 10 * n.max(),
              linewidth=5, linestyle='--', color='black', zorder=10)
    pl.plot(r, n, 'o', color=colors[0], ms=10, mew=0, alpha=.25)

    pl.semilogy(schiz['r'], schiz['n'], 's', mew=1, mec='white', ms=15,
                color=colors[1], label='Observed Values')

    pl.xlabel('Rate (Per 1000 PY)', size=32)
    pl.ylabel('Study Size (PY)', size=32)
    pl.axis([-.0001, .0101, 50., 15000000])
    pl.title(r'$\delta = %s$' % delta_str, size=48)
    pl.xticks([0, .005, .01], [0, 5, 10], size=30)
    pl.yticks(size=30)
def step(self):
    x0 = np.copy(self.stochastic.value)
    dx = pymc.rnormal(np.zeros(np.shape(x0)), self.proposal_tau)

    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction * np.exp(.1 * i) * dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pymc.ZeroProbability:
                self.stochastic.value = x0

    i = pymc.rcategorical(np.exp(np.array(logp) - pymc.flib.logsum(logp)))
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
        if self.verbose > 2:
            print self._id + ' rejecting'
    else:
        self.accepted += 1
        if self.verbose > 2:
            print self._id + ' accepting'
def test_fixed_effect_priors():
    model = dismod_mr.data.ModelData()

    # set prior on sex
    parameters = dict(fixed_effects={'x_sex': dict(dist='TruncatedNormal', mu=1., sigma=.5,
                                                   lower=-10, upper=10)})

    # simulate normal data
    n = 32
    sex_list = np.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    beta_true = dict(male=-1., total=0., female=1.)
    pi_true = np.exp([beta_true[s] for s in sex])

    sigma_true = .05
    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model.input_data = pd.DataFrame(dict(value=p, sex=sex))
    model.input_data['area'] = 'all'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010  # was mistakenly set to year_start twice

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.covariates.mean_covariate_model(
        'test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print(vars['beta'])
    assert vars['beta'][0].parents['mu'] == 1.
def test_random_effect_priors():
    model = dismod_mr.data.ModelData()

    # set prior on the USA random effect
    parameters = dict(random_effects={'USA': dict(dist='TruncatedNormal', mu=.1, sigma=.5,
                                                  lower=-10, upper=10)})

    # simulate normal data
    n = 32
    area_list = np.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    pi_true = np.exp([alpha_true[a] for a in area])

    sigma_true = .05
    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model.input_data = pd.DataFrame(dict(value=p, area=area))
    model.input_data['sex'] = 'male'
    model.input_data['year_start'] = 2010
    model.input_data['year_end'] = 2010

    model.hierarchy.add_edge('all', 'USA')
    model.hierarchy.add_edge('all', 'CAN')

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.covariates.mean_covariate_model(
        'test', 1, model.input_data, parameters, model, 'all', 'total', 'all'))

    print(vars['alpha'])
    print(vars['alpha'][1].parents['mu'])
def __init__(self, stochastic, proposal_sd=None, verbose=None):
    pm.Metropolis.__init__(self, stochastic, proposal_sd=proposal_sd,
                           verbose=verbose, tally=False)
    self.proposal_tau = self.proposal_sd**-2.
    self.n = 0
    self.N = 11
    self.value = pm.rnormal(self.stochastic.value, self.proposal_tau,
                            size=tuple([self.N] + list(self.stochastic.value.shape)))
def step(self):
    x0 = self.value[self.n]
    u = pm.rnormal(np.zeros(self.N), 1.)
    dx = np.dot(u, self.value)

    self.stochastic.value = x0
    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction*np.exp(.1*i)*dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pm.ZeroProbability:
                self.stochastic.value = x0

    i = pm.rcategorical(np.exp(np.array(logp) - pm.flib.logsum(logp)))
    self.value[self.n] = x_prime[i]
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
    else:
        self.accepted += 1

    self.n += 1
    if self.n == self.N:
        self.n = 0
def test_age_pattern_model_sim():
    # simulate normal data
    a = np.arange(0, 100, 5)
    pi_true = .0001 * (a * (100. - a) + 100.)
    sigma_true = .025 * np.ones_like(pi_true)

    p = np.maximum(0., mc.rnormal(pi_true, 1. / sigma_true**2.))

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.spline.spline('test', ages=np.arange(101),
                                              knots=np.arange(0, 101, 5), smoothing=.1))
    vars['pi'] = mc.Lambda('pi', lambda mu=vars['mu_age'], a=a: mu[a])
    vars.update(dismod_mr.model.likelihood.normal('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)
def step(self):
    x0 = self.value[self.n]
    u = pymc.rnormal(np.zeros(self.N), 1.)
    dx = np.dot(u, self.value)

    self.stochastic.value = x0
    logp = [self.logp_plus_loglike]
    x_prime = [x0]

    for direction in [-1, 1]:
        for i in xrange(25):
            delta = direction * np.exp(.1 * i) * dx
            try:
                self.stochastic.value = x0 + delta
                logp.append(self.logp_plus_loglike)
                x_prime.append(x0 + delta)
            except pymc.ZeroProbability:
                self.stochastic.value = x0

    i = pymc.rcategorical(np.exp(np.array(logp) - pymc.flib.logsum(logp)))
    self.value[self.n] = x_prime[i]
    self.stochastic.value = x_prime[i]

    if i == 0:
        self.rejected += 1
        if self.verbose > 2:
            print self._id + ' rejecting'
    else:
        self.accepted += 1
        if self.verbose > 2:
            print self._id + ' accepting'

    self.n += 1
    if self.n == self.N:
        self.n = 0
def test_log_normal_model_sim(N=16):
    # simulate log-normally distributed data
    pi_true = 2.
    sigma_true = .1

    n = pl.array(pl.exp(mc.rnormal(10, 1**-2, size=N)), dtype=int)
    p = pl.exp(mc.rnormal(pl.log(pi_true), 1./(sigma_true**2 + 1./n), size=N))

    # create model and priors
    vars = dict(mu_age=mc.Uniform('mu_age', 0., 1000., value=.01),
                sigma=mc.Uniform('sigma', 0., 10000., value=1000.))
    vars['mu_interval'] = mc.Lambda('mu_interval', lambda mu=vars['mu_age']: mu*pl.ones(N))
    vars.update(rate_model.log_normal_model('sim', vars['mu_interval'], vars['sigma'],
                                            p, 1./pl.sqrt(n)))

    # fit model
    m = mc.MCMC(vars)
    m.sample(1)
def make_model(n_fmesh=11, fmesh_is_obsmesh=False):
    x = np.arange(-1., 1., .1)

    # Prior parameters of C
    nu = pm.Uniform('nu', 1., 3, value=1.5)
    phi = pm.Lognormal('phi', mu=.4, tau=1, value=1)
    theta = pm.Lognormal('theta', mu=.5, tau=1, value=1)

    # The covariance dtrm C is valued as a Covariance object.
    @pm.deterministic
    def C(eval_fun=gp.matern.euclidean, diff_degree=nu, amp=phi, scale=theta):
        return gp.NearlyFullRankCovariance(eval_fun, diff_degree=diff_degree,
                                           amp=amp, scale=scale)

    # Prior parameters of M
    a = pm.Normal('a', mu=1., tau=1., value=1)
    b = pm.Normal('b', mu=.5, tau=1., value=0)
    c = pm.Normal('c', mu=2., tau=1., value=0)

    # The mean M is valued as a Mean object.
    def linfun(x, a, b, c):
        return a * x**2 + b * x + c

    @pm.deterministic
    def M(eval_fun=linfun, a=a, b=b, c=c):
        return gp.Mean(eval_fun, a=a, b=b, c=c)

    # The actual observation locations
    actual_obs_locs = np.linspace(-.8, .8, 4)

    if fmesh_is_obsmesh:
        o = actual_obs_locs
        fmesh = o
    else:
        # The unknown observation locations
        o = pm.Normal('o', actual_obs_locs, 1000., value=actual_obs_locs)
        fmesh = np.linspace(-1, 1, n_fmesh)

    # The GP submodel
    sm = gp.GPSubmodel('sm', M, C, fmesh)

    # Observation variance
    V = pm.Lognormal('V', mu=-1, tau=1, value=.0001)
    observed_values = pm.rnormal(actual_obs_locs**2, 10000)

    # The data d is just array-valued. It's normally distributed about GP.f(obs_x).
    d = pm.Normal('d', mu=sm.f(o), tau=1. / V, value=observed_values, observed=True)

    return locals()
def complex_hierarchical_data(n):
    """ Generate data based on the much more complicated model
    given in section 3.2.1::

        y_ij ~ N(mu_j - exp(beta_j)t_ij - exp(gamma_j)t_ij^2, sigma_j^2)
        gamma_j | sigma^2, xi, X_j ~ N(eta_0 + eta_1 X_j + eta_2 X_j^2, omega^2)
        beta_j | gamma_j, sigma^2, xi, X_j ~ N(delta_beta_0 + delta_beta_1 X_j +
                                               delta_beta_2 X_j^2 + delta_beta_3 gamma_j, omega_beta^2)
        mu_j | gamma_j, beta_j, sigma^2, xi, X_j ~ N(delta_mu_0 + delta_mu_1 X_j + delta_mu_2 X_j^2 +
                                                     delta_mu_3 gamma_j + delta_mu_4 beta_j, omega_mu^2)

        eta = (eta_0, eta_1, eta_2, log(omega))'
        delta_beta = (delta_beta_0, delta_beta_1, delta_beta_2, delta_beta_3, log(omega_beta))'
        delta_mu = (delta_mu_0, delta_mu_1, delta_mu_2, delta_mu_3, log(omega_mu))'
        xi = (eta, delta_beta, delta_mu)

        eta ~ MVNormal(M, C)
        delta_beta, delta_mu ~ Normal(m, s)

    Parameters
    ----------
    n : list, len(n) = J, n[j] = num observations in group j
    """
    J = len(n)

    # covariate data, not entirely specified in paper
    X = mc.rnormal(0, .1**-2, size=J)
    t = [pl.arange(n[j]) for j in range(J)]

    # hyper-priors, not specified in detail in paper
    m = 0.
    s = 1.
    M = pl.zeros(4)
    r = [[  1, .57, .18, .56],
         [.57,   1, .72, .16],
         [.18, .72,   1, .14],
         [.56, .16, .14,   1]]

    eta = mc.rmv_normal_cov(M, r)
    omega = .0001  #pl.exp(eta[-1])

    delta_beta = mc.rnormal(m, s**-2, size=5)
    omega_beta = .0001  #pl.exp(delta_beta[-1])

    delta_mu = mc.rnormal(m, s**-2, size=5)
    omega_mu = .0001  #pl.exp(delta_mu[-1])

    gamma = mc.rnormal(eta[0] + eta[1]*X + eta[2]*X**2, omega**-2.)
    beta = mc.rnormal(delta_beta[0] + delta_beta[1]*X + delta_beta[2]*X**2 + delta_beta[3]*gamma,
                      omega_beta**-2)
    mu = mc.rnormal(delta_mu[0] + delta_mu[1]*X + delta_mu[2]*X**2 + delta_mu[3]*gamma + delta_mu[4]*beta,
                    omega_mu**-2)

    # stochastic error, not specified in paper
    sigma = .01*pl.ones(J)
    y = [mc.rnormal(mu[j] - pl.exp(beta[j])*t[j] - pl.exp(gamma[j])*t[j]**2, sigma[j]**-2)
         for j in range(J)]

    # all six pairwise products of the elements of eta
    # (the original listed eta[1]*eta[2] twice and omitted eta[1]*eta[3])
    eta_cross_eta = [eta[0]*eta[1], eta[0]*eta[2], eta[0]*eta[3],
                     eta[1]*eta[2], eta[1]*eta[3], eta[2]*eta[3]]

    return vars()
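# Quick usage sketch for complex_hierarchical_data, assuming `mc` is PyMC 2 and
# `pl` is pylab; five groups of ten observations each, illustrative only.
complex_data = complex_hierarchical_data([10] * 5)
print(len(complex_data['y']))     # 5 groups
print(complex_data['eta'])        # one draw of (eta_0, eta_1, eta_2, log(omega))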
def simulate_age_group_data(N=50, delta_true=150, pi_true=true_rate_function): """ generate simulated data """ # start with a simple model with N rows of data model = data_simulation.simple_model(N) # record the true age-specific rates model.ages = pl.arange(0, 101, 1) model.pi_age_true = pi_true(model.ages) # choose age groups randomly age_width = mc.runiform(1, 100, size=N) age_mid = mc.runiform(age_width / 2, 100 - age_width / 2, size=N) age_width[:10] = 10 age_mid[:10] = pl.arange(5, 105, 10) #age_width[10:20] = 10 #age_mid[10:20] = pl.arange(5, 105, 10) age_start = pl.array(age_mid - age_width / 2, dtype=int) age_end = pl.array(age_mid + age_width / 2, dtype=int) model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end # choose effective sample size uniformly at random n = mc.runiform(100, 10000, size=N) model.input_data['effective_sample_size'] = n # integrate true age-specific rate across age groups to find true group rate model.input_data['true'] = pl.nan model.input_data['age_weights'] = '' for i in range(N): beta = mc.rnormal(0., .025**-2) # TODO: clean this up, it is computing more than is necessary age_weights = pl.exp(beta * model.ages) sum_pi_wt = pl.cumsum(model.pi_age_true * age_weights) sum_wt = pl.cumsum(age_weights) p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) model.input_data.ix[i, 'true'] = p[i] model.input_data.ix[i, 'age_weights'] = ';'.join( ['%.4f' % w for w in age_weights[age_start[i]:(age_end[i] + 1)]]) # sample observed rate values from negative binomial distribution model.input_data['value'] = mc.rnegative_binomial( n * model.input_data['true'], delta_true) / n print model.input_data.drop(['standard_error', 'upper_ci', 'lower_ci'], axis=1) return model
def simulate_age_group_data(N=50, delta_true=150, pi_true=true_rate_function): """ generate simulated data """ # start with a simple model with N rows of data model = data_simulation.simple_model(N) # record the true age-specific rates model.ages = pl.arange(0, 101, 1) model.pi_age_true = pi_true(model.ages) # choose age groups randomly age_width = mc.runiform(1, 100, size=N) age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N) age_width[:10] = 10 age_mid[:10] = pl.arange(5, 105, 10) #age_width[10:20] = 10 #age_mid[10:20] = pl.arange(5, 105, 10) age_start = pl.array(age_mid - age_width/2, dtype=int) age_end = pl.array(age_mid + age_width/2, dtype=int) model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end # choose effective sample size uniformly at random n = mc.runiform(100, 10000, size=N) model.input_data['effective_sample_size'] = n # integrate true age-specific rate across age groups to find true group rate model.input_data['true'] = pl.nan model.input_data['age_weights'] = '' for i in range(N): beta = mc.rnormal(0., .025**-2) # TODO: clean this up, it is computing more than is necessary age_weights = pl.exp(beta*model.ages) sum_pi_wt = pl.cumsum(model.pi_age_true*age_weights) sum_wt = pl.cumsum(age_weights) p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) model.input_data.ix[i, 'true'] = p[i] model.input_data.ix[i, 'age_weights'] = ';'.join(['%.4f'%w for w in age_weights[age_start[i]:(age_end[i]+1)]]) # sample observed rate values from negative binomial distribution model.input_data['value'] = mc.rnegative_binomial(n*model.input_data['true'], delta_true) / n print model.input_data.drop(['standard_error', 'upper_ci', 'lower_ci'], axis=1) return model
def alpha_true_sim(model, area_list, sigma_true):
    # choose alpha^true
    alpha = dict(all=0.)
    sum_sr = 0.
    last_sr = None
    for sr in model.hierarchy['all']:
        if sr not in area_list:
            continue

        sum_r = 0.
        last_r = None
        for r in model.hierarchy[sr]:
            if r not in area_list:
                continue

            sum_c = 0.
            last_c = None
            for c in model.hierarchy[r]:
                if c not in area_list:
                    continue

                alpha[c] = mc.rnormal(0., sigma_true[3]**-2.)
                sum_c += alpha[c]
                last_c = c
            if last_c is not None:
                alpha[last_c] -= sum_c  # enforce sum-to-zero among the children

            alpha[r] = mc.rnormal(0., sigma_true[2]**-2.)
            sum_r += alpha[r]
            last_r = r
        if last_r is not None:
            alpha[last_r] -= sum_r

        alpha[sr] = mc.rnormal(0., sigma_true[1]**-2.)
        sum_sr += alpha[sr]
        last_sr = sr
    if last_sr is not None:
        alpha[last_sr] -= sum_sr

    return alpha
def data_gen_for_rnn(samples_n=1, tau_start=75, tau_end=100, gamma=0.01, var=5):
    alpha = 1.0 / gamma
    lam = alpha
    for i in xrange(samples_n):
        con = []
        tau = pm.rdiscrete_uniform(tau_start, tau_end)
        for j in xrange(tau):
            if j == 0:
                val = round(pm.rnormal(lam, var), 2)
                con.append(val)
            elif j == 1:
                val = con[0] + pm.rnormal(0, var)
                val = round(val, 2)
                con.append(val)
            else:
                # n = len(con)
                # lam_n = float(np.array(con).sum())/n
                val = 0.7 * con[-1] + 0.3 * con[-2] + pm.rnormal(0, var)
                val = round(val, 2)
                con.append(val)
                # print val, lam_n
        yield con
def step(self):
    # We're going to do this in a way that allows easy extension
    # to multivariate beta (and even y with non-diagonal covariances,
    # for whatever that's worth).
    y = np.atleast_1d(np.squeeze(self.y_obs.value))

    if np.alen(y) == 0:
        self.stochastic.random()
        return

    X = getattr(self.X, 'value', self.X)

    # Gotta broadcast when the parameters are scalars.
    bcast_beta = np.ones_like(self.stochastic.value)
    a_beta = bcast_beta * getattr(self.a_beta, 'value', self.a_beta)
    tau_beta = bcast_beta * np.atleast_1d(getattr(self.tau_beta, 'value', self.tau_beta))
    tau_y = getattr(self.tau_y, 'value', self.tau_y)

    #
    # This is how we get the posterior mean:
    #   C^{-1} m = R^{-1} a + F V^{-1} y
    #
    rhs = np.dot(tau_beta, a_beta) + np.dot(X.T * tau_y, y)
    tau_post = np.diag(tau_beta) + np.dot(X.T * tau_y, X)
    a_post = np.linalg.solve(tau_post, rhs)
    tau_post = np.diag(tau_post)

    # TODO: These could be symbolic/Deterministic, no?
    parents_post = {'mu': a_post, 'tau': tau_post}
    self.stochastic.parents_post = parents_post

    # TODO: If self.V_inv, sample normal-gamma dist
    if self.post_a is not None and self.post_b is not None:
        parents_post['a'] = self.post_a
        parents_post['b'] = self.post_b
        res = pymc.rtruncated_normal(**parents_post)
        # pymc's truncated distribution(s) doesn't handle
        # the limit values correctly, so we have to clip
        # the values.
        self.stochastic.value = res.clip(self.post_a, self.post_b)
    else:
        self.stochastic.value = pymc.rnormal(**parents_post)
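# The identity quoted in the step() method above, C^{-1} m = R^{-1} a + F V^{-1} y,
# is the usual conjugate normal regression update.  In terms of the local
# variables it reads (tau denotes precision):
#
#     tau_post          = diag(tau_beta) + X' tau_y X
#     tau_post * a_post = diag(tau_beta) a_beta + X' tau_y y
#
# i.e. the posterior precision is the prior precision plus the data precision,
# and the posterior mean is the precision-weighted combination of the prior mean
# and the data.  Note the code then keeps only diag(tau_post) when drawing the
# new value, which is exact for scalar beta.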
def setup_and_sample(vars, step, iters=5000):
    mod = mc.MCMC(vars)
    if step == 'AdaptiveMetropolis':
        mod.use_step_method(mc.AdaptiveMetropolis, mod.X)
    elif step == 'Hit-and-Run':
        mod.use_step_method(steppers.HitAndRun, mod.X, proposal_sd=.1)
    elif step == 'H-RAM':
        #mod.use_step_method(steppers.HRAM, mod.X, proposal_sd=.01)
        mod.use_step_method(history_steps.HRAM, mod.X,
                            init_history=mc.rnormal(mod.X.value, 1., size=(20, len(mod.X.value))),
                            xprime_sds=2, xprime_n=51)
    elif step == 'Metropolis':
        mod.use_step_method(mc.Metropolis, mod.X, proposal_sd=.1)
    else:
        raise Exception('Unrecognized Step Method')

    mod.sample(iters)
    return mod
def test_neg_binom_model_sim(N=16):
    # simulate negative binomial data
    pi_true = .01
    delta_true = 50

    n = pl.array(pl.exp(mc.rnormal(10, 1**-2, size=N)), dtype=int)
    k = pl.array(mc.rnegative_binomial(n*pi_true, delta_true, size=N), dtype=float)
    p = k/n

    # create NB model and priors
    vars = dict(mu_age=mc.Uniform('mu_age', 0., 1000., value=.01),
                sigma=mc.Uniform('sigma', 0., 10000., value=1000.))
    vars['mu_interval'] = mc.Lambda('mu_interval', lambda mu=vars['mu_age']: mu*pl.ones(N))
    vars.update(rate_model.log_normal_model('sim', vars['mu_interval'], vars['sigma'],
                                            p, 1./pl.sqrt(n)))

    # fit NB model
    m = mc.MCMC(vars)
    m.sample(1)
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = np.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = np.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = np.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = np.exp([alpha_true[a] for a in area])
    sigma_true = .05 * np.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model = dismod_mr.data.ModelData()
    model.input_data = pd.DataFrame(dict(value=p, area=area, sex=sex,
                                         year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(dismod_mr.model.covariates.mean_covariate_model(
        'test', 1, model.input_data, {}, model, 'all', 'total', 'all'))
    vars.update(dismod_mr.model.likelihood.normal('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def test_age_pattern_model_sim():
    # simulate normal data
    a = pl.arange(0, 100, 5)
    pi_true = .0001 * (a * (100. - a) + 100.)
    sigma_true = .025*pl.ones_like(pi_true)

    p = pl.maximum(0., mc.rnormal(pi_true, 1./sigma_true**2.))

    # create model and priors
    vars = {}
    vars.update(age_pattern.age_pattern('test', ages=pl.arange(101),
                                        knots=pl.arange(0, 101, 5), smoothing=.1))
    vars['pi'] = mc.Lambda('pi', lambda mu=vars['mu_age'], a=a: mu[a])
    vars.update(rate_model.normal_model('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)
def plot_funnel(pi_true, delta_str):
    n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
    delta = float(delta_str) * pl.ones_like(n)
    p = pi_true * pl.ones_like(n)

    # old way:
    #delta = delta * p * n

    nb = rate_model.neg_binom('funnel', p, delta, p, n)
    r = nb['p_pred'].value

    pl.vlines([pi_true], .1 * n.min(), 10 * n.max(),
              linewidth=2, linestyle='-', color='w', zorder=9)
    pl.vlines([pi_true], .1 * n.min(), 10 * n.max(),
              linewidth=1, linestyle='--', color='black', zorder=10)
    pl.plot(r, n, 'ko', mew=0, alpha=.25)

    pl.semilogy(schiz['r'], schiz['n'], 'ks', mew=1, mec='white', ms=4,
                label='Observed values')

    pl.xlabel('Rate (per PY)')
    pl.ylabel('Study size (PY)')
    pl.xticks([0, .005, .01])
    pl.axis([-.0001, .0101, 50., 15000000])
    pl.title(r'$\delta = %s$' % delta_str)
def plot_beta_binomial_funnel(alpha, beta):
    pi_true = alpha/(alpha+beta)
    pi = mc.rbeta(alpha, beta, size=10000)

    n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
    k = mc.rbinomial(pl.array(n, dtype=int), pi)
    r = k/n

    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=2, linestyle='-', color='w', zorder=9)
    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=1, linestyle='--', color='black', zorder=10)
    pl.plot(r, n, 'ko', mew=0, alpha=.25)

    pl.semilogy(schiz['r'], schiz['n'], 'ks', mew=1, mec='white', ms=4,
                label='Observed values')

    pl.xlabel('Rate (per PY)')
    pl.ylabel('Study size (PY)')
    pl.xticks([0, .005, .01])
    pl.axis([-.0001, .0101, 50., 1500000])
    pl.title(r'$\alpha=%d$, $\beta=%d$' % (alpha, beta))
def test_covariate_model_sim_w_hierarchy():
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    # simulate normal data
    area_list = pl.array(['all', 'USA', 'CAN'])
    area = area_list[mc.rcategorical([.3, .3, .4], n)]

    sex_list = pl.array(['male', 'female', 'total'])
    sex = sex_list[mc.rcategorical([.3, .3, .4], n)]

    year = pl.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)

    pi_true = pl.exp([alpha_true[a] for a in area])
    sigma_true = .05*pl.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model = data.ModelData()
    model.input_data = pandas.DataFrame(dict(value=p, area=area, sex=sex,
                                             year_start=year, year_end=year))
    model.hierarchy, model.output_template = hierarchy, output_template

    # create model and priors
    vars = {}
    vars.update(covariate_model.mean_covariate_model('test', 1, model.input_data, {}, model,
                                                     'all', 'total', 'all'))
    vars.update(rate_model.normal_model('test', vars['pi'], 0., p, sigma_true))

    # fit model
    m = mc.MCMC(vars)
    m.sample(2)

    assert 'sex' not in vars['U']
    assert 'x_sex' in vars['X']
    assert len(vars['beta']) == 1
def normal(s):
    cur_var = 1.0
    while True:
        result = []
        for utility in s.utilities:
            while True:
                cur_result = utility + pymc.rnormal(0, 1/cur_var)
                if s.h_utilities_within_range([cur_result]):
                    break
            result.append(cur_result)
        if comparitor(s.utilities, result) >= s.threshold:
            break
        cur_var /= 2

    # DEBUG
    print 'cur_var: ' + str(cur_var)
    print 's.utilities: ' + str(s.utilities)
    print 'result: ' + str(result)
    print 'similarity: ' + str(comparitor(s.utilities, result))
    print

    return result
def plot_funnel(pi_true, sigma_str):
    n = pl.exp(mc.rnormal(10, 2**-2, size=10000))
    sigma = float(sigma_str)*pl.ones_like(n)
    p = pi_true*pl.ones_like(n)

    oln = rate_model.offset_log_normal('funnel', p, sigma, p, pl.sqrt(p*(1-p)/n))
    r = oln['p_pred'].value

    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=2, linestyle='-', color='w', zorder=9)
    pl.vlines([pi_true], .1*n.min(), 10*n.max(),
              linewidth=1, linestyle='--', color='black', zorder=10)
    pl.plot(r, n, 'ko', mew=0, alpha=.25)

    pl.semilogy(schiz['r'], schiz['n'], 'ks', mew=1, mec='white', ms=4,
                label='Observed values')

    pl.xlabel('Rate (per PY)')
    pl.ylabel('Study size (PY)')
    pl.xticks([0, .005, .01])
    pl.axis([-.0001, .0101, 50., 15000000])
    pl.title(r'$\sigma = %s$' % sigma_str)
def test_non_missing(self): """ Test to ensure that masks without any missing values are not imputed. """ fake_data = rnormal(0, 1, size=10) m = ma.masked_array(fake_data, fake_data == -999) # Priors mu = Normal('mu', mu=0, tau=0.0001) s = Uniform('s', lower=0, upper=100, value=10) tau = s**-2 # Likelihood with missing data x = Normal('x', mu=mu, tau=tau, value=m, observed=True) # Instantiate sampler M = MCMC([mu, s, tau, x]) # Run sampler M.sample(20000, 19000, progress_bar=0) # Ensure likelihood does not have a trace assert_raises(AttributeError, x.__getattribute__, 'trace')
def test_non_missing(self): """ Test to ensure that masks without any missing values are not imputed. """ fake_data = rnormal(0, 1, size=10) m = ma.masked_array(fake_data, fake_data == -999) # Priors mu = Normal('mu', mu=0, tau=0.0001) s = Uniform('s', lower=0, upper=100, value=10) tau = s ** -2 # Likelihood with missing data x = Normal('x', mu=mu, tau=tau, value=m, observed=True) # Instantiate sampler M = MCMC([mu, s, tau, x]) # Run sampler M.sample(20000, 19000, progress_bar=0) # Ensure likelihood does not have a trace assert_raises(AttributeError, x.__getattribute__, 'trace')
def iq_pred(mu=mu_iq, tau=tau_iq):
    """Simulated data for posterior predictive checks"""
    return rnormal(mu, tau, size=len(obs_indiv['iq']))
regression parameters.
"""
from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos, array
import pylab

# ------------------------------------------------------------------------------
# Synthetic values
# Replace by real data
# ------------------------------------------------------------------------------
slope = 1.5
intercept = 4
N = 30
true_x = runiform(0, 50, N)
true_y = slope * true_x + intercept
data_y = rnormal(true_y, 2)
data_x = rnormal(true_x, 2)


# ------------------------------------------------------------------------------
# Calibration of straight line parameters from data
# ------------------------------------------------------------------------------
@stochastic
def theta(value=array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = uniform_like(intercept, -10, 10)
    prob_slope = log(1. / cos(slope)**2)
    return prob_intercept + prob_slope
ages = pl.arange(101)
knots = [0, 15, 60, 100]

import scipy.interpolate
Y_true = pl.exp(scipy.interpolate.interp1d(knots, pl.log([1.2, .3, .6, 1.5]),
                                           kind='linear')(ages))

N = 50
tau = .1**-2
X = pl.array(mc.runiform(pl.arange(0., 100., 100. / N),
                         100. / N + pl.arange(0., 100., 100. / N), size=N),
             dtype=int)
Y = mc.rnormal(Y_true[X], tau)


### @export 'initial-rates'
pl.figure(figsize=(17., 11), dpi=72)
dismod3.graphics.plot_data_bars(df, 'talk')
pl.semilogy([0], [.1], '-')

pl.title('All-cause mortality rate\nin 1990 for females\nin sub-Saharan Africa, Southern.', size=55)
pl.ylabel('Rate (Per PY)', size=48)
pl.xlabel('Age (Years)', size=48)
pl.subplots_adjust(.1, .175, .98, .7)
pl.axis([-5, 105, 2.e-4, .8])
def predict_for(model, parameters, root_area, root_sex, root_year,
                area, sex, year, population_weighted, vars, lower, upper):
    """ Generate draws from posterior predicted distribution for a specific (area, sex, year)

    :Parameters:
      - `model` : data.DataModel
      - `root_area` : str, area for which this model was fit consistently
      - `root_sex` : str, sex for which this model was fit consistently
      - `root_year` : str, year for which this model was fit consistently
      - `area` : str, area to predict for
      - `sex` : str, sex to predict for
      - `year` : str, year to predict for
      - `population_weighted` : bool, should the prediction be population weighted
        if it is the aggregation of units in the area RE hierarchy?
      - `vars` : dict, including entries for alpha, beta, mu_age, U, and X
      - `lower, upper` : float, bounds on predictions from expert priors

    :Results:
      - Returns array of draws from posterior predicted distribution

    """
    area_hierarchy = model.hierarchy
    output_template = model.output_template.copy()

    # find number of samples from posterior
    len_trace = len(vars['mu_age'].trace())

    # compile array of draws from posterior distribution of alpha (random effect covariate values)
    # a row for each draw from the posterior distribution
    # a column for each random effect (e.g. countries with data, regions with countries with data, etc)
    #
    # there are several cases to handle, or at least at one time there were:
    #   vars['alpha'] is a pymc Stochastic with an array for its value (no longer used?)
    #   vars['alpha'] is a list of pymc Nodes
    #   vars['alpha'] is a list of floats
    #   vars['alpha'] is a list of some floats and some pymc Nodes
    #   'alpha' is not in vars
    #
    # when vars['alpha'][i] is a float, there is also information on the uncertainty in this
    # value, stored in vars['const_alpha_sigma'][i], which is not used when fitting the model,
    # but should be incorporated in the prediction
    if 'alpha' in vars and isinstance(vars['alpha'], mc.Node):
        assert 0, 'No longer used'
        alpha_trace = vars['alpha'].trace()
    elif 'alpha' in vars and isinstance(vars['alpha'], list):
        alpha_trace = []
        for n, sigma in zip(vars['alpha'], vars['const_alpha_sigma']):
            if isinstance(n, mc.Node):
                alpha_trace.append(n.trace())
            else:
                # uncertainty of constant alpha incorporated here
                sigma = max(sigma, 1.e-9)  # make sure sigma is non-zero
                assert not np.isnan(sigma)
                alpha_trace.append(mc.rnormal(float(n), sigma**-2, size=len_trace))
        alpha_trace = np.vstack(alpha_trace).T
    else:
        alpha_trace = np.array([])

    # compile array of draws from posterior distribution of beta (fixed effect covariate values)
    # a row for each draw from the posterior distribution
    # a column for each fixed effect
    #
    # there are several cases to handle, or at least at one time there were:
    #   vars['beta'] is a pymc Stochastic with an array for its value (no longer used?)
    #   vars['beta'] is a list of pymc Nodes
    #   vars['beta'] is a list of floats
    #   vars['beta'] is a list of some floats and some pymc Nodes
    #   'beta' is not in vars
    #
    # when vars['beta'][i] is a float, there is also information on the uncertainty in this
    # value, stored in vars['const_beta_sigma'][i], which is not used when fitting the model,
    # but should be incorporated in the prediction
    #
    # TODO: refactor to reduce duplicate code (this is very similar to code for alpha above)
    if 'beta' in vars and isinstance(vars['beta'], mc.Node):
        assert 0, 'No longer used'
        beta_trace = vars['beta'].trace()
    elif 'beta' in vars and isinstance(vars['beta'], list):
        beta_trace = []
        for n, sigma in zip(vars['beta'], vars['const_beta_sigma']):
            if isinstance(n, mc.Node):
                beta_trace.append(n.trace())
            else:
                # uncertainty of constant beta incorporated here
                sigma = max(sigma, 1.e-9)  # make sure sigma is non-zero
                assert not np.isnan(sigma)
                beta_trace.append(mc.rnormal(float(n), sigma**-2., size=len_trace))
        beta_trace = np.vstack(beta_trace).T
    else:
        beta_trace = np.array([])

    # the prediction for the requested area is produced by aggregating predictions for all of
    # the children of that area in the area_hierarchy (a networkx.DiGraph)
    leaves = [n for n in nx.traversal.bfs_tree(area_hierarchy, area)
              if area_hierarchy.successors(n) == []]
    if len(leaves) == 0:
        # networkx returns an empty list when the bfs tree is a single node
        leaves = [area]

    # initialize covariate_shift and total_population
    covariate_shift = np.zeros(len_trace)
    total_population = 0.

    # group output_template for easy access
    output_template = output_template.groupby(['area', 'sex', 'year']).mean()

    # if there are fixed effects, the effect coefficients are stored as an array in vars['X']
    # use this to put together a covariate matrix for the predictions, according to the
    # output_template covariate values
    #
    # the resulting array is covs
    if 'X' in vars:
        covs = output_template.filter(vars['X'].columns)
        if 'x_sex' in vars['X'].columns:
            covs['x_sex'] = sex_value[sex]
        assert np.all(covs.columns == vars['X_shift'].index), \
            'covariate columns and unshift index should match up'
        for x_i in vars['X_shift'].index:
            covs[x_i] -= vars['X_shift'][x_i]  # shift covariates so that the root node has X_ar,sr,yr == 0
    else:
        covs = pd.DataFrame(index=output_template.index)

    # if there are random effects, put together an indicator based on
    # their hierarchical relationships
    #
    if 'U' in vars:
        p_U = area_hierarchy.number_of_nodes()  # random effects for area
        U_l = pd.DataFrame(np.zeros((1, p_U)), columns=area_hierarchy.nodes())
        U_l = U_l.filter(vars['U'].columns)
    else:
        U_l = pd.DataFrame(index=[0])

    # loop through leaves of area_hierarchy subtree rooted at 'area',
    # make prediction for each using appropriate random
    # effects and appropriate fixed effect covariates
    #
    for l in leaves:
        log_shift_l = np.zeros(len_trace)
        U_l.ix[0, :] = 0.

        root_to_leaf = nx.shortest_path(area_hierarchy, root_area, l)
        for node in root_to_leaf[1:]:
            if node not in U_l.columns:
                ## Add a column U_l[node] = rnormal(0, appropriate_tau)
                level = len(nx.shortest_path(area_hierarchy, 'all', node)) - 1
                if 'sigma_alpha' in vars:
                    tau_l = vars['sigma_alpha'][level].trace()**-2

                U_l[node] = 0.

                # if this node was not already included in the alpha_trace array, add it
                # there are several cases for adding:
                #   if the random effect has a distribution of Constant,
                #     add it, using a sigma as well
                #   otherwise, sample from a normal with mean zero and standard deviation tau_l
                if parameters.get('random_effects', {}).get(node, {}).get('dist') == 'Constant':
                    mu = parameters['random_effects'][node]['mu']
                    sigma = parameters['random_effects'][node]['sigma']
                    sigma = max(sigma, 1.e-9)  # make sure sigma is non-zero

                    alpha_node = mc.rnormal(mu, sigma**-2, size=len_trace)
                else:
                    if 'sigma_alpha' in vars:
                        alpha_node = mc.rnormal(0., tau_l)
                    else:
                        alpha_node = np.zeros(len_trace)

                if len(alpha_trace) > 0:
                    alpha_trace = np.vstack((alpha_trace.T, alpha_node)).T
                else:
                    alpha_trace = np.atleast_2d(alpha_node).T

            # TODO: implement a more robust way to align alpha_trace and U_l
            U_l.ix[0, node] = 1.

        # 'shift' the random effects matrix to have the intended
        # level of the hierarchy as the reference value
        if 'U_shift' in vars:
            for node in vars['U_shift']:
                U_l -= vars['U_shift'][node]

        # add the random effect intercept shift (len_trace draws)
        log_shift_l += np.dot(alpha_trace, U_l.T).flatten()

        # make X_l
        if len(beta_trace) > 0:
            X_l = covs.ix[l, sex, year]
            log_shift_l += np.dot(beta_trace, X_l.T).flatten()

        if population_weighted:
            # combine in linear-space with population weights
            shift_l = np.exp(log_shift_l)
            covariate_shift += shift_l * output_template['pop'][l, sex, year]
            total_population += output_template['pop'][l, sex, year]
        else:
            # combine in log-space without weights
            covariate_shift += log_shift_l
            total_population += 1.

    if population_weighted:
        covariate_shift /= total_population
    else:
        covariate_shift = np.exp(covariate_shift / total_population)

    parameter_prediction = (vars['mu_age'].trace().T * covariate_shift).T

    # clip predictions to bounds from expert priors
    parameter_prediction = parameter_prediction.clip(lower, upper)

    return parameter_prediction
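# A hedged usage sketch for predict_for, assuming a fitted dismod_mr-style model
# `dm` whose parameters and pymc nodes for data type 'p' are stored in
# dm.parameters['p'] and dm.vars['p']; the area/sex/year arguments and the
# [0, 1] bounds are illustrative, not taken from the original code.
pred_draws = predict_for(dm, dm.parameters['p'], 'all', 'total', 'all',
                         'USA', 'male', 2005, True, dm.vars['p'], 0., 1.)
print(pred_draws.shape)   # (number of posterior draws, number of ages)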