def main():
    tau = pm.rdiscrete_uniform(0, 80)
    print(tau)

    alpha = 1. / 20.
    lambda_1, lambda_2 = pm.rexponential(alpha, 2)
    print(lambda_1, lambda_2)

    data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)]

    def plot_artificial_sms_dataset():
        tau = pm.rdiscrete_uniform(0, 80)
        alpha = 1. / 20.
        lambda_1, lambda_2 = pm.rexponential(alpha, 2)
        data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)]
        plt.bar(np.arange(80), data, color="#348ABD")
        plt.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed")
        plt.xlim(0, 80)

    plt.title("More examples of artificial datasets")
    for i in range(1, 5):
        plt.subplot(4, 1, i)
        plot_artificial_sms_dataset()
    plt.show()
def plot_artificial_sms_dataset():
    maxdays = 80
    tau = pm.rdiscrete_uniform(0, maxdays)
    alpha = 1 / 20.
    lambda_1, lambda_2 = pm.rexponential(alpha, 2)
    data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, maxdays - tau)]
    plt.bar(np.arange(maxdays), data)
    plt.bar(tau - 1, data[tau - 1], color='r', label='change point')
    plt.xlim(0, maxdays)
def D_N(n):
    """This function approximates D_n, the average variance of using n samples."""
    # lambda_, N_Y, and expected_value are assumed module-level globals
    Z = pm.rpoisson(lambda_, size=(n, N_Y))
    average_Z = Z.mean(axis=0)
    return np.sqrt(((average_Z - expected_value) ** 2).mean())
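A hypothetical driver for D_N, shown only to make the snippet self-contained; the values of lambda_, N_Y, and expected_value below are assumptions, not taken from the original script:

import numpy as np
import pymc as pm

lambda_ = 4.5             # assumed Poisson rate
N_Y = 250                 # assumed number of parallel sample sequences
expected_value = lambda_  # the mean of a Poisson(lambda_) variable

for n in [10, 100, 1000, 10000]:
    # the distance should shrink roughly like 1/sqrt(n)
    print(n, D_N(n))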
def data_gen(samples_n=10, tau_start=75, tau_end=100, gamma=0.1):
    alpha = 1.0 / gamma
    for x in range(samples_n):
        tau = pm.rdiscrete_uniform(tau_start, tau_end)
        # lam = pm.rexponential(alpha)
        lam = alpha
        yield pm.rpoisson(lam, tau)
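A short usage sketch (argument values illustrative): each item yielded by data_gen is one simulated series of Poisson counts whose length is the sampled tau:

for series in data_gen(samples_n=3):
    # each series is a NumPy array of length tau
    print(len(series), series.mean())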
def plot_artificial_sms_dataset():
    # specify when the user's behaviour (amount of sms received) switches
    # by sampling from DiscreteUniform
    tau = rdiscrete_uniform(0, 80)
    print('τ = {}'.format(tau))

    alpha = 1. / 20.
    lambda_1, lambda_2 = rexponential(alpha, 2)
    print(lambda_1, lambda_2)

    # for days before tau, represent the user's received sms count by sampling
    # from Poisson(lambda_1), and for days after tau by sampling from Poisson(lambda_2)
    data = np.r_[rpoisson(lambda_1, tau), rpoisson(lambda_2, 80 - tau)]
    print(data)

    # plot artificial data set
    pyplot.bar(np.arange(80), data, color="#348ABD")
    pyplot.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed")
    pyplot.xlabel("time (days)")
    pyplot.ylabel("count of sms received")
    pyplot.xlim(0, 80)
    pyplot.legend()
def plot_artificial_sms_dataset():
    # ----------------------------------
    # initialize both deterministic and stochastic variables
    tau = pm.rdiscrete_uniform(0, 80)
    print("tau = {0}".format(tau))

    alpha = 1. / 20.
    lambda_1, lambda_2 = pm.rexponential(alpha, 2)
    print("lambda_1 = {0}\nlambda_2 = {1}".format(lambda_1, lambda_2))

    lambda_ = np.r_[lambda_1 * np.ones(tau), lambda_2 * np.ones(80 - tau)]
    print("lambda = \n{0}".format(lambda_))

    data = pm.rpoisson(lambda_)
    print("data = \n{0}".format(data))

    # -----------------------------------
    # plot the artificial dataset
    plt.bar(np.arange(80), data, color="#348ABD")
    plt.bar(tau - 1, data[tau - 1], color="r", label="user behavior changed")
    plt.xlabel("Time (days)")
    plt.ylabel("Text messages received")
    plt.xlim(0, 80)
def main():
    sample_size = 100000
    expected_value = lambda_ = 4.5
    N_samples = range(1, sample_size, 100)

    for k in range(3):
        samples = pm.rpoisson(lambda_, size=sample_size)
        partial_average = [samples[:i].mean() for i in N_samples]
        label = "average of $n$ samples; seq. %d" % k
        plt.plot(N_samples, partial_average, lw=1.5, label=label)

    plt.plot(N_samples, expected_value * np.ones_like(partial_average),
             ls="--", label="true expected value", c="k")
    plt.ylim(4.35, 4.65)
    plt.title("Convergence of the average of \n random variables to its "
              "expected value")
    plt.ylabel("average of $n$ samples")
    plt.xlabel("# of samples, $n$")
    plt.legend()
    plt.show()
""" zip.py Zero-inflated Poisson example using simulated data. """ import numpy as np from pymc import Uniform, Beta, observed, rpoisson, poisson_like # True parameter values mu_true = 5 psi_true = 0.75 n = 100 # Simulate some data data = np.array( [rpoisson(mu_true) * (np.random.random() < psi_true) for i in range(n)]) # Uniorm prior on Poisson mean mu = Uniform('mu', 0, 20) # Beta prior on psi psi = Beta('psi', alpha=1, beta=1) @observed(dtype=int, plot=False) def zip(value=data, mu=mu, psi=psi): """ Zero-inflated Poisson likelihood """ # Initialize likeihood like = 0.0 # Loop over data
#
# Model class - analyze variables as a single unit
model = pm.Model([obs, lambda_, lambda_1, lambda_2, taus])

#
# Creating new datasets
#
maxdays = 80
tau = pm.rdiscrete_uniform(0, maxdays)
alpha = 1 / 20.
lambda_1, lambda_2 = pm.rexponential(alpha, 2)
data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, maxdays - tau)]

plt.bar(np.arange(maxdays), data)
plt.bar(tau - 1, data[tau - 1], color='r', label='change point')
plt.xlabel("Time (days)")
plt.ylabel("count")
plt.title("Artificial Data")
plt.xlim(0, maxdays)
plt.legend()
plt.show()


def plot_artificial_sms_dataset():
    maxdays = 80
    tau = pm.rdiscrete_uniform(0, maxdays)
    alpha = 1 / 20.
    lambda_1, lambda_2 = pm.rexponential(alpha, 2)
    data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, maxdays - tau)]
    plt.bar(np.arange(maxdays), data)
    plt.bar(tau - 1, data[tau - 1], color='r', label='change point')
    plt.xlim(0, maxdays)
def rbivariate_poisson(l_1, l_2, l_3):
    # clamp the rates away from zero so rpoisson gets valid parameters
    l_1 = max(l_1, eps)
    l_2 = max(l_2, eps)
    l_3 = max(l_3, eps)
    # the shared component x induces positive correlation between the counts
    x = pymc.rpoisson(l_3)
    return [pymc.rpoisson(l_1) + x, pymc.rpoisson(l_2) + x]
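This is the common-shock construction of the bivariate Poisson: X1 = Y1 + Y3 and X2 = Y2 + Y3 with independent Poisson components, so Cov(X1, X2) = l_3. A quick empirical check, with an assumed value for the module-level eps:

import numpy as np
import pymc

eps = 1e-6  # assumed; the original module defines its own eps

draws = np.array([rbivariate_poisson(2.0, 3.0, 1.5) for _ in range(50000)])
print(np.cov(draws[:, 0], draws[:, 1]))  # off-diagonal entries should be near 1.5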
def p_pred(pi=pi, n=n_nonzero):
    # clip the rate away from zero so rpoisson never sees an invalid parameter
    return mc.rpoisson((pi * n).clip(1.0e-9, pl.inf)) / (1.0 * n)
import numpy as np
import pymc as pm
from matplotlib import pyplot as plt

alpha = 1. / 20.
lambda_ = pm.rexponential(alpha)
print(lambda_)

data = np.r_[pm.rpoisson(lambda_, 80)]
np.savetxt("txtdata_sim.csv", data)

plt.bar(np.arange(80), data, color="#348ABD")
plt.xlabel("Time (days)")
plt.ylabel("count of text-msgs received")
plt.title("Artificial dataset")
plt.xlim(0, 80)
plt.show()
####################################################
#### Include observations in the model ####
figsize = (12.5, 4)
plt.figure(figsize=figsize)
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.dpi'] = 300

samples = [ld1.random() for i in range(20000)]
plt.hist(samples, bins=70, normed=True, histtype="stepfilled")
plt.xlim(0, 8)
plt.show()

# fixed value
data = np.array([10, 25, 15, 20, 35])
obs = pm.Poisson("obs", lambda_, value=data, observed=True)
obs.value

##################
##### Modeling #####
tau = pm.rdiscrete_uniform(0, 80)
alpha = 1. / 20.
lambda_1, lambda_2 = pm.rexponential(alpha, 2)
lambda_ = np.r_[lambda_1 * np.ones(tau), lambda_2 * np.ones(80 - tau)]
data = pm.rpoisson(lambda_)

plt.bar(np.arange(80), data, color="#348ABD")
plt.bar(tau - 1, data[tau - 1], color='r', label='behaviour changed')
plt.xlabel("time")
plt.ylabel("message")
plt.xlim(0, 80)
plt.legend()
def pred(pi=pi):
    return mc.rpoisson(pi * n_pred) / float(n_pred)
import numpy as np
import pymc as pm
from matplotlib import pyplot as plt

tau = pm.rdiscrete_uniform(0, 80)
print(tau)

alpha = 1. / 20.
lambda_1, lambda_2 = pm.rexponential(alpha, 2)
print(lambda_1, lambda_2)

data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)]

plt.bar(np.arange(80), data, color="#348ABD")
plt.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed")
plt.xlabel("Time (days)")
plt.ylabel("count of text-msgs received")
plt.title("Artificial dataset")
plt.xlim(0, 80)
plt.legend()
plt.show()
def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    # load data
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')
    data = model.get_data('p')
    #data = data.ix[:20, :]

    # replace data with synthetic data if requested
    if data_type == 'epilepsy':
        # no replacement needed
        pass
    elif data_type == 'schiz':
        import pandas as pd
        data = pd.read_csv('/homes/abie/gbd_dev/gbd/tests/schiz.csv')
    elif data_type == 'binom':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N
    elif data_type == 'poisson':
        N = 1.e6
        data['effective_sample_size'] = N
        mu = data['value'].mean()
        data['value'] = mc.rpoisson(N * mu, size=len(data.index)) / N
    elif data_type == 'normal':
        mu = data['value'].mean()
        sigma = .125 * mu
        data['standard_error'] = sigma
        data['value'] = mc.rnormal(mu, sigma**-2, size=len(data.index))
    elif data_type == 'log_normal':
        mu = data['value'].mean()
        sigma = .25
        data['standard_error'] = sigma * mu
        data['value'] = pl.exp(
            mc.rnormal(pl.log(mu), sigma**-2, size=len(data.index)))
    else:
        raise TypeError('Unknown data type "%s"' % data_type)

    # sample prevalence data
    i_test = mc.rbernoulli(.25, size=len(data.index))
    i_nan = pl.isnan(data['effective_sample_size'])

    data['lower_ci'] = pl.nan
    data['upper_ci'] = pl.nan
    data.ix[i_nan, 'effective_sample_size'] = 0.
    data['standard_error'] = pl.sqrt(
        data['value'] * (1 - data['value'])) / data['effective_sample_size']
    data.ix[pl.isnan(data['standard_error']), 'standard_error'] = pl.inf

    data['standard_error'][i_test] = pl.inf
    data['effective_sample_size'][i_test] = 0.

    data['value'] = pl.maximum(data['value'], 1.e-12)

    model.input_data = data

    # create model
    # TODO: set parameters in model.parameters['p'] dict,
    # then have a simple method to create an age-specific rate model
    #model.parameters['p'] = ...
    #model.vars += dismod3.ism.age_specific_rate(model, 'p')

    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Very'
    model.vars['p'] = dismod3.data_model.data_model(
        'p', model, 'p', 'all', 'total', 'all', None, None, None,
        rate_type=rate_type, interpolation_method='zero',
        include_covariates=False)

    # add upper bound on sigma in log normal model to help convergence
    #if rate_type == 'log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = 1.5

    # add upper bound on sigma, zeta in offset log normal
    #if rate_type == 'offset_log_normal':
    #    model.vars['p']['sigma'].parents['upper'] = .1
    #    model.vars['p']['p_zeta'].value = 5.e-9
    #    model.vars['p']['p_zeta'].parents['upper'] = 1.e-8

    # fit model
    dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
    #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)

    # compare estimate to hold-out
    data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    data['lb_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 0]
    data['ub_pred'] = model.vars['p']['p_pred'].stats()['95% HPD interval'][:, 1]

    import data_simulation
    model.test = data[i_test]
    data = model.test
    data['true'] = data['value']

    data_simulation.add_quality_metrics(data)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'test')
    data_simulation.finalize_results(model)

    return model
plt.title("Prior distribution for $\lambda_1$") plt.xlim(0, 8); # Take the case of the sms data in the previous chapter, knowing what we do # about parent and child variables and taking an omniscient view on the data # and determining a modeling procedure we can work backwards to create the # data mimicing the expected creation of the data. i.e. tau = pm.rdiscrete_uniform(0, 80) print( tau ) alpha = 1. / 20. lambda_1, lambda_2 = pm.rexponential(alpha, 2) print( lambda_1, lambda_2) data = np.r_[pm.rpoisson(lambda_1, tau), pm.rpoisson(lambda_2, 80 - tau)] # Plot the distribution plt.bar(np.arange(80), data, color="#348ABD") plt.bar(tau - 1, data[tau - 1], color="r", label="user behaviour changed") plt.xlabel("Time (days)") plt.ylabel("count of text-msgs received") plt.title("Artificial dataset") plt.xlim(0, 80) plt.legend(); # This becomes important, I assume when we start checking to see if our # inference was indeed correct. If we were to create a function then this # would be the case:
def disasters_sim(early_mean=early_mean,
                  late_mean=late_mean,
                  switchpoint=switchpoint):
    """Coal mining disasters sampled from the posterior predictive distribution"""
    return concatenate((pm.rpoisson(early_mean, size=switchpoint),
                        pm.rpoisson(late_mean, size=n - switchpoint)))
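A hedged, self-contained sketch of using this simulator: with illustrative fixed values in place of the model's stochastics (n, early_mean, late_mean, and switchpoint are assumptions here, not from the source), each call yields one replicated disaster series of length n:

import numpy as np
import pymc as pm

# illustrative fixed values; in the full model these come from MCMC samples
n = 111
early_mean, late_mean, switchpoint = 3.0, 1.0, 40

def disasters_sim(early_mean=early_mean, late_mean=late_mean,
                  switchpoint=switchpoint):
    """One replicated coal-mining disaster series."""
    return np.concatenate((pm.rpoisson(early_mean, size=switchpoint),
                           pm.rpoisson(late_mean, size=n - switchpoint)))

replicate = disasters_sim()
# segment means should land near 3.0 and 1.0 respectively
print(replicate.shape, replicate[:switchpoint].mean(), replicate[switchpoint:].mean())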