Exemple #1
0
    def test_pred_samps(self):
        """A dry run in Kenya with only one sample point.

        Draws one random location, assembles the prediction mesh and the
        per-point dictionaries, and calls ``mbgw.EP.pred_samps`` in debug
        mode. This test should not work with N>1.
        """
        N = 1

        # Latitude is drawn before longitude; the order is kept so the random
        # stream is consumed exactly as before.
        lat_pred = np.atleast_1d(pm.runiform(-5., 5., size=N) * deg_to_rad)
        lon_pred = np.atleast_1d(pm.runiform(33., 40., size=N) * deg_to_rad)
        year_offsets = array([2007] * N) - 2009
        t_pred = np.atleast_1d(year_offsets)

        n_pts = len(lat_pred)

        # One row per prediction point: (lon, lat, t).
        pred_mesh = vstack((lon_pred, lat_pred, t_pred)).T
        age_lims = [(lo_age, up_age) for _ in range(len(lon_pred))]
        N_exam = 1000 * ones(n_pts)

        # Per-point descriptions; built but not passed on in this test.
        input_pts = [
            dict(lon=lon_pred[k], lat=lat_pred[k], month=1, year=2009,
                 lo_age=2, up_age=10, n=N_exam[k])
            for k in range(n_pts)
        ]
        output_pts = [
            dict(lon=lon_pred[k], lat=lat_pred[k], year=2009, month=1,
                 lo_age=2, up_age=10, nmonths=2)
            for k in range(n_pts)
        ]

        age_grid = arange(0, 27)
        correction_factor_array = \
            mbgw.correction_factors.known_age_corr_factors(age_grid, 1000)

        # NOTE(review): lon/lat were already multiplied by deg_to_rad above and
        # the whole mesh (time column included) is scaled again here --
        # confirm that this is intended.
        samples = mbgw.EP.pred_samps(
            pred_mesh * deg_to_rad, pred_mesh * deg_to_rad, N_exam,
            tracefile, trace_thin, trace_burn,
            N_param_vals, N_per_param, N_nearest,
            age_lims, correction_factor_array, debug=True)
        (ind_outer, ind_inner, Ms, Cs, Vs,
         likelihood_means, likelihood_variances, model_posteriors) = samples
Exemple #2
0
  def step(self):
    """
    Slice step method.

    Draws a slice level ``y`` below the current ``self.loglike``, steps an
    interval of width ``self.w`` out around the current value (using a budget
    of ``self.m - 1`` expansions in total), then samples uniformly from that
    interval, shrinking it after every rejected draw until a value whose
    ``self.loglike`` is at least ``y`` has been found. The accepted value is
    left in ``self.stochastic.value``; nothing is returned.
    """
    # Slice level: current log-likelihood minus an rexponential(1) draw
    y = self.loglike - rexponential(1)

    # Stepping out procedure
    # Place an interval [L, R] of width w at a random offset around the
    # current value.
    L = self.stochastic.value - self.w*runiform(0,1)
    R = L + self.w
    # Split the budget of m-1 expansions at random between the two sides.
    J = floor(self.m*runiform(0,1))
    K = (self.m-1)-J
    # Extend each edge by w while self.fll at that edge is still above y
    # (presumably fll evaluates the log-likelihood at a given value --
    # TODO confirm against its definition).
    while(J>0 and y<self.fll(L)):
      L = L - self.w
      J = J - 1
    while(K>0 and y<self.fll(R)):
      R = R + self.w
      K = K - 1
    #self.stochastic.last_value = self.stochastic.value
    # First proposal: uniform draw from the stepped-out interval.
    self.stochastic.value = runiform(L,R)
    try:
      y_new = self.loglike
    except ZeroProbability:
      # A zero-probability proposal counts as log-likelihood -infinity,
      # so it is always rejected below.
      y_new = -infty
    # Shrinkage: keep proposing until the proposal reaches the slice level.
    while(y_new<y):
      # The rejected proposal becomes the new bound on its side of
      # stochastic.last_value (presumably the value held before the
      # proposal -- confirm against the Stochastic API).
      if (self.stochastic.value < self.stochastic.last_value):
        L = float(self.stochastic.value)
      else:
        R = float(self.stochastic.value)
      # Undo the rejected proposal before drawing from the shrunken interval.
      self.stochastic.revert()
      self.stochastic.value = runiform(L,R)
      try:
        y_new = self.loglike
      except ZeroProbability:
        y_new = -infty
Exemple #3
0
def simulate_age_group_data(N=50, delta_true=150, pi_true=true_rate_function):
    """Generate a simulated age-grouped rate data set.

    Parameters
    ----------
    N : int
        Number of rows to simulate; passed to ``data_simulation.simple_model``.
        Values below 10 are supported.
    delta_true : float
        Second argument handed to ``mc.rnegative_binomial`` when the observed
        values are drawn (presumably the dispersion parameter -- confirm).
    pi_true : callable
        Maps the array of ages 0..100 to the true rate at each age.

    Returns
    -------
    The model from ``data_simulation.simple_model`` with ``ages`` and
    ``pi_age_true`` attributes set and the ``age_start``, ``age_end``,
    ``effective_sample_size``, ``true``, ``age_weights`` and ``value``
    columns of ``input_data`` filled in.
    """
    # start with a simple model with N rows of data
    model = data_simulation.simple_model(N)

    # record the true age-specific rates
    model.ages = pl.arange(0, 101, 1)
    model.pi_age_true = pi_true(model.ages)

    # choose age groups randomly
    age_width = mc.runiform(1, 100, size=N)
    age_mid = mc.runiform(age_width / 2, 100 - age_width / 2, size=N)

    # pin the first rows (ten at most) to the decades 0-10, ..., 90-100;
    # the template is sliced so that N < 10 no longer raises a shape error
    n_fixed = min(N, 10)
    age_width[:n_fixed] = 10
    age_mid[:n_fixed] = pl.arange(5, 105, 10)[:n_fixed]

    age_start = pl.array(age_mid - age_width / 2, dtype=int)
    age_end = pl.array(age_mid + age_width / 2, dtype=int)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end

    # choose effective sample size uniformly at random
    n = mc.runiform(100, 10000, size=N)
    model.input_data['effective_sample_size'] = n

    # integrate true age-specific rate across age groups to find true group rate
    model.input_data['true'] = pl.nan
    model.input_data['age_weights'] = ''

    for i in range(N):
        # each row gets its own exponential age-weight profile
        beta = mc.rnormal(0., .025**-2)
        age_weights = pl.exp(beta * model.ages)
        sum_pi_wt = pl.cumsum(model.pi_age_true * age_weights)
        sum_wt = pl.cumsum(age_weights)

        # only row i's interval is needed for this beta, so index the
        # cumulative sums with scalars instead of evaluating every row
        a0, a1 = age_start[i], age_end[i]
        p_i = (sum_pi_wt[a1] - sum_pi_wt[a0]) / (sum_wt[a1] - sum_wt[a0])

        model.input_data.ix[i, 'true'] = p_i
        model.input_data.ix[i, 'age_weights'] = ';'.join(
            ['%.4f' % w for w in age_weights[a0:(a1 + 1)]])

    # sample observed rate values from negative binomial distribution
    model.input_data['value'] = mc.rnegative_binomial(
        n * model.input_data['true'], delta_true) / n

    print(model.input_data.drop(['standard_error', 'upper_ci', 'lower_ci'],
                                axis=1))
    return model
Exemple #4
0
def simulate_age_group_data(N=50, delta_true=150, pi_true=true_rate_function):
    """Generate a simulated age-grouped rate data set.

    Parameters
    ----------
    N : int
        Number of rows to simulate; passed to ``data_simulation.simple_model``.
        Values below 10 are supported.
    delta_true : float
        Second argument handed to ``mc.rnegative_binomial`` when the observed
        values are drawn (presumably the dispersion parameter -- confirm).
    pi_true : callable
        Maps the array of ages 0..100 to the true rate at each age.

    Returns
    -------
    The model from ``data_simulation.simple_model`` with ``ages`` and
    ``pi_age_true`` attributes set and the ``age_start``, ``age_end``,
    ``effective_sample_size``, ``true``, ``age_weights`` and ``value``
    columns of ``input_data`` filled in.
    """
    # start with a simple model with N rows of data
    model = data_simulation.simple_model(N)

    # record the true age-specific rates
    model.ages = pl.arange(0, 101, 1)
    model.pi_age_true = pi_true(model.ages)

    # choose age groups randomly
    age_width = mc.runiform(1, 100, size=N)
    age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N)

    # pin the first rows (ten at most) to the decades 0-10, ..., 90-100;
    # the template is sliced so that N < 10 no longer raises a shape error
    n_fixed = min(N, 10)
    age_width[:n_fixed] = 10
    age_mid[:n_fixed] = pl.arange(5, 105, 10)[:n_fixed]

    age_start = pl.array(age_mid - age_width/2, dtype=int)
    age_end = pl.array(age_mid + age_width/2, dtype=int)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end

    # choose effective sample size uniformly at random
    n = mc.runiform(100, 10000, size=N)
    model.input_data['effective_sample_size'] = n

    # integrate true age-specific rate across age groups to find true group rate
    model.input_data['true'] = pl.nan
    model.input_data['age_weights'] = ''

    for i in range(N):
        # each row gets its own exponential age-weight profile
        beta = mc.rnormal(0., .025**-2)
        age_weights = pl.exp(beta*model.ages)
        sum_pi_wt = pl.cumsum(model.pi_age_true*age_weights)
        sum_wt = pl.cumsum(age_weights)

        # only row i's interval is needed for this beta, so index the
        # cumulative sums with scalars instead of evaluating every row
        a0, a1 = age_start[i], age_end[i]
        p_i = (sum_pi_wt[a1] - sum_pi_wt[a0]) / (sum_wt[a1] - sum_wt[a0])

        model.input_data.ix[i, 'true'] = p_i
        model.input_data.ix[i, 'age_weights'] = ';'.join(
            ['%.4f'%w for w in age_weights[a0:(a1+1)]])

    # sample observed rate values from negative binomial distribution
    model.input_data['value'] = mc.rnegative_binomial(n*model.input_data['true'], delta_true) / n

    print(model.input_data.drop(['standard_error', 'upper_ci', 'lower_ci'], axis=1))
    return model
def multipoly_sample(n, mp):
    """
    Returns uniformly-distributed points on the earth's surface
    conditioned to be inside a multipolygon.

    Not particularly fast: candidates are drawn in the bounding box of the
    geometry and rejected until enough fall inside it.

    Parameters
    ----------
    n : int
        Number of points wanted.
    mp : geometry with ``bounds`` in degrees; a ``geometry.MultiPolygon`` is
        split into its member polygons, which are sampled in proportion to
        ``shapely_poly_area``.

    Returns
    -------
    (lons, lats) : two arrays of length n, in degrees.
    """

    # b = basemap.Basemap(-180,-90,180,90)

    if isinstance(mp, geometry.MultiPolygon):
        print('Breaking down multipolygon')
        if n <= 0:
            # nothing to allocate; np.concatenate would fail on an empty list
            return np.empty(0), np.empty(0)
        # float dtype so integer areas cannot trigger integer division
        areas = np.asarray([shapely_poly_area(p) for p in mp.geoms], dtype=float)
        areas = areas/np.sum(areas)
        # ns = pm.rmultinomial(n, areas)
        stair = np.array(np.concatenate(([0],np.cumsum(areas*n))),dtype='int')
        # rounding error in the normalised cumulative sum can leave the last
        # step at n-1 after truncation; pin it so exactly n points are drawn
        stair[-1] = n
        ns = np.diff(stair)
        locs = [multipoly_sample(ns[i], mp.geoms[i]) for i in np.where(ns>0)[0]]
        return np.concatenate([loc[0] for loc in locs]), np.concatenate([loc[1] for loc in locs])

    lons = np.empty(n)
    lats = np.empty(n)

    done = 0

    # bounding box converted from degrees to radians
    xmin = mp.bounds[0]*np.pi/180
    ymin = mp.bounds[1]*np.pi/180
    xmax = mp.bounds[2]*np.pi/180
    ymax = mp.bounds[3]*np.pi/180

    print('Starting: n=%i'%n)
    while done < n:
        x = np.atleast_1d(pm.runiform(xmin,xmax, size=n))
        # latitude is drawn uniformly in sin(lat) and mapped back with arcsin
        y = np.atleast_1d(np.arcsin(pm.runiform(np.sin(ymin),np.sin(ymax),size=n)))
        points = [geom.Point([xi*180./np.pi, yi*180./np.pi]) for xi, yi in zip(x, y)]
        good = list(iterops.contains(mp, points, True))
        # never take more than are still missing, so `done` cannot overshoot n
        n_good = min(n-done, len(good))
        lons[done:done+n_good] = [p.coords[0][0] for p in good[:n_good]]
        lats[done:done+n_good] = [p.coords[0][1] for p in good[:n_good]]
        done += n_good
        print('\tDid %i, %i remaining.'%(n_good,n-done))

        # plot_unit(b,mp)
        # b.plot(x*180./np.pi,y*180./np.pi,'r.')
        #
        # from IPython.Debugger import Pdb
        # Pdb(color_scheme='Linux').set_trace()
    print('Filled')
    return lons, lats
Exemple #6
0
def simulated_age_intervals(data_type, n, a, pi_age_true, sigma_true):
    """Simulate noisy rate measurements over random age intervals.

    Draws ``n`` intervals with integer start ages below 100 (sorted) and end
    ages between one and ten years later, capped at 100. The true value of an
    interval is the trapezoid integral of ``pi_age_true`` over it divided by
    its width; the measurement is a normal draw around that value, floored at
    zero.

    ``a`` is accepted but not used here. Returns a pandas DataFrame with the
    simulated values, the interval bounds, three covariate columns and
    constant bookkeeping columns.
    """
    import scipy.integrate

    # choose age intervals to measure; starts are sorted to make it easy to
    # discard the edges when testing
    age_start = np.sort(np.array(mc.runiform(0, 100, n), dtype=int))
    latest_end = np.minimum(age_start + 10, 100)
    age_end = np.array(mc.runiform(age_start + 1, latest_end), dtype=int)

    # find truth for the integral across the age intervals
    pi_interval_true = []
    for lo, hi in zip(age_start, age_end):
        area = scipy.integrate.trapz(pi_age_true[lo:(hi + 1)])
        pi_interval_true.append(area / (hi - lo))

    # generate covariates; every effect is zero here (an alternative that was
    # tried is [-.1, .1, .2]), so they leave the true rate unchanged
    X = mc.rnormal(0., 1.**2, size=(n, 3))
    beta_true = [0, 0, 0]
    Y_true = np.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true * np.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    noisy = mc.rnormal(pi_true, 1. / sigma_true**2.)
    p = np.maximum(0., noisy)

    # store the simulated data in a pandas DataFrame
    columns = dict(value=p, age_start=age_start, age_end=age_end)
    for j in range(3):
        columns['x_%d' % j] = X[:, j]
    data = pandas.DataFrame(columns)
    data['effective_sample_size'] = np.maximum(p * (1 - p) / sigma_true**2, 1.)

    # constant columns, added in a fixed order
    # TODO: make year_start / year_end vary
    constants = (
        ('standard_error', np.nan),
        ('upper_ci', np.nan),
        ('lower_ci', np.nan),
        ('year_start', 2005.),
        ('year_end', 2005.),
        ('sex', 'total'),
        ('area', 'all'),
        ('data_type', data_type),
    )
    for column, constant in constants:
        data[column] = constant

    return data
def generate_data(N, delta_true, pi_true, heterogeneity, bias, sigma_prior):
    """Build a simple model holding N simulated age-interval rate observations.

    ``pi_true`` is evaluated on the ages 0..100. Each row gets a random age
    interval, the unweighted average of the true rate over it as its ``true``
    value, and a negative-binomial draw scaled by ``exp(bias)`` as its
    ``value``. Empirical priors built from the true rate and ``sigma_prior``
    are attached to the returned model together with ``a``, ``pi_age_true``
    and ``delta_true``.
    """
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    p_params = model.parameters['p']
    p_params['parameter_age_mesh'] = range(0, 101, 10)
    p_params['smoothness'] = dict(amount='Moderately')
    p_params['heterogeneity'] = heterogeneity

    # random intervals: the end is drawn between the start and 100
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # interval averages from differences of cumulative sums (unit weights)
    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights)
    rate_mass = sum_pi_wt[age_end] - sum_pi_wt[age_start]
    weight_mass = sum_wt[age_end] - sum_wt[age_start]
    p = rate_mass / weight_mass

    # correct cases where age_start == age_end
    single_age = age_start == age_end
    if single_age.any():
        p[single_age] = pi_age_true[age_start[single_age]]

    n = mc.runiform(10000, 100000, size=N)

    frame = model.input_data
    frame['age_start'] = age_start
    frame['age_end'] = age_end
    frame['effective_sample_size'] = n
    frame['true'] = p
    frame['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n * pl.exp(bias)

    model.emp_priors = {
        ('p', 'mu'): pi_age_true,
        ('p', 'sigma'): sigma_prior*pi_age_true,
    }

    model.a = a
    model.pi_age_true = pi_age_true
    model.delta_true = delta_true

    return model
Exemple #8
0
def simulated_age_intervals(data_type, n, a, pi_age_true, sigma_true):
    """Simulate noisy rate measurements over random age intervals.

    Draws ``n`` intervals with integer start ages below 100 (sorted) and end
    ages between one and ten years later, capped at 100. The true value of an
    interval is the trapezoid integral of ``pi_age_true`` over it divided by
    its width; the measurement is a normal draw around that value, floored at
    zero.

    ``a`` is accepted but not used here. Returns a pandas DataFrame with the
    simulated values, the interval bounds, three covariate columns and
    constant bookkeeping columns.
    """
    import scipy.integrate

    # choose age intervals to measure; starts are sorted to make it easy to
    # discard the edges when testing
    age_start = np.sort(np.array(mc.runiform(0, 100, n), dtype=int))
    latest_end = np.minimum(age_start+10, 100)
    age_end = np.array(mc.runiform(age_start+1, latest_end), dtype=int)

    # find truth for the integral across the age intervals
    pi_interval_true = []
    for lo, hi in zip(age_start, age_end):
        area = scipy.integrate.trapz(pi_age_true[lo:(hi+1)])
        pi_interval_true.append(area / (hi - lo))

    # generate covariates; every effect is zero here (an alternative that was
    # tried is [-.1, .1, .2]), so they leave the true rate unchanged
    X = mc.rnormal(0., 1.**2, size=(n,3))
    beta_true = [0, 0, 0]
    Y_true = np.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true*np.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    noisy = mc.rnormal(pi_true, 1./sigma_true**2.)
    p = np.maximum(0., noisy)

    # store the simulated data in a pandas DataFrame
    columns = dict(value=p, age_start=age_start, age_end=age_end)
    for j in range(3):
        columns['x_%d' % j] = X[:,j]
    data = pandas.DataFrame(columns)
    data['effective_sample_size'] = np.maximum(p*(1-p)/sigma_true**2, 1.)

    # constant columns, added in a fixed order
    # TODO: make year_start / year_end vary
    constants = (
        ('standard_error', np.nan),
        ('upper_ci', np.nan),
        ('lower_ci', np.nan),
        ('year_start', 2005.),
        ('year_end', 2005.),
        ('sex', 'total'),
        ('area', 'all'),
        ('data_type', data_type),
    )
    for column, constant in constants:
        data[column] = constant

    return data
Exemple #9
0
    def test_pred_samps(self):
        """A dry run in Kenya with only one sample point.

        Draws one random location, assembles the prediction mesh and the
        per-point dictionaries, and calls ``mbgw.EP.pred_samps`` in debug
        mode. This test should not work with N>1.
        """
        N = 1

        # Latitude is drawn before longitude; the order is kept so the random
        # stream is consumed exactly as before.
        lat_pred = np.atleast_1d(pm.runiform(-5., 5., size=N) * deg_to_rad)
        lon_pred = np.atleast_1d(pm.runiform(33., 40., size=N) * deg_to_rad)
        year_offsets = array([2007]*N)-2009
        t_pred = np.atleast_1d(year_offsets)

        n_pts = len(lat_pred)

        # One row per prediction point: (lon, lat, t).
        pred_mesh = vstack((lon_pred, lat_pred, t_pred)).T
        age_lims = [(lo_age, up_age) for _ in range(len(lon_pred))]
        N_exam = 1000*ones(n_pts)

        # Per-point descriptions; built but not passed on in this test.
        input_pts = [
            dict(lon=lon_pred[k], lat=lat_pred[k], month=1, year=2009,
                 lo_age=2, up_age=10, n=N_exam[k])
            for k in range(n_pts)
        ]
        output_pts = [
            dict(lon=lon_pred[k], lat=lat_pred[k], year=2009, month=1,
                 lo_age=2, up_age=10, nmonths=2)
            for k in range(n_pts)
        ]

        age_grid = arange(0,27)
        correction_factor_array = \
            mbgw.correction_factors.known_age_corr_factors(age_grid, 1000)

        # NOTE(review): lon/lat were already multiplied by deg_to_rad above and
        # the whole mesh (time column included) is scaled again here --
        # confirm that this is intended.
        samples = mbgw.EP.pred_samps(
            pred_mesh*deg_to_rad, pred_mesh*deg_to_rad, N_exam,
            tracefile, trace_thin, trace_burn,
            N_param_vals, N_per_param, N_nearest,
            age_lims, correction_factor_array, debug=True)
        (ind_outer, ind_inner, Ms, Cs, Vs,
         likelihood_means, likelihood_variances, model_posteriors) = samples
Exemple #10
0
def test_covariate_model_sim_w_hierarchy():
    """Fit the mean covariate model to simulated data with a small hierarchy.

    Simulates 50 normally distributed observations with random area, sex and
    year, builds the covariate model plus a normal likelihood, runs two MCMC
    iterations, and checks how sex is encoded in the resulting variables.
    """
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    def draw_labels(labels):
        # pick n labels with probabilities .3 / .3 / .4
        return np.array(labels)[mc.rcategorical([.3, .3, .4], n)]

    # simulate normal data
    area = draw_labels(['all', 'USA', 'CAN'])
    sex = draw_labels(['male', 'female', 'total'])
    year = np.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    area_effects = [alpha_true[label] for label in area]
    pi_true = np.exp(area_effects)
    sigma_true = .05 * np.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    model = dismod_mr.data.ModelData()
    model.input_data = pd.DataFrame(
        dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy = hierarchy
    model.output_template = output_template

    # create model and priors
    model_vars = {}
    covariate_vars = dismod_mr.model.covariates.mean_covariate_model(
        'test', 1, model.input_data, {}, model, 'all', 'total', 'all')
    model_vars.update(covariate_vars)
    likelihood_vars = dismod_mr.model.likelihood.normal(
        'test', model_vars['pi'], 0., p, sigma_true)
    model_vars.update(likelihood_vars)

    # fit model
    sampler = mc.MCMC(model_vars)
    sampler.sample(2)

    assert 'sex' not in model_vars['U']
    assert 'x_sex' in model_vars['X']
    assert len(model_vars['beta']) == 1
Exemple #11
0
def test_covariate_model_sim_w_hierarchy():
    """Fit the mean covariate model to simulated data with a small hierarchy.

    Simulates 50 normally distributed observations with random area, sex and
    year, builds the covariate model plus a normal rate model, runs two MCMC
    iterations, and checks how sex is encoded in the resulting variables.
    """
    n = 50

    # setup hierarchy
    hierarchy, output_template = data_simulation.small_output()

    def draw_labels(labels):
        # pick n labels with probabilities .3 / .3 / .4
        return pl.array(labels)[mc.rcategorical([.3, .3, .4], n)]

    # simulate normal data
    area = draw_labels(['all', 'USA', 'CAN'])
    sex = draw_labels(['male', 'female', 'total'])
    year = pl.array(mc.runiform(1990, 2010, n), dtype=int)

    alpha_true = dict(all=0., USA=.1, CAN=-.2)
    area_effects = [alpha_true[label] for label in area]
    pi_true = pl.exp(area_effects)
    sigma_true = .05*pl.ones_like(pi_true)

    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    model = data.ModelData()
    model.input_data = pandas.DataFrame(
        dict(value=p, area=area, sex=sex, year_start=year, year_end=year))
    model.hierarchy = hierarchy
    model.output_template = output_template

    # create model and priors
    model_vars = {}
    covariate_vars = covariate_model.mean_covariate_model(
        'test', 1, model.input_data, {}, model, 'all', 'total', 'all')
    model_vars.update(covariate_vars)
    likelihood_vars = rate_model.normal_model(
        'test', model_vars['pi'], 0., p, sigma_true)
    model_vars.update(likelihood_vars)

    # fit model
    sampler = mc.MCMC(model_vars)
    sampler.sample(2)

    assert 'sex' not in model_vars['U']
    assert 'x_sex' in model_vars['X']
    assert len(model_vars['beta']) == 1
Exemple #12
0
def sim_data_for_validation(N,
                            true_cf=[[0.1, 0.3, 0.6],
                                     [0.2, 0.3, 0.5]],
                            true_std=[[.2, .05, .05],
                                      [.3, 0.1, 0.1]],
                            std_bias=[1.,1.,1.]):
    """
    Simulate N draws from an 'estimated' cause-fraction distribution.

    Input
    -----
    true_cf  - a list of T lists, each holding the J true cause fractions for
             one time point (each must sum to one).
    true_std - standard deviations matching true_cf: either T lists of
             length J (one set per time point) or a single list of length J
             wrapped in a list of length 1, which is then reused for every
             time point. Causes that are harder to estimate get a larger
             standard deviation and therefore more uncertainty.
    std_bias - a list of length J of multipliers biasing the standard
             deviations per cause (0.9 means underestimating by 10% on
             average, 1.1 means overestimating by 10% on average).

    Output
    -----
    N JxT draws from an 'estimated' distribution for the specified causes
    """

    # a single set of standard deviations is shared by all time points
    shared_std = len(true_std)==1 and len(true_cf)>1
    if shared_std:
        true_std = [true_std[0]] * len(true_cf)

    # one draw around the truth plays the role of the point estimate
    est_cf = sim_data(1, true_cf, true_std)[0]
    est_error = est_cf - true_cf

    # perturb the standard deviations by up to +/-10% around their bias
    bias = pl.array(std_bias)
    multiplier = mc.runiform(bias*0.9, bias*1.1)
    est_std = true_std*multiplier

    return sim_data(N, est_cf, est_std, sum_to_one=False)
Exemple #13
0
def sim_data_for_validation(N,
                            true_cf=[[0.1, 0.3, 0.6], [0.2, 0.3, 0.5]],
                            true_std=[[.2, .05, .05], [.3, 0.1, 0.1]],
                            std_bias=[1., 1., 1.]):
    """
    Simulate N draws from an 'estimated' cause-fraction distribution.

    Input
    -----
    true_cf  - a list of T lists, each holding the J true cause fractions for
             one time point (each must sum to one).
    true_std - standard deviations matching true_cf: either T lists of
             length J (one set per time point) or a single list of length J
             wrapped in a list of length 1, which is then reused for every
             time point. Causes that are harder to estimate get a larger
             standard deviation and therefore more uncertainty.
    std_bias - a list of length J of multipliers biasing the standard
             deviations per cause (0.9 means underestimating by 10% on
             average, 1.1 means overestimating by 10% on average).

    Output
    -----
    N JxT draws from an 'estimated' distribution for the specified causes
    """

    # a single set of standard deviations is shared by all time points
    shared_std = len(true_std) == 1 and len(true_cf) > 1
    if shared_std:
        true_std = [true_std[0]] * len(true_cf)

    # one draw around the truth plays the role of the point estimate
    est_cf = sim_data(1, true_cf, true_std)[0]
    est_error = est_cf - true_cf

    # perturb the standard deviations by up to +/-10% around their bias
    bias = pl.array(std_bias)
    multiplier = mc.runiform(bias * 0.9, bias * 1.1)
    est_std = true_std * multiplier

    return sim_data(N, est_cf, est_std, sum_to_one=False)
Exemple #14
0
        
if __name__ == '__main__':
    # Ad-hoc visual check: evaluate a BurdenPredictor at N random values
    # between 0 and pr_max and scatter-plot the outputs against the inputs.
    from tables import openFile
    from pylab import *

    N=10000
    # one population value per evaluation point
    pop=10000*np.ones(N)
    nyr = 10
    
    # every tenth population value is set to zero
    pop[::10] = 0

    p = BurdenPredictor('traces/Africa+_scale_0.6_model_exp.hdf5', pop, nyr, 0)
    pr_max = .6
    # Alternative trace file and upper bound:
    # p = BurdenPredictor('traces/CSE_Asia_and_Americas_scale_0.6_model_exp.hdf5', np.ones(N)*pop, nyr)
    # pr_max = .5
    
    # for i in xrange(10000):

    # random inputs at which the predictor is evaluated
    pr = pm.runiform(0,pr_max,size=N)
    
    # for i in xrange(p.n):
    #     clf()
    #     plot(xplot, p.cols.fplot[i],'r.',markersize=2)
    #     plot(pr, p.f[i](pr), 'b.',markersize=1)
    #     a=raw_input()
    
    # the predictor object is callable
    b = p(pr)
    
    # clf / plot come from the pylab star import above
    clf()
    plot(pr,b,'k.',markersize=1)
Exemple #15
0
def validate_covariate_model_dispersion(N=1000,
                                        delta_true=.15,
                                        pi_true=.01,
                                        zeta_true=[.5, -.5, 0.]):
    """Simulate data with covariate-dependent dispersion, fit it, and score the fit.

    N rows are simulated with a constant true rate ``pi_true`` and one binary
    covariate per entry of ``zeta_true``; the covariates scale the
    negative-binomial dispersion ``delta_true`` by ``exp(Z . zeta_true)``.
    The data model is then fitted, diagnostic plots are shown, and predicted
    versus true values are compared for the data, for zeta and for delta.

    Returns the fitted model, with the comparison tables on ``input_data``,
    ``zeta`` and ``delta`` and the summary on ``results``.
    """
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)
    frame = model.input_data

    # binary covariates (as floats) that modulate the dispersion
    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for j, column in enumerate(Z.T):
        frame['z_%d' % j] = column

    frame['true'] = pi_true
    frame['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = frame['effective_sample_size']
    p = frame['true']
    frame['value'] = mc.rnegative_binomial(n * p, delta * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model(
        'p', model, 'p', 'all', 'total', 'all', None, None, None)
    fitted = model.vars['p']
    model.map, model.mcmc = fit_model.fit_data_model(
        fitted, iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(fitted, 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    # posterior predictive summary for every data row
    frame['mu_pred'] = fitted['p_pred'].stats()['mean']
    frame['sigma_pred'] = fitted['p_pred'].stats()['standard deviation']
    add_quality_metrics(frame)

    # dispersion covariate effects
    model.zeta = pandas.DataFrame(index=fitted['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = fitted['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = fitted['zeta'].stats()['standard deviation']
    add_quality_metrics(model.zeta)

    print('\nzeta')
    print(model.zeta)

    # baseline dispersion, summarised from the exponentiated eta trace
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(fitted['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(fitted['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    data_bias = frame['abs_err'].mean()
    data_mare = pl.median(pl.absolute(frame['rel_err'].dropna()))
    data_coverage = frame['covered?'].mean()
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        data_bias, data_mare, data_coverage))

    effect_mae = pl.median(pl.absolute(model.zeta['abs_err'].dropna()))
    effect_coverage = model.zeta.dropna()['covered?'].mean()
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        effect_mae, effect_coverage))

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    for table in ('delta', 'input_data', 'zeta'):
        add_to_results(model, table)
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    return model
Exemple #16
0
where p(x) stands for the prior for the true input and p(a,b) the prior for the
regression parameters.
"""
from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos, array
import pylab

# ------------------------------------------------------------------------------
# Synthetic values
# Replace by real data
# ------------------------------------------------------------------------------
# True line used to generate the synthetic observations.
slope = 1.5
intercept = 4
N = 30
# N true inputs between 0 and 50 and their noise-free responses.
true_x = runiform(0, 50, N)
true_y = slope * true_x + intercept
# Noisy observations of both response and input.
# NOTE(review): PyMC 2 documents rnormal as (mu, tau) with tau a precision --
# confirm that the 2 here is meant as a precision and not a standard deviation.
data_y = rnormal(true_y, 2)
data_x = rnormal(true_x, 2)

# ------------------------------------------------------------------------------
# Calibration of straight line parameters from data
# ------------------------------------------------------------------------------


# NOTE(review): the body below stops after prob_intercept and returns nothing;
# the function looks truncated in this copy -- confirm against the full source.
@stochastic
def theta(value=array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = uniform_like(intercept, -10, 10)
Exemple #17
0
import numpy as np
import pymc as pm
from matplotlib import pyplot as plt

# Number of simulated individuals.
N = 20

# Create some artificial data: a Weibull lifetime and a uniform birth time
# between 0 and 10 for each individual.
lifetime = pm.rweibull(2, 5, size=N)
birth = pm.runiform(0, 10, N)

censor = (birth +
          lifetime) > 10  # an individual is right-censored if this is True
lifetime_ = np.ma.masked_array(lifetime, censor)  # masked entries are the censored lifetimes
lifetime_.set_fill_value(10)  # fill value for masked entries; good for computations later

# This begins the model.
alpha = pm.Uniform("alpha", 0, 20)
# Let's just use uninformative priors.
beta = pm.Uniform("beta", 0, 20)
# The masked lifetimes are passed as the observed value of the Weibull node.
obs = pm.Weibull('obs', alpha, beta, value=lifetime_, observed=True)


@pm.potential
def censor_factor(obs=obs):
    """Return -inf when any masked (censored) entry has obs + birth below 10, else 0."""
    ends_before_cutoff = (obs + birth < 10)[lifetime_.mask]
    return -np.inf if np.any(ends_before_cutoff) else 0


#perform Markov Chain Monte Carlo - see chapter 3 of BMH
Exemple #18
0
import pymc as pm
import numpy as np

import pylab

# True slope and intercept of the line used to generate the data.
slp = 1.5
intc = 4.0
N = 30

# N true inputs between 0 and 50 and their noise-free responses.
true_x = pm.runiform(0, 50, N)
true_y = slp*true_x + intc

# Noisy observations of the response and of the input.
# NOTE(review): PyMC 2 documents rnormal as (mu, tau) with tau a precision --
# confirm that the 2 is meant as a precision and not a standard deviation.
data_y = pm.rnormal(true_y, 2)

# rnormal is only reachable through the `pm` namespace in this file
data_x = pm.rnormal(true_x, 2)


# NOTE(review): this reads like a PyMC stochastic but carries no
# @pm.stochastic decorator here -- confirm whether one is intended.
def theta(value=np.array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters.

    `value` holds (slope, intercept). Returns the sum of the uniform
    log-likelihood of the intercept on (-10, 10) and log(1 / cos(slope)**2).
    """
    slope, intercept = value
    prob_intercept = pm.uniform_like(intercept, -10, 10)
    prob_slope = np.log(1./np.cos(slope)**2)
    return prob_intercept+prob_slope


# Start the latent inputs at the observed inputs, clipped into the prior range.
init_x = data_x.clip(min=0, max=50)

x = pm.Uniform('x', lower=0, upper=50, value=init_x)



def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    """Simulate age-interval data with area random effects, fit it, and score the fit.

    N rows are simulated from the true age pattern ``pi_true`` (evaluated on
    ages 0..100), each assigned to a random area and scaled by the summed
    effects of the areas on its path from 'all' in the hierarchy. Values are
    negative-binomial draws using ``delta_true``. The data model is fitted for
    'north_africa_middle_east', diagnostic plots are shown, and predictions
    are compared with the truth for the data, delta, alpha, sigma and the age
    pattern.

    ``sigma_true`` is passed to ``alpha_true_sim`` and used as the truth for
    the ``sigma_alpha`` nodes; ``smoothness`` and ``heterogeneity`` are stored
    in the model parameters for 'p'.

    Returns the fitted model with the comparison tables attached and the
    summary in ``model.results`` (also printed).
    """
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)


    # A model built from the dismod3 disease JSON is used only for its hierarchy.
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    # random age intervals: the end is drawn between the start and 100
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # interval averages from differences of cumulative sums (unit weights);
    # the *1. makes the cumulative weights floating point
    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)


    # simulate the true area effects
    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan

    # each row gets an area drawn with equal probability
    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]
    
    # scale each row's interval rate by exp of the summed effects along the
    # hierarchy path from 'all' to its area (note: i, a and n are rebound here)
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    # diagnostic plots, with the true age pattern overlaid
    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictive summary for every data row
    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # dispersion, summarised from the exponentiated eta trace
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    # area effects: indexed by every hierarchy node, then reduced by dropna()
    # to the rows that have no missing entries
    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    # one row per sigma_alpha node, compared against sigma_true
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())

    # fitted age pattern against the true one
    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    # collect the per-table quality metrics into model.results
    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
import numpy  as np
import matplotlib.pyplot as plt
import pymc as mc
import scipy.stats as stats
import math

# http://blog.yhathq.com/posts/estimating-user-lifetimes-with-pymc.html

# artificial data: N Weibull lifetimes, each starting at a birth time drawn
# uniformly between 0 and 10
N = 20
true_alpha = 2
true_beta = 5
lifetime = mc.rweibull( true_alpha, true_beta, size=N )
birth = mc.runiform( 0, 10, N )

# an individual is right censored if this is true
# (i.e. birth + lifetime falls after time 10)
censor = (birth + lifetime) > 10
# mask out the censored lifetimes; masked entries are filled with 10
lifetime_ = np.ma.masked_array( lifetime, censor )
lifetime_.set_fill_value( 10 )

# draw one horizontal segment per individual from birth to birth + lifetime,
# then mark every end point (censored or not) with a circle
plt.clf()
y = np.arange( 0, N )
for b,l,yy in zip( birth, lifetime, y ):
    plt.plot( [b,b+l], [yy,yy] )
plt.plot( birth+lifetime, y, linestyle="", marker="o" )
plt.draw()
# block=False so the script keeps running while the figure is open
plt.show( block=False )

# begin the model
# just use uniform priors
alpha = mc.Uniform( "alpha", 0, 20 )
Exemple #21
0
def validate_covariate_model_fe(N=100, delta_true=3, pi_true=.01, beta_true=[.5, -.5, 0.], replicate=0):
    """Simulate data with known fixed effects, fit the data model, and score the fit.

    Parameters
    ----------
    N : int
        Number of rows of simulated input data.
    delta_true : float
        Second argument passed to mc.rnegative_binomial when simulating the
        observed values; it is compared against exp(eta) from the fit.
    pi_true : float
        Baseline rate; the true rate of row i is pi_true * exp(X[i] . beta_true).
    beta_true : sequence of float
        True coefficients, one per simulated covariate x_0, x_1, ...
        The default list is only read, never modified.
    replicate : int
        Offset added to the base random seed so that replicates differ.

    Returns
    -------
    model
        The data.ModelData instance with input_data, beta, delta and a
        results DataFrame attached.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    X = mc.rnormal(0., 1.**-2, size=(N,len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i in range(len(beta_true)):
        model.input_data['x_%d'%i] = X[:,i]
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n


    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)


    model.beta = pandas.DataFrame(index=model.vars['p']['X'].columns)
    # Build the 'true' column as a standalone Series and assign it in one step.
    # Writing through model.beta['true'][label] is chained assignment, which
    # may modify a temporary copy instead of the frame.
    true_beta = pandas.Series(0., index=model.beta.index)
    for i, beta_i in enumerate(beta_true):
        true_beta['x_%d'%i] = beta_i
    model.beta['true'] = true_beta

    # 'node' rather than 'n': a Python 2 list comprehension leaks its loop
    # variable and would overwrite the sample-size series above
    model.beta['mu_pred'] = [node.stats()['mean'] for node in model.vars['p']['beta']]
    model.beta['sigma_pred'] = [node.stats()['standard deviation'] for node in model.vars['p']['beta']]
    add_quality_metrics(model.beta)

    print('\nbeta')
    print(model.beta)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.beta['abs_err'].dropna())),
                                                           model.beta.dropna()['covered?'].mean()))
    add_to_results(model, 'input_data')
    # NOTE(review): 'beta' was already added above, so it appears twice in
    # model.results -- kept as-is; confirm whether the duplicate is intended.
    add_to_results(model, 'beta')

    model.results = pandas.DataFrame(model.results)
    return model
Exemple #22
0
def validate_age_pattern_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    """Simulate single-age data from a known age pattern, fit it, and score the fit.

    N rows are drawn at integer ages with random effective sample sizes, the
    observed values are sampled with mc.rnegative_binomial, the 'p' data model
    is fitted by MCMC, and quality metrics for delta, the age pattern and the
    data predictions are collected in model.results.  Returns the model.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)

    # random draws happen in this order: ages, sample sizes, observed values
    sampled_age = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    true_rate = pi_age_true[sampled_age]
    sample_size = mc.runiform(100, 10000, size=N)

    # each row covers exactly one age, so start and end coincide
    for column in ('age_start', 'age_end'):
        model.input_data[column] = sampled_age
    model.input_data['effective_sample_size'] = sample_size
    model.input_data['true'] = true_rate
    model.input_data['value'] = mc.rnegative_binomial(sample_size*true_rate, delta_true*sample_size*true_rate) / sample_size

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    fitted = model.vars['p']
    model.map, model.mcmc = fit_model.fit_data_model(fitted, iter=10000, burn=5000, thin=25, tune_interval=100)

    # diagnostic plots, with the true age pattern overlaid on the last one
    graphics.plot_one_ppc(fitted, 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, fitted, {}, 'p')
    pl.plot(ages, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictions for every data row
    pred_stats = fitted['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # delta is compared against exp(eta) over the trace
    delta_trace = pl.exp(fitted['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_trace.mean()
    model.delta['sigma_pred'] = delta_trace.std()
    data_simulation.add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    summary = (model.input_data['abs_err'].mean(),
               pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
               model.input_data['covered?'].mean())
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % summary)

    # age-pattern estimate versus the truth
    mu_age_stats = fitted['mu_age'].stats()
    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=mu_age_stats['mean'],
                                     sigma_pred=mu_age_stats['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    for table_name in ('delta', 'mu', 'input_data'):
        data_simulation.add_to_results(model, table_name)
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    print(model.results)

    return model
Exemple #23
0
 def propose(self):
     """Set the stochastic to a uniform draw on [-r, r], r = sqrt(1 - |others|^2)."""
     sibling_values = []
     for sibling in self.others:
         sibling_values.append(sibling.value)
     # r is what remains of 1 after the squared values of the others
     remaining = 1. - pl.dot(sibling_values, sibling_values)
     radius = pl.sqrt(remaining)
     self.stochastic.value = mc.runiform(-radius, radius)
Exemple #24
0
# simulation settings (delta_true is not used in the visible part of this
# script -- presumably consumed by the sampling step that follows; TODO confirm)
delta_true = 50.
N = 30

# start with a simple model with N rows of data
model = data_simulation.simple_model(N)

# set covariate to 0/1 values randomly
# (multiplying by 1. converts the category draws to floats)
model.input_data['x_cov'] = 1. * mc.rcategorical([.5, .5], size=N)

# record the true age-specific rates
# (pi_true is defined outside this excerpt)
model.ages = pl.arange(0, 101, 1)
model.pi_age_true = pi_true(model.ages)

# choose age groups randomly
# age_width is all zeros here, so age_start and age_end are the same integer age
age_width = pl.zeros(N)
age_mid = mc.runiform(age_width / 2, 100 - age_width / 2, size=N)
age_start = pl.array(age_mid - age_width / 2, dtype=int)
age_end = pl.array(age_mid + age_width / 2, dtype=int)

model.input_data['age_start'] = age_start
model.input_data['age_end'] = age_end

# choose effective sample size uniformly at random
n = mc.runiform(100, 10000, size=N)
model.input_data['effective_sample_size'] = n

# find true rate, with covariate
# (beta_true is defined outside this excerpt)
p = model.pi_age_true[age_start] * pl.exp(
    model.input_data['x_cov'] * beta_true)

# sample observed rate values from negative binomial distribution
Exemple #25
0
 def propose(self):
     """Propose a value drawn uniformly from the interval left by the other coordinates.

     The half-width of the interval is sqrt(1 - sum of squares of the values
     held by self.others); the draw is written to self.stochastic.value.
     """
     current = [node.value for node in self.others]
     half_width = pl.sqrt(1. - pl.dot(current, current))
     lower, upper = -half_width, half_width
     self.stochastic.value = mc.runiform(lower, upper)
Exemple #26
0
def validate_covariate_model_fe(N=100,
                                delta_true=3,
                                pi_true=.01,
                                beta_true=[.5, -.5, 0.],
                                replicate=0):
    """Simulate data with known fixed effects, fit the data model, and score the fit.

    Parameters
    ----------
    N : int
        Number of rows of simulated input data.
    delta_true : float
        Second argument passed to mc.rnegative_binomial when simulating the
        observed values; it is compared against exp(eta) from the fit.
    pi_true : float
        Baseline rate; the true rate of row i is pi_true * exp(X[i] . beta_true).
    beta_true : sequence of float
        True coefficients, one per simulated covariate x_0, x_1, ...
        The default list is only read, never modified.
    replicate : int
        Offset added to the base random seed so that replicates differ.

    Returns
    -------
    model
        The data.ModelData instance with input_data, beta, delta and a
        results DataFrame attached.
    """
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    X = mc.rnormal(0., 1.**-2, size=(N, len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i in range(len(beta_true)):
        model.input_data['x_%d' % i] = X[:, i]
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    add_quality_metrics(model.input_data)

    model.beta = pandas.DataFrame(index=model.vars['p']['X'].columns)
    # Fill a standalone Series and assign it as the column in a single step.
    # The previous model.beta['true'][label] = value form is chained
    # assignment, which may write to a temporary copy rather than the frame.
    true_beta = pandas.Series(0., index=model.beta.index)
    for i, beta_i in enumerate(beta_true):
        true_beta['x_%d' % i] = beta_i
    model.beta['true'] = true_beta

    # 'node' rather than 'n': a Python 2 list comprehension leaks its loop
    # variable and would overwrite the sample-size series above
    model.beta['mu_pred'] = [
        node.stats()['mean'] for node in model.vars['p']['beta']
    ]
    model.beta['sigma_pred'] = [
        node.stats()['standard deviation'] for node in model.vars['p']['beta']
    ]
    add_quality_metrics(model.beta)

    print('\nbeta')
    print(model.beta)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)
    add_to_results(model, 'delta')

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))
    print('effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.beta['abs_err'].dropna())),
        model.beta.dropna()['covered?'].mean()))
    add_to_results(model, 'input_data')
    # NOTE(review): 'beta' was already added above, so it appears twice in
    # model.results -- kept as-is; confirm whether the duplicate is intended.
    add_to_results(model, 'beta')

    model.results = pandas.DataFrame(model.results)
    return model
Exemple #27
0
def test_data_model_sim():
    """Fit the data model to simulated age-interval data and check the estimates.

    Simulates n noisy rate measurements over random age intervals with three
    covariates, fits the model with MAP followed by MCMC, saves a diagnostic
    plot to 'age_integrating_sim.png', and asserts that predictions and
    interval estimates are close to the simulated truth.
    """
    # generate simulated data
    n = 50
    sigma_true = .025

    # start with truth
    # NOTE(review): a has 100 entries (ages 0..99) while age_end below can reach
    # 100 and the plots use pl.arange(101) -- confirm the intended age range
    a = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (a * (100. - a) + 100.)

    # choose age intervals to measure
    age_start = pl.array(mc.runiform(0, 100, n), dtype=int)
    age_start.sort()  # sort to make it easy to discard the edges when testing
    # each interval ends between 1 and 10 years after it starts, capped at 100
    age_end = pl.array(mc.runiform(age_start + 1,
                                   pl.minimum(age_start + 10, 100)),
                       dtype=int)

    # find truth for the integral across the age intervals
    import scipy.integrate
    # trapezoid rule over the ages in the interval, divided by the interval width
    pi_interval_true = [
        scipy.integrate.trapz(pi_age_true[a_0i:(a_1i + 1)]) / (a_1i - a_0i)
        for a_0i, a_1i in zip(age_start, age_end)
    ]

    # generate covariates that add explained variation
    X = mc.rnormal(0., 1.**2, size=(n, 3))
    beta_true = [-.1, .1, .2]
    Y_true = pl.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true * pl.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    # (second argument is 1 / sigma_true**2)
    p = mc.rnormal(pi_true, 1. / sigma_true**2.)

    # store the simulated data in a pandas DataFrame
    data = pandas.DataFrame(
        dict(value=p,
             age_start=age_start,
             age_end=age_end,
             x_0=X[:, 0],
             x_1=X[:, 1],
             x_2=X[:, 2]))
    # sample size implied by p*(1-p)/sigma_true**2, floored at 1
    data['effective_sample_size'] = pl.maximum(p * (1 - p) / sigma_true**2, 1.)

    # uncertainty columns are left missing
    data['standard_error'] = pl.nan
    data['upper_ci'] = pl.nan
    data['lower_ci'] = pl.nan

    data['year_start'] = 2005.  # TODO: make these vary
    data['year_end'] = 2005.
    data['sex'] = 'total'
    data['area'] = 'all'

    # generate a moderately complicated hierarchy graph for the model
    hierarchy = nx.DiGraph()
    hierarchy.add_node('all')
    hierarchy.add_edge('all', 'super-region-1', weight=.1)
    hierarchy.add_edge('super-region-1', 'NAHI', weight=.1)
    hierarchy.add_edge('NAHI', 'CAN', weight=.1)
    hierarchy.add_edge('NAHI', 'USA', weight=.1)
    # 12 prediction rows: 6 year/sex combinations for each of CAN and USA
    output_template = pandas.DataFrame(
        dict(year=[1990, 1990, 2005, 2005, 2010, 2010] * 2,
             sex=['male', 'female'] * 3 * 2,
             x_0=[.5] * 6 * 2,
             x_1=[0.] * 6 * 2,
             x_2=[.5] * 6 * 2,
             pop=[50.] * 6 * 2,
             area=['CAN'] * 6 + ['USA'] * 6))

    # create model and priors
    # (note: the local name 'vars' shadows the builtin inside this function)
    vars = data_model.data_model('test', data, hierarchy, 'all')

    # fit model
    # MAP fit first, then MCMC
    mc.MAP(vars).fit(method='fmin_powell', verbose=1)
    m = mc.MCMC(vars)
    m.use_step_method(mc.AdaptiveMetropolis, [m.gamma_bar, m.gamma, m.beta])
    m.sample(30000, 15000, 15)

    # check estimates
    # the template covariates give x . beta_true = .5*(-.1) + 0*.1 + .5*.2 = .05
    pi_usa = data_model.predict_for(output_template, hierarchy, 'all', 'USA',
                                    'male', 1990, vars)
    assert pl.allclose(pi_usa.mean(), (m.mu_age.trace() * pl.exp(.05)).mean(),
                       rtol=.1)

    # check convergence
    print 'gamma mc error:', m.gamma_bar.stats()['mc error'].round(
        2), m.gamma.stats()['mc error'].round(2)

    # plot results
    # one red segment per measured interval, the truth in green, the fitted
    # age pattern with its 95% HPD interval in black, the USA prediction in red
    for a_0i, a_1i, p_i in zip(age_start, age_end, p):
        pl.plot([a_0i, a_1i], [p_i, p_i], 'rs-', mew=1, mec='w', ms=4)
    pl.plot(a, pi_age_true, 'g-', linewidth=2)
    pl.plot(pl.arange(101),
            m.mu_age.stats()['mean'],
            'k-',
            drawstyle='steps-post',
            linewidth=3)
    pl.plot(pl.arange(101),
            m.mu_age.stats()['95% HPD interval'],
            'k',
            linestyle='steps-post:')
    pl.plot(pl.arange(101),
            pi_usa.mean(0),
            'r-',
            linewidth=2,
            drawstyle='steps-post')
    pl.savefig('age_integrating_sim.png')

    # compare estimate to ground truth (skip endpoints, because they are extra hard to get right)
    assert pl.allclose(m.pi.stats()['mean'][10:-10], pi_true[10:-10], rtol=.2)
    lb, ub = m.pi.stats()['95% HPD interval'].T
    # more than 75% of the interior rows must have the truth inside the interval
    assert pl.mean((lb <= pi_true)[10:-10] & (pi_true <= ub)[10:-10]) > .75
def multipoly_sample(n, mp, test=None, verbose=0):
    """
    Returns uniformly-distributed points on the earth's surface
    conditioned to be inside a multipolygon.

    Not particularly fast.

    Parameters
    ----------
    n : int
        Number of points to return.
    mp : polygon or multipolygon
        Geometry exposing ``bounds`` (read as degrees) and, for a
        ``geometry.MultiPolygon``, ``geoms``.
    test : callable, optional
        ``test(lon, lat)`` in degrees; only points for which it is truthy
        are kept.
    verbose : int
        Progress messages are printed when greater than zero.

    Returns
    -------
    lons, lats : arrays of length n, in degrees.

    Raises
    ------
    ValueError
        If the pieces of a multipolygon do not add up to n points, or if a
        returned point fails ``test``.
    """
    if n == 0:
        # Nothing to draw.  Returning here also keeps the multipolygon branch
        # from calling np.concatenate on an empty list, which raises.
        return np.empty(0), np.empty(0)

    # b = basemap.Basemap(-180,-90,180,90)

    if isinstance(mp, geometry.MultiPolygon):
        if verbose>0:
            print('Breaking down multipolygon')
        areas = np.array([shapely_poly_area(p) for p in mp.geoms])
        areas /= np.sum(areas)

        # Split n between the component polygons in proportion to their area;
        # rounding the cumulative sum guarantees the counts add up to n.
        # ns = pm.rmultinomial(n, areas)
        stair = np.round(np.concatenate(([0],np.cumsum(areas*n)))).astype('int')
        ns = np.diff(stair)
        locs = [multipoly_sample(ns[i], mp.geoms[i], test) for i in np.where(ns>0)[0]]
        lons = np.concatenate([loc[0] for loc in locs])
        lats = np.concatenate([loc[1] for loc in locs])

        if len(lons) != n or len(lats) != n:
            raise ValueError
        return lons, lats

    lons = np.empty(n)
    lats = np.empty(n)

    # bounding box of the geometry, converted from degrees to radians
    done = 0
    xmin = mp.bounds[0]*np.pi/180
    ymin = mp.bounds[1]*np.pi/180
    xmax = mp.bounds[2]*np.pi/180
    ymax = mp.bounds[3]*np.pi/180

    if verbose>0:
        print('Starting: n=%i'%n)
    # Rejection sampling: draw n candidates in the bounding box, keep those
    # inside the geometry, repeat until n points have been accepted.
    while done < n:
        x = np.atleast_1d(pm.runiform(xmin,xmax, size=n))
        # uniform in sin(latitude) gives uniform density on the sphere
        y = np.atleast_1d(np.arcsin(pm.runiform(np.sin(ymin),np.sin(ymax),size=n)))
        points = [geom.Point([xi*180./np.pi, yi*180./np.pi]) for xi, yi in zip(x, y)]
        good = list(iterops.contains(mp, points, True))
        if test:
            good = [p for p in good if test(p.coords[0][0], p.coords[0][1])]
        # never take more than what is still missing, so 'done' cannot
        # overshoot n and the progress message stays non-negative
        n_good = min(n - done, len(good))
        lons[done:done+n_good] = [p.coords[0][0] for p in good[:n_good]]
        lats[done:done+n_good] = [p.coords[0][1] for p in good[:n_good]]
        done += n_good
        if verbose>0:
            print('\tDid %i, %i remaining.'%(n_good,n-done))

    if verbose>0:
        print('Filled')
    if test:
        if not np.all([test(lon,lat) for lon,lat in zip(lons,lats)]):
            raise ValueError('Test failed at some outputs')

    if len(lons)!=n or len(lats)!=n:
        raise ValueError

    return lons, lats
Exemple #29
0
def test_data_model_sim():
    """Fit the data model to simulated age-interval data and check the estimates.

    Simulates n noisy rate measurements over random age intervals with three
    covariates, fits the model with MAP followed by MCMC, saves a diagnostic
    plot to 'age_integrating_sim.png', and asserts that predictions and
    interval estimates are close to the simulated truth.
    """
    # generate simulated data
    n = 50
    sigma_true = .025

    # start with truth
    # NOTE(review): a has 100 entries (ages 0..99) while age_end below can reach
    # 100 and the plots use pl.arange(101) -- confirm the intended age range
    a = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (a * (100. - a) + 100.)

    # choose age intervals to measure
    age_start = pl.array(mc.runiform(0, 100, n), dtype=int)
    age_start.sort()  # sort to make it easy to discard the edges when testing
    # each interval ends between 1 and 10 years after it starts, capped at 100
    age_end = pl.array(mc.runiform(age_start+1, pl.minimum(age_start+10,100)), dtype=int)

    # find truth for the integral across the age intervals
    import scipy.integrate
    # trapezoid rule over the ages in the interval, divided by the interval width
    pi_interval_true = [scipy.integrate.trapz(pi_age_true[a_0i:(a_1i+1)]) / (a_1i - a_0i) 
                        for a_0i, a_1i in zip(age_start, age_end)]

    # generate covariates that add explained variation
    X = mc.rnormal(0., 1.**2, size=(n,3))
    beta_true = [-.1, .1, .2]
    Y_true = pl.dot(X, beta_true)

    # calculate the true value of the rate in each interval
    pi_true = pi_interval_true*pl.exp(Y_true)

    # simulate the noisy measurement of the rate in each interval
    # (second argument is 1 / sigma_true**2)
    p = mc.rnormal(pi_true, 1./sigma_true**2.)

    # store the simulated data in a pandas DataFrame
    data = pandas.DataFrame(dict(value=p, age_start=age_start, age_end=age_end,
                                 x_0=X[:,0], x_1=X[:,1], x_2=X[:,2]))
    # sample size implied by p*(1-p)/sigma_true**2, floored at 1
    data['effective_sample_size'] = pl.maximum(p*(1-p)/sigma_true**2, 1.)

    # uncertainty columns are left missing
    data['standard_error'] = pl.nan
    data['upper_ci'] = pl.nan
    data['lower_ci'] = pl.nan

    data['year_start'] = 2005.  # TODO: make these vary
    data['year_end'] = 2005.
    data['sex'] = 'total'
    data['area'] = 'all'

    # generate a moderately complicated hierarchy graph for the model
    hierarchy = nx.DiGraph()
    hierarchy.add_node('all')
    hierarchy.add_edge('all', 'super-region-1', weight=.1)
    hierarchy.add_edge('super-region-1', 'NAHI', weight=.1)
    hierarchy.add_edge('NAHI', 'CAN', weight=.1)
    hierarchy.add_edge('NAHI', 'USA', weight=.1)
    # 12 prediction rows: 6 year/sex combinations for each of CAN and USA
    output_template=pandas.DataFrame(dict(year=[1990, 1990, 2005, 2005, 2010, 2010]*2,
                                          sex=['male', 'female']*3*2,
                                          x_0=[.5]*6*2,
                                          x_1=[0.]*6*2,
                                          x_2=[.5]*6*2,
                                          pop=[50.]*6*2,
                                          area=['CAN']*6 + ['USA']*6))
    

    # create model and priors
    # (note: the local name 'vars' shadows the builtin inside this function)
    vars = data_model.data_model('test', data, hierarchy, 'all')


    # fit model
    # MAP fit first, then MCMC
    mc.MAP(vars).fit(method='fmin_powell', verbose=1)
    m = mc.MCMC(vars)
    m.use_step_method(mc.AdaptiveMetropolis, [m.gamma_bar, m.gamma, m.beta])
    m.sample(30000, 15000, 15)

    # check estimates
    # the template covariates give x . beta_true = .5*(-.1) + 0*.1 + .5*.2 = .05
    pi_usa = data_model.predict_for(output_template, hierarchy, 'all', 'USA', 'male', 1990, vars)
    assert pl.allclose(pi_usa.mean(), (m.mu_age.trace()*pl.exp(.05)).mean(), rtol=.1)

    # check convergence
    print 'gamma mc error:', m.gamma_bar.stats()['mc error'].round(2), m.gamma.stats()['mc error'].round(2)


    # plot results
    # one red segment per measured interval, the truth in green, the fitted
    # age pattern with its 95% HPD interval in black, the USA prediction in red
    for a_0i, a_1i, p_i in zip(age_start, age_end, p):
        pl.plot([a_0i, a_1i], [p_i,p_i], 'rs-', mew=1, mec='w', ms=4)
    pl.plot(a, pi_age_true, 'g-', linewidth=2)
    pl.plot(pl.arange(101), m.mu_age.stats()['mean'], 'k-', drawstyle='steps-post', linewidth=3)
    pl.plot(pl.arange(101), m.mu_age.stats()['95% HPD interval'], 'k', linestyle='steps-post:')
    pl.plot(pl.arange(101), pi_usa.mean(0), 'r-', linewidth=2, drawstyle='steps-post')
    pl.savefig('age_integrating_sim.png')

    # compare estimate to ground truth (skip endpoints, because they are extra hard to get right)
    assert pl.allclose(m.pi.stats()['mean'][10:-10], pi_true[10:-10], rtol=.2)
    lb, ub = m.pi.stats()['95% HPD interval'].T
    # more than 75% of the interior rows must have the truth inside the interval
    assert pl.mean((lb <= pi_true)[10:-10] & (pi_true <= ub)[10:-10]) > .75
Exemple #30
0
# <codecell>

# Raw string: in a normal literal "\a" is the BEL control character, so
# "\all" would not be printed as written.
print(r'$m_{\all}$ is as low as %.0f per 10,000 PY at age %d, but rises to %.0f per 10,000 PY at age %d.' % (min_mx*10000, age_min_mx, max_mx*10000, age_max_mx))

# <codecell>

# fix the seed so the simulated data are reproducible
mc.np.random.seed(1234567)

# true curve: linear interpolation in log space between the knot values,
# evaluated at every integer age 0..100
ages = pl.arange(101)
knots = [0, 15, 60, 100]
import scipy.interpolate
Y_true = pl.exp(scipy.interpolate.interp1d(knots, pl.log([1.2, .3, .6, 1.5]), kind='linear')(ages))

# N noisy observations: one age drawn uniformly from each of N consecutive
# bins of width 100/N (truncated to int), observed with parameter tau = .1**-2
N = 50
tau = .1**-2
X = pl.array(mc.runiform(pl.arange(0., 100., 100./N), 100./N + pl.arange(0., 100., 100./N), size=N), dtype=int)
Y = mc.rnormal(Y_true[X], tau)

# <codecell>

def decorate_figure():
    """Apply the common legend, y-axis ticks and label, margins and axis limits."""
    pl.legend(fancybox=True, shadow=True, loc='lower right')

    # font-size overrides ('x-large' / 'xx-large') are deliberately left off
    y_ticks = [0., .5, 1., 1.5]
    pl.yticks(y_ticks)
    pl.ylabel('$h(a)$', rotation=0)

    margins = (.1, .175, .98, .875, .275)
    pl.subplots_adjust(*margins)

    axis_limits = [-5, 105, 0., 1.7]
    pl.axis(axis_limits)
    

# <codecell>
Exemple #31
0
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], 
                           true=dict(i=quadratic, f=constant, r=constant)):
    """Simulate multi-type data with area effects, fit the consistent model, and score it.

    Parameters
    ----------
    N : int
        Number of rows of simulated input data.
    delta_true : float
        Second argument passed to mc.rnegative_binomial when simulating the
        observed values; compared against exp(eta) of each data type.
    sigma_true : list of float
        Passed to alpha_true_sim and used as the 'true' column of the sigma
        table.  Only read, never modified.
    true : dict
        Maps each of 'i', 'r', 'f' to a function of age giving the true rate
        at the knots.  Only read, never modified.

    Returns
    -------
    model
        The simulated model with input_data, delta, alpha, sigma, mu and
        results attached.
    """
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.
    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    # build a model on the placeholder data and pin its knots to the truth
    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]


    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t*age_weights)
    
        # interval average read off as a difference of cumulative sums
        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        single_age = age_start == age_end
        if pl.any(single_age):
            p_t[single_age] = mu_t[age_start[single_age]]

        # copy part into p
        p[data_type==t] = p_t[data_type==t]


    # add covariate shifts
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.hierarchy = gbd_model.hierarchy

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = {}
    for t in types:
        alpha[t] = alpha_true_sim(model, area_list, sigma_true)
    print(json.dumps(alpha, indent=2))

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]
    
    # Shift each row by the effects of every area on the path from 'all' to
    # the row's area.  The membership test must look inside alpha[t]: alpha
    # itself is keyed by data type, so testing `node in alpha` never matched
    # an area name and no shift was ever applied.
    for row, area in model.input_data['area'].iteritems():
        t = data_type[row]
        p[row] = p[row] * pl.exp(pl.sum([alpha[t][node] for node in nx.shortest_path(model.hierarchy, 'all', area) if node in alpha[t]]))

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    # overlay the truth on each panel: a wide white line under a thin red one
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i+1)
        pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)

    pl.show()

    # Collect predictions in plain arrays and assign each column once.
    # Writing through model.input_data['mu_pred'][mask] is chained assignment,
    # which may modify a temporary copy instead of the frame.
    mu_pred = pl.zeros(N)
    sigma_pred = pl.zeros(N)
    for t in types:
        rows_t = data_type == t
        pred_stats = model.vars[t]['p_pred'].stats()
        mu_pred[rows_t] = pred_stats['mean']
        sigma_pred[rows_t] = pred_stats['standard deviation']
    model.input_data['mu_pred'] = mu_pred
    model.input_data['sigma_pred'] = sigma_pred
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    # stack the per-type alpha and sigma tables
    model.alpha = pandas.DataFrame()
    model.sigma = pandas.DataFrame()
    for t in types:
        alpha_t = pandas.DataFrame(index=[node for node in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
        alpha_t['true'] = pandas.Series(dict(alpha[t]))
        alpha_t['mu_pred'] = pandas.Series([node.stats()['mean'] for node in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['sigma_pred'] = pandas.Series([node.stats()['standard deviation'] for node in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['type'] = t
        model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True)

        sigma_t = pandas.DataFrame(dict(true=sigma_true))
        sigma_t['mu_pred'] = [node.stats()['mean'] for node in model.vars[t]['sigma_alpha']]
        sigma_t['sigma_pred'] = [node.stats()['standard deviation'] for node in model.vars[t]['sigma_alpha']]
        model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True)

    data_simulation.add_quality_metrics(model.alpha)
    data_simulation.add_quality_metrics(model.sigma)


    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean()))

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print('\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean()))
    print('')


    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print(model.results)

    return model
def validate_age_integrating_model_sim(N=500,
                                       delta_true=.15,
                                       pi_true=quadratic):
    """Simulate age-interval data from a known age pattern, fit it, and score the fit.

    N rows are drawn over random age intervals with random effective sample
    sizes, the observed values are sampled with mc.rnegative_binomial, the
    'p' data model is fitted by MCMC, and quality metrics for delta, the age
    pattern and the data predictions are collected in model.results.
    Returns the model.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    model = data_simulation.simple_model(N)
    # (the age mesh and smoothness settings are left at their defaults here)

    # random draws happen in this order: starts, ends, sample sizes, values
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # interval averages read off as differences of cumulative sums
    age_weights = pl.ones_like(ages)
    cum_weighted_pi = pl.cumsum(pi_age_true * age_weights)
    cum_weight = pl.cumsum(age_weights)
    weighted_total = cum_weighted_pi[age_end] - cum_weighted_pi[age_start]
    weight_total = cum_weight[age_end] - cum_weight[age_start]
    p = weighted_total / weight_total

    # correct cases where age_start == age_end
    single_age = age_start == age_end
    if pl.any(single_age):
        p[single_age] = pi_age_true[age_start[single_age]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    fitted = model.vars['p']
    fit_options = dict(iter=10000, burn=5000, thin=25, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(fitted, **fit_options)

    # diagnostic plots, with the true age pattern overlaid on the last one
    graphics.plot_one_ppc(fitted, 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, fitted, {}, 'p')
    pl.plot(ages, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictions for every data row
    pred_stats = fitted['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # delta is compared against exp(eta) over the trace
    delta_trace = pl.exp(fitted['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_trace.mean()
    model.delta['sigma_pred'] = delta_trace.std()
    data_simulation.add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    summary = (model.input_data['abs_err'].mean(),
               pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
               model.input_data['covered?'].mean())
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % summary)

    # age-pattern estimate versus the truth
    mu_age_stats = fitted['mu_age'].stats()
    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=mu_age_stats['mean'],
             sigma_pred=mu_age_stats['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    for table_name in ('delta', 'mu', 'input_data'):
        data_simulation.add_to_results(model, table_name)
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    print(model.results)

    return model
Exemple #33
0
where p(x) stands for the prior for the true input and p(a,b) the prior for the
regression parameters.
"""
from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos,array
import pylab

# ------------------------------------------------------------------------------
# Synthetic values
# Replace by real data
# ------------------------------------------------------------------------------
# Parameters of the straight line used to generate the synthetic data set.
slope = 1.5
intercept = 4
# Number of synthetic data points.
N = 30
# Noise-free inputs drawn uniformly on [0, 50], and the matching points on the line.
true_x = runiform(0,50, N)
true_y = slope*true_x + intercept
# Both coordinates are perturbed: data_y is centred on true_y and data_x on
# true_x.  The draw order (y first, then x) determines the random stream.
# NOTE(review): PyMC's rnormal is parameterised by a precision (tau), not a
# standard deviation -- confirm that 2 is meant as a precision here.
data_y = rnormal(true_y, 2)
data_x = rnormal(true_x, 2)



# ------------------------------------------------------------------------------
# Calibration of straight line parameters from data
# ------------------------------------------------------------------------------


@stochastic
def theta(value=array([2.,5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
Exemple #34
0
# Script fragment: fills model.input_data with simulated rows whose rate
# depends on a binary covariate.  N, pi_true and beta_true are not defined in
# this fragment; they must already be in scope.

# start with a simple model with N rows of data
model = data_simulation.simple_model(N)


# set covariate to 0/1 values randomly
# (each value equally likely; the multiplication by 1. stores it as a float)
model.input_data['x_cov'] = 1. * mc.rcategorical([.5, .5], size=N)

# record the true age-specific rates
# (one value per integer age 0..100)
model.ages = pl.arange(0, 101, 1)
model.pi_age_true = pi_true(model.ages)


# choose age groups randomly
# age_width is all zeros, so every interval collapses to a single age:
# age_start and age_end are both the integer part of age_mid.
age_width = pl.zeros(N)
age_mid = mc.runiform(age_width/2, 100-age_width/2, size=N)
age_start = pl.array(age_mid - age_width/2, dtype=int)
age_end = pl.array(age_mid + age_width/2, dtype=int)

model.input_data['age_start'] = age_start
model.input_data['age_end'] = age_end


# choose effective sample size uniformly at random
# (between 100 and 10000)
n = mc.runiform(100, 10000, size=N)
model.input_data['effective_sample_size'] = n


# find true rate, with covariate
# (true rate at the row's starting age, scaled by exp(x_cov * beta_true))
p = model.pi_age_true[age_start] * pl.exp(model.input_data['x_cov']*beta_true)
import numpy as np
import matplotlib.pyplot as plt
import pymc as mc
import scipy.stats as stats
import math

# http://blog.yhathq.com/posts/estimating-user-lifetimes-with-pymc.html

# artificial data
# N individuals, each with a Weibull(true_alpha, true_beta) lifetime and a
# birth time drawn uniformly on [0, 10].
N = 20
true_alpha = 2
true_beta = 5
lifetime = mc.rweibull(true_alpha, true_beta, size=N)
birth = mc.runiform(0, 10, N)

# an individual is right censored if this is true
# (i.e. birth + lifetime falls after time 10)
censor = (birth + lifetime) > 10
# Mask the censored lifetimes; masked entries read as 10 when the array is filled.
lifetime_ = np.ma.masked_array(lifetime, censor)
lifetime_.set_fill_value(10)

# Plot one horizontal segment per individual from birth to birth + lifetime,
# with a marker at the end of each (uncensored) lifetime.
plt.clf()
y = np.arange(0, N)
for b, l, yy in zip(birth, lifetime, y):
    plt.plot([b, b + l], [yy, yy])
plt.plot(birth + lifetime, y, linestyle="", marker="o")
plt.draw()
# block=False: show the figure without stopping the script.
plt.show(block=False)

# begin the model
# just use uniform priors
# (alpha is uniform on [0, 20])
alpha = mc.Uniform("alpha", 0, 20)
Exemple #36
0
### @export 'initialize'
# Observed data to plot; the file is read from the current working directory.
df = pandas.read_csv('ssas_mx.csv', index_col=None)

# Integer ages 0..100 and the knot ages of the synthetic "true" curve.
ages = pl.arange(101)
knots = [0, 15, 60, 100]
import scipy.interpolate

# True curve: linear interpolation of the log of the knot values, then
# exponentiated, evaluated at every integer age.
Y_true = pl.exp(
    scipy.interpolate.interp1d(knots,
                               pl.log([1.2, .3, .6, 1.5]),
                               kind='linear')(ages))

# Simulate N noisy observations of the true curve.
N = 50
# tau evaluates to 100.
# NOTE(review): presumably a precision, i.e. a standard deviation of 0.1 -- confirm.
tau = .1**-2
# One age per bin: the i-th draw is uniform on [i*100/N, (i+1)*100/N) and is
# then truncated to an integer so it can index Y_true.
X = pl.array(mc.runiform(pl.arange(0., 100., 100. / N),
                         100. / N + pl.arange(0., 100., 100. / N),
                         size=N),
             dtype=int)
Y = mc.rnormal(Y_true[X], tau)

### @export 'initial-rates'
pl.figure(figsize=(17., 11), dpi=72)

dismod3.graphics.plot_data_bars(df, 'talk')
# NOTE(review): plotting a single point with semilogy looks like a way to
# switch the y axis to log scale -- confirm.
pl.semilogy([0], [.1], '-')

pl.title(
    'All-cause mortality rate\nin 1990 for females\nin sub-Saharan Africa, Southern.',
    size=55)
pl.ylabel('Rate (Per PY)', size=48)
pl.xlabel('Age (Years)', size=48)
def validate_consistent_model_sim(N=500,
                                  delta_true=.5,
                                  true=dict(i=quadratic,
                                            f=constant,
                                            r=constant)):
    """Simulate data from a consistent model, refit it, and score the fit.

    A consistent model is built on a coarse age mesh and the ``gamma`` knot
    values of its 'i', 'r' and 'f' components are set to the log of the
    corresponding ``true`` curve.  Age-interval data of types 'i', 'r', 'f'
    and 'p' are then drawn around the resulting ``mu_age`` patterns, a fresh
    consistent model is fit to them, and the posterior summaries are compared
    with the simulated truth.

    Parameters
    ----------
    N : int
        Number of simulated data rows.
    delta_true : float
        Multiplies ``n * p`` in the second argument of the negative binomial
        draw; also the "true" value that ``exp(eta)`` is scored against.
    true : dict
        Maps each of 'i', 'r', 'f' to a callable of a knot age; its log is
        used as the knot value of the generating model.  The mapping is only
        read, never modified.

    Returns
    -------
    model
        The simulated model with ``input_data``, ``vars``, ``map``, ``mcmc``,
        ``delta``, ``mu`` and ``results`` attached.

    Notes
    -----
    Opens plot windows (``pl.show()``) and prints summaries to stdout.
    """
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    # generating model: pin the knot values of i, r and f to the truth
    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    # random age intervals with age_start <= age_end (before integer truncation)
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # each row gets one of the data types, all equally likely
    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) /
                                      float(len(types)),
                                      size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    # rows whose interval is a single age; the interval average below is 0/0
    # for these and is replaced by the point value
    point_interval = age_start == age_end

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t * age_weights)

        # weighted average of mu_t over each age interval
        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] -
                                                             sum_wt[age_start])

        # correct cases where age_start == age_end
        if pl.any(point_interval):
            p_t[point_interval] = mu_t[age_start[point_interval]]

        # copy part into p
        is_type_t = data_type == t
        p[is_type_t] = p_t[is_type_t]
    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    # coarse knot spacing for fast testing (re-asserted before building the
    # model that will be fit)
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total',
                                                   'all', {})
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars,
                                                           iter=10000,
                                                           burn=5000,
                                                           thin=25,
                                                           tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i + 1)
        # wide white line under a thin red one so the truth stands out
        pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)

    #graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    #pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        # Both summaries must come from the predictions of the SAME data type
        # as the rows being filled (the original took sigma_pred from 'p'
        # for every type).
        pred_stats = model.vars[t]['p_pred'].stats()
        is_type_t = data_type == t
        model.input_data['mu_pred'][is_type_t] = pred_stats['mean']
        model.input_data['sigma_pred'][is_type_t] = pred_stats[
            'standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # one dispersion estimate per data type ('rr' never occurs in types; the
    # filter is kept from the original)
    delta_types = [t for t in types if t != 'rr']
    delta_traces = [pl.exp(model.vars[t]['eta'].trace()) for t in delta_types]
    model.delta = pandas.DataFrame(dict(true=[delta_true
                                              for t in delta_types]))
    model.delta['mu_pred'] = [trace.mean() for trace in delta_traces]
    model.delta['sigma_pred'] = [trace.std() for trace in delta_traces]
    data_simulation.add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))

    # stack the age-pattern summaries of all types into one table
    model.mu = pandas.DataFrame()
    for t in types:
        mu_stats = model.vars[t]['mu_age'].stats()
        model.mu = model.mu.append(pandas.DataFrame(
            dict(true=sim[t]['mu_age'].value,
                 mu_pred=mu_stats['mean'],
                 sigma_pred=mu_stats['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print('\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.mu['abs_err'].mean(),
        pl.median(pl.absolute(
            model.mu['rel_err'].dropna())), model.mu['covered?'].mean()))
    print('')

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print(model.results)

    return model
Exemple #38
0
def validate_covariate_model_dispersion(N=1000,
                                        delta_true=.15,
                                        pi_true=.01,
                                        zeta_true=[.5, -.5, 0.]):
    """Simulate data whose dispersion depends on covariates, fit, and score.

    Every row has the same true rate ``pi_true``; its dispersion is
    ``delta_true * exp(Z . zeta_true)`` where ``Z`` holds random 0/1
    covariates.  A data model of type 'p' is fit to the simulated values and
    the posterior summaries of the predictions, of ``zeta`` and of
    ``exp(eta)`` are compared with the simulated truth.

    Parameters
    ----------
    N : number of simulated rows.
    delta_true : baseline dispersion factor.
    pi_true : constant true rate stored in the 'true' column.
    zeta_true : one coefficient per 0/1 covariate (read only).

    Returns
    -------
    The model, with ``input_data``, ``vars``, ``map``, ``mcmc``, ``zeta``,
    ``delta`` and ``results`` attached.  Opens plot windows and prints
    summaries to stdout along the way.
    """
    ## generate simulated data
    ages = pl.arange(0, 100, 1)
    # flat age pattern; computed as in the original but not used further below
    pi_age_true = pi_true * pl.ones_like(ages)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # random 0/1 covariates (as floats), one column per coefficient
    n_cov = len(zeta_true)
    Z = mc.rbernoulli(.5, size=(N, n_cov)) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for j in range(n_cov):
        model.input_data['z_%d' % j] = Z[:, j]

    model.input_data['true'] = pi_true
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    # observed value: negative binomial count divided by the sample size
    ess = model.input_data['effective_sample_size']
    rate = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(ess * rate,
                                                      delta * ess * rate) / ess

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    p_vars = data_model.data_model('p', model, 'p', 'all', 'total', 'all',
                                   None, None, None)
    model.vars['p'] = p_vars
    model.map, model.mcmc = fit_model.fit_data_model(p_vars,
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(p_vars, 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    # posterior predictive summaries for every data row
    pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    add_quality_metrics(model.input_data)

    # covariate coefficients on dispersion
    zeta_stats = p_vars['zeta'].stats()
    model.zeta = pandas.DataFrame(index=p_vars['Z'].columns)
    model.zeta['true'] = zeta_true
    model.zeta['mu_pred'] = zeta_stats['mean']
    model.zeta['sigma_pred'] = zeta_stats['standard deviation']
    add_quality_metrics(model.zeta)

    print('\nzeta')
    print(model.zeta)

    # baseline dispersion, summarised from the trace of exp(eta)
    delta_trace = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_trace.mean()
    model.delta['sigma_pred'] = delta_trace.std()
    add_quality_metrics(model.delta)

    print('delta')
    print(model.delta)

    data_bias = model.input_data['abs_err'].mean()
    data_mare = pl.median(pl.absolute(model.input_data['rel_err'].dropna()))
    data_coverage = model.input_data['covered?'].mean()
    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' %
          (data_bias, data_mare, data_coverage))

    effect_mae = pl.median(pl.absolute(model.zeta['abs_err'].dropna()))
    effect_coverage = model.zeta.dropna()['covered?'].mean()
    print('effect prediction MAE: %.3f, coverage: %.2f' %
          (effect_mae, effect_coverage))

    # collect the quality metrics into one results table
    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    for table_name in ('delta', 'input_data', 'zeta'):
        add_to_results(model, table_name)
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    return model
def validate_ai_re(N=500,
                   delta_true=.15,
                   sigma_true=[.1, .1, .1, .1, .1],
                   pi_true=quadratic,
                   smoothness='Moderately',
                   heterogeneity='Slightly'):
    """Simulate age-interval data with area effects, fit, and score.

    Rows get random age intervals and a random area from a fixed list; the
    true value of a row is the interval average of ``pi_true`` multiplied by
    the exponential of the summed ``alpha`` effects found on the hierarchy
    path from 'all' to the row's area.  A data model of type 'p' for
    'north_africa_middle_east' is then fit and its posterior summaries are
    compared with the simulated truth.

    Parameters
    ----------
    N : number of simulated rows.
    delta_true : multiplies ``n * p`` in the negative binomial draw; also the
        "true" value that ``exp(eta)`` is scored against.
    sigma_true : passed to ``alpha_true_sim`` and used as the "true" column
        of the sigma table (read only).
    pi_true : callable evaluated on the integer ages 0..100.
    smoothness, heterogeneity : stored in ``model.parameters['p']``.

    Returns
    -------
    The model, with ``input_data``, ``vars``, ``map``, ``mcmc``, ``delta``,
    ``alpha``, ``sigma``, ``mu`` and ``results`` attached.  Opens plot
    windows and prints summaries to stdout along the way.
    """
    ## generate simulated data
    ages = pl.arange(0, 101, 1)
    pi_age_true = pi_true(ages)

    import dismod3
    import simplejson as json
    # a default disease model is loaded only to borrow its area hierarchy
    gbd_model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_model.hierarchy

    p_params = model.parameters['p']
    p_params['parameter_age_mesh'] = range(0, 101, 10)
    p_params['smoothness'] = dict(amount=smoothness)
    p_params['heterogeneity'] = heterogeneity

    # random age intervals with age_start <= age_end (before integer truncation)
    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    # weighted average of the true age pattern over each interval
    age_weights = pl.ones_like(ages)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    interval_rate = sum_pi_wt[age_end] - sum_pi_wt[age_start]
    interval_weight = sum_wt[age_end] - sum_wt[age_start]
    p = interval_rate / interval_weight

    # correct cases where age_start == age_end
    single_age = age_start == age_end
    if pl.any(single_age):
        p[single_age] = pi_age_true[age_start[single_age]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array([
        'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT',
        'IRN', 'IRQ', 'JOR', 'SYR'
    ])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print(alpha)

    model.input_data['true'] = pl.nan

    # every row gets one of the listed areas, all equally likely
    n_areas = len(area_list)
    area_probs = pl.ones(n_areas) / float(n_areas)
    model.input_data['area'] = area_list[mc.rcategorical(area_probs, N)]

    # shift each row's rate by the effects on the path from 'all' to its area
    for row, area in model.input_data['area'].iteritems():
        path = nx.shortest_path(model.hierarchy, 'all', area)
        log_shift = pl.sum([alpha[node] for node in path if node in alpha])
        model.input_data['true'][row] = p[row] * pl.exp(log_shift)
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    p_vars = data_model.data_model('p', model, 'p',
                                   'north_africa_middle_east', 'total', 'all',
                                   None, None, None)
    model.vars['p'] = p_vars
    # (a much shorter chain -- iter=1005, burn=500, thin=5 -- was left
    # commented out in the original as a quick alternative)
    model.map, model.mcmc = fit_model.fit_data_model(p_vars,
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=25,
                                                     tune_interval=100)

    graphics.plot_one_ppc(p_vars, 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, p_vars, {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    # posterior predictive summaries for every data row
    pred_stats = p_vars['p_pred'].stats()
    model.input_data['mu_pred'] = pred_stats['mean']
    model.input_data['sigma_pred'] = pred_stats['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    # dispersion, summarised from the trace of exp(eta)
    delta_trace = pl.exp(p_vars['eta'].trace())
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = delta_trace.mean()
    model.delta['sigma_pred'] = delta_trace.std()
    data_simulation.add_quality_metrics(model.delta)

    # area effects: indexed by every hierarchy node, then reduced to the rows
    # that have a true value and an estimate
    effect_names = p_vars['U'].columns
    alpha_stats = [node.stats() for node in p_vars['alpha']]
    model.alpha = pandas.DataFrame(
        index=list(nx.traversal.dfs_preorder_nodes(model.hierarchy)))
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series(
        [s['mean'] for s in alpha_stats], index=effect_names)
    model.alpha['sigma_pred'] = pandas.Series(
        [s['standard deviation'] for s in alpha_stats], index=effect_names)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    # spread of the area effects
    sigma_stats = [node.stats() for node in p_vars['sigma_alpha']]
    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [s['mean'] for s in sigma_stats]
    model.sigma['sigma_pred'] = [s['standard deviation'] for s in sigma_stats]
    data_simulation.add_quality_metrics(model.sigma)

    print('delta')
    print(model.delta)

    print('\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean()))

    # age pattern of the fitted model against the truth
    mu_stats = p_vars['mu_age'].stats()
    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=mu_stats['mean'],
             sigma_pred=mu_stats['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    # collect the quality metrics into one results table
    data_simulation.initialize_results(model)
    for table_name in ('delta', 'mu', 'input_data', 'alpha', 'sigma'):
        data_simulation.add_to_results(model, table_name)
    data_simulation.finalize_results(model)

    print(model.results)

    return model