Ejemplo n.º 1
0
def test_predict_for_wo_data():
    """Check predict_for on a prevalence model that has no input data.

    Steps:

    1. Build an all/total/all model for 'p' from the small output
       template only (no input data is attached to the ModelData).
    2. Draw a single MCMC sample so every node has a one-entry trace.
    3. Predict USA/male/1990 with constant, all-zero random effects.
    4. Predict again with constant non-zero random effects and compare
       with mu_age scaled by the expected fixed and random effects.

    Note that no step method is configured here, so beta[0] takes
    whatever value the sampler leaves it with (see FIXME below).
    """
    model_data = data.ModelData()
    model_data.hierarchy, model_data.output_template = data_simulation.small_output()

    # create model and priors
    model_vars = ism.age_specific_rate(model_data, 'p', 'all', 'total', 'all',
                                       None, None, None)

    # fit model: a single draw is enough to populate the traces
    sampler = mc.MCMC(model_vars)
    sampler.sample(1)

    def predict_usa_male_1990():
        # parameters are looked up at call time, so prior changes are seen
        return covariate_model.predict_for(
            model_data, model_data.parameters['p'],
            'all', 'total', 'all',
            'USA', 'male', 1990,
            0., model_vars['p'], 0., pl.inf)

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # constant priors that zero out the random effect of every level
    model_data.parameters['p']['random_effects'] = {}
    re_priors = model_data.parameters['p']['random_effects']
    for node in ['USA', 'NAHI', 'super-region-1', 'all']:
        re_priors[node] = dict(dist='Constant', mu=0, sigma=1.e-9)

    # the result of this call is not checked; it only has to succeed
    pred = predict_usa_male_1990()

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients
    # FIXME: this test was failing because PyMC is drawing from the prior of beta[0] even though I asked for NoStepper

    # random effects of .1, .2, .3 for USA, NAHI, super-region-1
    for rank, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        re_priors[node]['mu'] = rank / 10.

    pred = predict_usa_male_1990()

    # expected value uses the current beta[0], since it is not held fixed
    fe_usa_1990 = pl.exp(.5 * model_vars['p']['beta'][0].value)
    re_usa_1990 = pl.exp(.1 + .2 + .3)
    assert_almost_equal(
        pred,
        model_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990)
Ejemplo n.º 2
0
def test_predict_for_wo_data():
    """Exercise predict_for on a model built without any input data.

    1. Create an all/total/all prevalence model from the small output
       template (the ModelData carries no input data).
    2. Take one MCMC sample to generate one-entry traces.
    3. Predict for USA/male/1990 with constant zero random effects.
    4. Predict again with constant non-zero random effects and confirm
       the result equals mu_age times the expected effect multipliers.

    No step method is assigned in this test; beta[0] is therefore read
    back from the model when the expected value is computed.
    """
    md = data.ModelData()
    md.hierarchy, md.output_template = data_simulation.small_output()

    # create model and priors
    rate_vars = ism.age_specific_rate(
        md, 'p', 'all', 'total', 'all', None, None, None)

    # fit model
    mcmc = mc.MCMC(rate_vars)
    mcmc.sample(1)

    # positional arguments that follow (model, parameters) in predict_for
    reference_and_target = ('all', 'total', 'all', 'USA', 'male', 1990)
    tail_args = (0., rate_vars['p'], 0., pl.inf)

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # zero out REs at every level of the hierarchy above and including USA
    md.parameters['p']['random_effects'] = {}
    for level in ['USA', 'NAHI', 'super-region-1', 'all']:
        md.parameters['p']['random_effects'][level] = {
            'dist': 'Constant',
            'mu': 0,
            'sigma': 1.e-9,
        }

    # result is overwritten below without being asserted on
    pred = covariate_model.predict_for(
        *((md, md.parameters['p']) + reference_and_target + tail_args))

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients
    # FIXME: this test was failing because PyMC is drawing from the prior of beta[0] even though I asked for NoStepper

    # set the RE means to .1 (USA), .2 (NAHI) and .3 (super-region-1)
    for position, level in enumerate(['USA', 'NAHI', 'super-region-1']):
        md.parameters['p']['random_effects'][level]['mu'] = (position + 1.) / 10.

    pred = covariate_model.predict_for(
        *((md, md.parameters['p']) + reference_and_target + tail_args))

    # test that the predicted value is as expected
    beta_0 = rate_vars['p']['beta'][0].value  # not held fixed, see FIXME above
    fe_usa_1990 = pl.exp(.5 * beta_0)
    re_usa_1990 = pl.exp(.1 + .2 + .3)
    assert_almost_equal(
        pred, rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990)
Ejemplo n.º 3
0
def test_covariate_model_shift_for_root_consistency():
    """Smoke-test predict_for with an all/male/1990 reference.

    Simulated prevalence data are fitted with three MCMC samples and a
    prediction for USA/male/1990 is generated.  Nothing is asserted on
    the prediction; the test passes when no call raises.
    """
    # generate simulated data
    sample_count = 50
    noise_sd = .025
    ages = pl.arange(0, 100, 1)
    true_pi_age = .0001 * (ages * (100. - ages) + 100.)

    model_data = data.ModelData()
    model_data.input_data = data_simulation.simulated_age_intervals(
        'p', sample_count, ages, true_pi_age, noise_sd)
    model_data.hierarchy, model_data.output_template = data_simulation.small_output()

    # create model and priors; the all/total/all model is built first and
    # then replaced by the all/male/1990 model that is actually fitted
    rate_vars = ism.age_specific_rate(
        model_data, 'p', 'all', 'total', 'all', None, None, None)
    rate_vars = ism.age_specific_rate(
        model_data, 'p', 'all', 'male', 1990, None, None, None)

    # fit model
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)

    # check estimates (the call only has to complete)
    pi_usa = covariate_model.predict_for(
        model_data, model_data.parameters['p'],
        'all', 'male', 1990,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)
Ejemplo n.º 4
0
def test_data_model_sim():
    """Smoke test: fit simulated data, then rebuild the model with an empirical prior.

    1. Simulate prevalence data from a known age pattern.
    2. Build and sample (3 draws) an all/total/all model.
    3. Predict USA/male/1990 from that fit.
    4. Rebuild the model using the mean and standard deviation of the
       prediction (taken over axis 0) as mu_age_parent / sigma_age_parent.

    Nothing is asserted; the test passes when every step runs.
    """
    # generate simulated data
    data_type = 'p'
    n = 50
    sigma_true = .025
    a = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (a * (100. - a) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(data_type, n, a, pi_age_true, sigma_true)
    d.hierarchy, d.output_template = data_simulation.small_output()

    # create model and priors
    model_vars = ism.age_specific_rate(d, data_type,
                                       reference_area='all', reference_sex='total', reference_year='all',
                                       mu_age=None, mu_age_parent=None, sigma_age_parent=None)

    # fit model
    m = mc.MCMC(model_vars)
    m.sample(3)

    # check estimates
    # Pass the parameter dict of this data type rather than the whole
    # d.parameters mapping, so that any effect priors stored under
    # d.parameters[data_type] are the ones predict_for looks at.
    pi_usa = covariate_model.predict_for(d, d.parameters[data_type],
                                         'all', 'total', 'all',
                                         'USA', 'male', 1990,
                                         0., model_vars[data_type], -pl.inf, pl.inf)

    # create model w/ emp prior taken from the prediction above
    model_vars = ism.age_specific_rate(d, data_type,
                                       reference_area='all', reference_sex='total', reference_year='all',
                                       mu_age=None, mu_age_parent=pi_usa.mean(0), sigma_age_parent=pi_usa.std(0))
Ejemplo n.º 5
0
def fit_simulated(dm, area, sex, year):
    """Fit the consistent model held in dm.vars and store predictions on dm.

    The fit result is stored as dm.map and dm.mcmc.  A prediction for
    each of the types i, r, f, p, rr and pf is then made with
    area/sex/year used both as reference and as prediction target, and
    the results are stored in dm.posteriors keyed by type.
    """
    # for a quick run use: iter=101, burn=0, thin=1, tune_interval=100
    sampler_settings = dict(iter=10000, burn=5000, thin=25, tune_interval=100)
    dm.map, dm.mcmc = fit_model.fit_consistent_model(dm.vars, **sampler_settings)

    # collect everything locally first; dm.posteriors is set only at the end
    predictions = {}
    for rate_type in ['i', 'r', 'f', 'p', 'rr', 'pf']:
        predictions[rate_type] = covariate_model.predict_for(
            dm.model,
            area, sex, year,
            area, sex, year,
            1., dm.vars[rate_type], 0., pl.inf)
    dm.posteriors = predictions
Ejemplo n.º 6
0
def fit_simulated(dm, area, sex, year):
    """Run the consistent fit for dm.vars and attach posterior predictions.

    Sets dm.map and dm.mcmc from fit_model.fit_consistent_model, then
    sets dm.posteriors to a dict with one predict_for result per type
    in i, r, f, p, rr, pf.  The given area/sex/year serve as both the
    reference and the prediction target.
    """
    # quick alternative: iter=101, burn=0, thin=1, tune_interval=100
    fit_result = fit_model.fit_consistent_model(
        dm.vars, iter=10000, burn=5000, thin=25, tune_interval=100)
    dm.map, dm.mcmc = fit_result

    target = (area, sex, year)
    estimates = {}
    for kind in ('i', 'r', 'f', 'p', 'rr', 'pf'):
        # reference and prediction target are the same area/sex/year
        call_args = (dm.model,) + target + target + (1., dm.vars[kind], 0., pl.inf)
        estimates[kind] = covariate_model.predict_for(*call_args)
    dm.posteriors = estimates
Ejemplo n.º 7
0
def test_predict_for_wo_effects():
    """Check that predict_for returns mu_age when covariates are excluded.

    1. Simulate a handful of prevalence observations.
    2. Build a NAHI/male/2005 model with include_covariates=False.
    3. Assign NoStepper to every stochastic and draw 10 samples.
    4. Predict USA/male/1990 and confirm it matches the mu_age trace.
    """
    # generate simulated data
    sample_count = 5
    noise_sd = .025
    ages = pl.arange(0, 100, 1)
    true_pi_age = .0001 * (ages * (100. - ages) + 100.)

    model_data = data.ModelData()
    model_data.input_data = data_simulation.simulated_age_intervals(
        'p', sample_count, ages, true_pi_age, noise_sd)
    model_data.hierarchy, model_data.output_template = data_simulation.small_output()

    # create model and priors
    rate_vars = ism.age_specific_rate(
        model_data, 'p', 'NAHI', 'male', 2005, None, None, None,
        include_covariates=False)

    # fit model, with NoStepper assigned to every stochastic
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(10)

    ### Prediction case: prediction should match mu age
    pred = covariate_model.predict_for(
        model_data, model_data.parameters['p'],
        'NAHI', 'male', 2005,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    expected = rate_vars['p']['mu_age'].trace()
    assert_almost_equal(pred, expected)
Ejemplo n.º 8
0
def test_data_model_sim():
    """Smoke test: fit simulated data, then rebuild the model with an empirical prior.

    1. Simulate prevalence data from a known age pattern.
    2. Build and sample (3 draws) an all/total/all model.
    3. Predict USA/male/1990 from that fit.
    4. Rebuild the model using the mean and standard deviation of the
       prediction (taken over axis 0) as mu_age_parent / sigma_age_parent.

    Nothing is asserted; the test passes when every step runs.
    """
    # generate simulated data
    data_type = 'p'
    n = 50
    sigma_true = .025
    a = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (a * (100. - a) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(
        data_type, n, a, pi_age_true, sigma_true)
    d.hierarchy, d.output_template = data_simulation.small_output()

    # create model and priors
    model_vars = ism.age_specific_rate(d,
                                       data_type,
                                       reference_area='all',
                                       reference_sex='total',
                                       reference_year='all',
                                       mu_age=None,
                                       mu_age_parent=None,
                                       sigma_age_parent=None)

    # fit model
    m = mc.MCMC(model_vars)
    m.sample(3)

    # check estimates
    # Pass the parameter dict of this data type rather than the whole
    # d.parameters mapping, so that any effect priors stored under
    # d.parameters[data_type] are the ones predict_for looks at.
    pi_usa = covariate_model.predict_for(d, d.parameters[data_type], 'all',
                                         'total', 'all', 'USA', 'male', 1990,
                                         0., model_vars[data_type], -pl.inf,
                                         pl.inf)

    # create model w/ emp prior taken from the prediction above
    model_vars = ism.age_specific_rate(d,
                                       data_type,
                                       reference_area='all',
                                       reference_sex='total',
                                       reference_year='all',
                                       mu_age=None,
                                       mu_age_parent=pi_usa.mean(0),
                                       sigma_age_parent=pi_usa.std(0))
Ejemplo n.º 9
0
def test_predict_for_wo_effects():
    """Confirm predict_for reproduces mu_age for a covariate-free model.

    The model uses NAHI/male/2005 as reference and is created with
    include_covariates=False.  Every stochastic gets NoStepper before
    10 samples are drawn, and the USA/male/1990 prediction is compared
    against the trace of mu_age.
    """
    # generate simulated data
    n_rows = 5
    sigma_true = .025
    age_grid = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (age_grid * (100. - age_grid) + 100.)

    md = data.ModelData()
    md.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, age_grid, pi_age_true, sigma_true)
    md.hierarchy, md.output_template = data_simulation.small_output()

    # create model and priors
    asr_vars = ism.age_specific_rate(md, 'p', 'NAHI', 'male', 2005,
                                     None, None, None,
                                     include_covariates=False)

    # fit model; NoStepper is requested for each stochastic in turn
    mcmc = mc.MCMC(asr_vars)
    for node in mcmc.stochastics:
        mcmc.use_step_method(mc.NoStepper, node)
    mcmc.sample(10)

    ### Prediction case: prediction should match mu age
    reference = ('NAHI', 'male', 2005)
    target = ('USA', 'male', 1990)
    pred = covariate_model.predict_for(
        *((md, md.parameters['p']) + reference + target
          + (0., asr_vars['p'], 0., pl.inf)))

    assert_almost_equal(pred, asr_vars['p']['mu_age'].trace())
Ejemplo n.º 10
0
def test_covariate_model_shift_for_root_consistency():
    """Run predict_for against a model whose reference is all/male/1990.

    This is a smoke test: simulated prevalence data are fitted with
    three MCMC draws and a USA/male/1990 prediction is requested.  The
    prediction itself is not compared with anything.
    """
    # generate simulated data
    n_rows = 50
    sigma_true = .025
    age_grid = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (age_grid * (100. - age_grid) + 100.)

    md = data.ModelData()
    md.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, age_grid, pi_age_true, sigma_true)
    md.hierarchy, md.output_template = data_simulation.small_output()

    # create model and priors: first with the all/total/all reference,
    # then again with all/male/1990; only the second model is fitted
    asr_vars = ism.age_specific_rate(md, 'p', 'all', 'total', 'all',
                                     None, None, None)
    asr_vars = ism.age_specific_rate(md, 'p', 'all', 'male', 1990,
                                     None, None, None)

    # fit model
    mcmc = mc.MCMC(asr_vars)
    mcmc.sample(3)

    # check estimates (no assertion, the call just has to succeed)
    reference = ('all', 'male', 1990)
    target = ('USA', 'male', 1990)
    pi_usa = covariate_model.predict_for(
        *((md, md.parameters['p']) + reference + target
          + (0., asr_vars['p'], 0., pl.inf)))
Ejemplo n.º 11
0
def fit_emp_prior(id,
                  param_type,
                  fast_fit=False,
                  generate_emp_priors=True,
                  zero_re=True,
                  alt_prior=False,
                  global_heterogeneity='Slightly'):
    """ Fit empirical prior of specified type for specified model

    Loads the disease model, fits an age-specific rate model for one
    parameter type with reference all/total/all, stores fit diagnostics
    and (optionally) region/sex/year empirical priors on the disease
    model, writes several diagnostic figures to the job working
    directory, and finally saves the disease model as json.

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for
    fast_fit : bool
      If true, fit with iter=101, burn=0, thin=1, tune_interval=100;
      otherwise with iter=50000, burn=10000, thin=40,
      tune_interval=1000 and verbose output
    generate_emp_priors : bool
      If true, predict for every combination of gbd_regions, gbd_sexes
      and gbd_years and store 'emp_prior_mean' / 'emp_prior_std' on
      the disease model
    zero_re : bool
      Passed through unchanged to ism.age_specific_rate
    alt_prior : bool
      Passed as the positional argument that precedes the model
      variables in covariate_model.predict_for
      (NOTE(review): its meaning there is not visible from this
      function -- confirm against predict_for)
    global_heterogeneity : str
      Value written to the 'heterogeneity' entry of every parameter
      type that already has one, before fitting

    Returns
    -------
    The disease model when there is no input data of the requested
    type; otherwise the function ends without an explicit return.

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    # working directory of this job; all figures below are written here
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        # fall back to building the model data from the disease model json
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        # NOTE(review): the message below says the data was saved, but the
        # save call above is commented out
        print 'loaded data from json, saved in new format for next time in %s' % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to global_heterogeneity (default
    # 'Slightly') for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    # translate the long parameter name to the short data type code;
    # an unknown param_type raises KeyError here
    t = {
        'incidence': 'i',
        'prevalence': 'p',
        'remission': 'r',
        'excess-mortality': 'f',
        'prevalence_x_excess-mortality': 'pf'
    }[param_type]
    # keep only the rows of the requested type
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print 'No data for type %s, exiting' % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    #model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    #missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    #model.input_data['effective_sample_size'][missing_ess] = 1.
    #model.input_data['z_overdisperse'] = 1.
    #print model.describe(t)
    #model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    #model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    #model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    #model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    #generate_emp_priors = False

    print 'fitting', t
    # build the rate model with reference all/total/all; rate_type is
    # 'log_normal' only when t == 'rr', which the mapping above never
    # produces, so 'neg_binom' is what is passed here
    model.vars += ism.age_specific_rate(model,
                                        t,
                                        reference_area='all',
                                        reference_sex='total',
                                        reference_year='all',
                                        mu_age=None,
                                        mu_age_parent=None,
                                        sigma_age_parent=None,
                                        rate_type=(t == 'rr') and 'log_normal'
                                        or 'neg_binom',
                                        zero_re=zero_re)
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        # short chain without burn-in or thinning
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=101,
                                              burn=0,
                                              thin=1,
                                              tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=50000,
                                              burn=10000,
                                              thin=40,
                                              tune_interval=1000,
                                              verbose=True)

    # store the mean and standard deviation reported for p_pred next to
    # the data
    stats = dm.vars['p_pred'].stats(batches=5)
    dm.vars['data']['mu_pred'] = stats['mean']
    dm.vars['data']['sigma_pred'] = stats['standard deviation']

    # store the mc error reported for pi
    stats = dm.vars['pi'].stats(batches=5)
    dm.vars['data']['mc_error'] = stats['mc error']

    # residuals of the observed values relative to the predicted mean
    dm.vars['data'][
        'residual'] = dm.vars['data']['value'] - dm.vars['data']['mu_pred']
    dm.vars['data']['abs_residual'] = pl.absolute(dm.vars['data']['residual'])

    graphics.plot_fit(model,
                      data_types=[t],
                      ylab=['PY'],
                      plot_config=(1, 1),
                      fig_size=(8, 8))
    if generate_emp_priors:
        # one empirical prior per cleaned region name, sex and year
        for a in [
                dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions
        ]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    # use the level bounds from the parameters when
                    # present, otherwise bound predictions to [0, inf)
                    if t in model.parameters and 'level_bounds' in model.parameters[
                            t]:
                        lower = model.parameters[t]['level_bounds']['lower']
                        upper = model.parameters[t]['level_bounds']['upper']
                    else:
                        lower = 0
                        upper = pl.inf

                    # predict for this area/sex/year from the
                    # all/total/all reference
                    emp_priors = covariate_model.predict_for(
                        model, model.parameters[t], 'all', 'total', 'all', a,
                        dismod3.utils.clean(s), int(y), alt_prior, vars, lower,
                        upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))

                    if 'eta' in vars:
                        # combine the variance of the predictions with
                        # the mean of prediction**2 / exp(eta)
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([
                            pl.exp(vars['eta'].trace()) for _ in range(A)
                        ])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(
                            emp_priors.var(0) +
                            (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

                    # overlay the stored prior mean in grey, drawn
                    # behind other artists (zorder=-10)
                    pl.plot(model.parameters['ages'],
                            dm.get_mcmc('emp_prior_mean', key),
                            color='grey',
                            label=a,
                            zorder=-10,
                            alpha=.5)
    # saved regardless of generate_emp_priors
    pl.savefig(dir + '/prior-%s.png' % param_type)

    store_effect_coefficients(dm, vars, param_type)

    #graphics.plot_one_ppc(vars, t)
    #pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    # convergence, trace and effect figures, each saved after plotting
    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-%s-convergence.png' % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-%s-trace.png' % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + '/prior-%s-effects.png' % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (id, param_type))
    except IOError, e:
        # a failed save is reported but does not abort the job
        print e
Ejemplo n.º 12
0
def fit_posterior(dm,
                  region,
                  sex,
                  year,
                  fast_fit=False,
                  inconsistent_fit=False,
                  params_to_fit=['p', 'r', 'i'],
                  zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Loads the model data, installs empirical priors and effect priors
    from the disease model, restricts the input data to rows relevant
    for the requested region/sex/year, fits either separate rate models
    or one consistent model, generates predictions and saves a figure.

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
      Converted with int(); only 1990, 2005 and 2010 are handled by
      the row filter

    fast_fit : sample 101 draws from posterior, don't try for convergence (fast for testing)
      (the consistent fit uses 105 iterations instead of 101)
    inconsistent_fit : fit parameters  separately
    params_to_fit : list of params to fit, if not fitting all consistently
      (the default list is only iterated over, not modified, here)

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Notes
    -----
    No value is returned in the code visible here; the `posteriors`
    dict built near the end is not stored or returned.

    Example
    -------
    >>> import fit_posterior
    >>> dm = dismod3.load_disease_model(2552)
    >>> fit_posterior.fit_posterior(dm, 'asia_east', 'male', '2005')
    """
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        # fall back to building the model data from the disease model json
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        # NOTE(review): the message below says the data was saved, but the
        # save call above is commented out
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    # short data type code -> long parameter name used in gbd keys
    param_type = dict(i='incidence',
                      p='prevalence',
                      r='remission',
                      f='excess-mortality',
                      rr='relative-risk',
                      pf='prevalence_x_excess-mortality',
                      m_with='mortality')
    # keyed by (type, 'mu') and (type, 'sigma')
    emp_priors = {}
    for t in 'i r p f'.split():

        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the stored prior when both vectors have 101 entries
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        # start from the 'new_alpha' values of the empirical prior, then
        # let the expert priors already in the model override them
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(
            param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        # NOTE(review): if no neighbor of predict_area has a random
        # effect, the mean is taken over an empty list -- confirm this
        # case is handled as intended
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # NOTE(review): expert_fe_priors is assigned but never used, so
        # 'new_beta' values overwrite expert fixed effect priors; the
        # update below also indexes 'fixed_effects' directly and raises
        # KeyError when that entry is missing
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(
            dm.get_empirical_prior(param_type[t]).get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        """ Return True if data row r should be used for this fit

        A row is kept when its area is in the subtree of predict_area
        (or is 'all'), its years pass the rule for predict_year, and
        its sex is predict_sex or 'total'.
        """
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            # drop rows that start after 1997
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                # keep only rows overlapping 1997-2007
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # earlier selection rule, kept only to cross-check is_relevant in
    # the assertions below
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant,
                                                                  axis=1)]

    if predict_year == 1990:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            # emp_priors.get returns None when no prior was stored for t
            model.vars += ism.age_specific_rate(
                model,
                t,
                reference_area=predict_area,
                reference_sex=predict_sex,
                reference_year=predict_year,
                mu_age=None,
                mu_age_parent=emp_priors.get((t, 'mu')),
                sigma_age_parent=emp_priors.get((t, 'sigma')),
                rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=101,
                                    burn=0,
                                    thin=1,
                                    tune_interval=100)
            else:
                # NOTE(review): iter, burn and thin are not assigned in
                # this function; they presumably come from module-level
                # names outside this view (iter would otherwise be the
                # builtin) -- confirm they are defined
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=iter,
                                    burn=burn,
                                    thin=thin,
                                    tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area,
                                     reference_sex=predict_sex,
                                     reference_year=predict_year,
                                     priors=emp_priors,
                                     zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            # NOTE(review): same undefined-in-function names as in the
            # inconsistent branch above (iter, burn, thin)
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model,
                                                         iter=iter,
                                                         burn=burn,
                                                         thin=thin,
                                                         tune_interval=100,
                                                         verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            # use the level bounds from the parameters when present,
            # otherwise bound predictions to [0, inf)
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf
            # reference and prediction target are both the requested
            # area/sex/year
            posteriors[t] = covariate_model.predict_for(
                model,
                model.parameters.get(t, {}),
                predict_area,
                predict_sex,
                predict_year,
                predict_area,
                predict_sex,
                predict_year,
                True,  # population weighted averages
                model.vars[t],
                lower,
                upper)
    try:
        # NOTE(review): vars is not assigned in this function, so this is
        # a module-level name or the builtin -- confirm what plot_fit
        # should receive (model.vars?)
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png' %
                   (predict_area, predict_sex, predict_year))
    except Exception, e:
        # plotting problems are reported but do not abort the fit
        print 'Error generating output graphics'
        print e
Ejemplo n.º 13
0
# Priors on the two prevalence covariate coefficients: truncated normal
# centered at zero with sigma .01, restricted to [-1, 1].
model.parameters['p']['fixed_effects']['x_CODcorrected_Cirrhosis_ASDR'] = {
    'dist': 'TruncatedNormal', 'mu': 0, 'sigma': .01,
    'lower': -1., 'upper': 1.,
}
model.parameters['p']['fixed_effects']['x_IHME_alcohol_liters_pc_25July11'] = {
    'dist': 'TruncatedNormal', 'mu': 0, 'sigma': .01,
    'lower': -1., 'upper': 1.,
}

# create model for global prevalence
root_area = 'all'
t = 'p'
vars = data_model.data_model(
    t, model, t,
    root_area=root_area, root_sex='total', root_year='all',
    mu_age=None, mu_age_parent=None, sigma_age_parent=None)

m = fit_model.fit_data_model(
    vars, iter=4000, burn=2000, thin=20, tune_interval=100)

# generate estimates for all leaves of the hierarchy: the median over
# axis 0 of the male/2005 prediction, bounded to [0, 1]
est = {}
for n in model.hierarchy:
    if model.hierarchy.successors(n):
        # nodes with successors are not leaves
        continue
    est[n] = pl.median(
        covariate_model.predict_for(
            model,
            'all', 'total', 'all',
            n, 'male', 2005,
            0., vars, 0., 1.),
        axis=0)

graphics.plot_one_type(model, vars, {}, 'p')

# overlay one curve per area listed under western Europe, ordered and
# colored by the last value of its estimate
r = 'europe_western'
N = len(model.hierarchy[r])
for i, n in enumerate(sorted(model.hierarchy[r],
                             key=lambda area: est[area][-1])):
    col = pl.cm.Spectral((i + .1) / N)
    pl.semilogy(est[n], label=n, color=col, linewidth=2)
    # label the curve at x=100, at the height of its last value
    pl.text(100, est[n][-1], n, color=col)

pl.axis([-5, 110, 1e-5, .02])
Ejemplo n.º 14
0
def test_predict_for_w_region_as_reference():
    """ Test predict_for when the reference is (NAHI, male, 2005)

    Approach:

    1. Create a model from simulated data on the small test hierarchy
    2. Assign NoStepper to every stochastic of the MCMC
    3. Sample, to generate the traces the predictions are built from
    4. Call predict_for under several random effect priors and confirm
       that the results match mu_age scaled by the expected effects
    """

    # generate simulated data
    n_rows = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, pi_age_true, sigma_true)
    d.hierarchy, d.output_template = data_simulation.small_output()

    # create model and priors
    rate_vars = ism.age_specific_rate(d, 'p', 'NAHI', 'male', 2005,
                                      None, None, None)

    # fit model
    m = mc.MCMC(rate_vars)
    for stoch in m.stochastics:
        m.use_step_method(mc.NoStepper, stoch)
    m.sample(10)

    def predict(area):
        # prediction for (area, male, 1990) relative to the NAHI reference;
        # d.parameters['p'] is looked up at call time so that changes to the
        # random effect priors made between calls are seen
        return covariate_model.predict_for(d, d.parameters['p'],
                                           'NAHI', 'male', 2005,
                                           area, 'male', 1990,
                                           0., rate_vars['p'], 0., pl.inf)

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # zero out the REs with (nearly) degenerate Constant priors
    d.parameters['p']['random_effects'] = dict(
        (node, dict(dist='Constant', mu=0, sigma=1.e-9))
        for node in ['USA', 'NAHI', 'super-region-1', 'all'])

    pred = predict('USA')

    # with all effects zero, the prediction should equal the mu_age trace
    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(0.)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients

    # RE means become .1, .2, .3, .4 in the order listed
    re_nodes = ['USA', 'NAHI', 'super-region-1', 'all']
    for i, node in enumerate(re_nodes):
        d.parameters['p']['random_effects'][node]['mu'] = (i+1.)/10.

    pred = predict('USA')

    # the test expects only the USA effect (.1) to appear in the prediction
    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(.1)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Prediction case 3: random effect not constant, zero fixed effect coefficients

    # CAN has no Constant prior above; seed so that the draw is reproducible
    pl.np.random.seed(12345)
    pred = predict('CAN')

    # reseed and repeat the draw the test expects predict_for to have made
    pl.np.random.seed(12345)
    fe = pl.exp(0.)
    tau = rate_vars['p']['sigma_alpha'][3].trace()**-2
    re = pl.exp(mc.rnormal(0., tau))
    expected = (rate_vars['p']['mu_age'].trace().T * fe * re).T.mean(0)
    assert_almost_equal(pred.mean(0), expected)
Ejemplo n.º 15
0
def save_country_level_posterior(dm, model, vars, region, sex, year,
                                 rate_type_list):
    """ Save country level posteriors in csv files, one file per rate type

    Each file is written to job_working_directory/posterior/ and is named
    dm-<id>-<rate_type>-<region>-<sex>-<year>.csv

    Parameters
    ----------
    dm : disease model; only dm.id is read here
    model : object providing .hierarchy and .parameters
    vars : dict of model variables, keyed by rate type code ('i', 'p', ...)
    region : node of model.hierarchy; every node below it that has no
      successors gets rows in the output
    sex, year : passed on to predict_for and used in the population lookup
    rate_type_list : list of str
      names such as 'incidence', 'prevalence', 'remission',
      'excess-mortality', 'prevalence_x_excess-mortality', 'duration'

    Notes
    -----
    An IOError while handling one rate type is reported on stdout and the
    remaining rate types are still attempted.
    """
    import csv

    # directory to save the files, inside the job working directory
    out_dir = (dismod3.settings.JOB_WORKING_DIR % dm.id) + '/posterior/'

    # rate type name -> short code used as key of vars and model.parameters
    type_codes = {
        'incidence': 'i',
        'prevalence': 'p',
        'remission': 'r',
        'excess-mortality': 'f',
        'prevalence_x_excess-mortality': 'pf',
        'duration': 'X'
    }
    header = (['Iso3', 'Population', 'Rate type', 'Age'] +
              ['Draw%d' % i for i in range(1000)])

    for rate_type in rate_type_list:
        try:
            filename = 'dm-%s-%s-%s-%s-%s.csv' % (str(
                dm.id), rate_type, region, sex, year)

            # the with-block closes (and thereby flushes) the file when the
            # rate type is finished, also when an error interrupts writing
            with open(out_dir + filename, 'w') as f_file:
                csv_f = csv.writer(f_file)
                print('writing csv file %s' % (out_dir + filename))

                csv_f.writerow(header)

                t = type_codes[rate_type]
                if t not in vars:
                    continue  # nothing was fit for this type; header-only file

                # the bounds do not depend on the area, so look them up once
                if t in model.parameters and 'level_bounds' in model.parameters[
                        t]:
                    lower = model.parameters[t]['level_bounds']['lower']
                    upper = model.parameters[t]['level_bounds']['upper']
                else:
                    lower = 0
                    upper = pl.inf

                # loop over the areas below region
                for a in nx.traversal.bfs_tree(model.hierarchy, region):
                    if len(model.hierarchy.successors(a)) != 0:
                        continue  # only store hierarchy leaves

                    posterior = covariate_model.predict_for(
                        model,
                        model.parameters[t],
                        region,
                        sex,
                        year,
                        a,
                        sex,
                        year,
                        True,  # population weighted averages
                        vars[t],
                        lower,
                        upper)

                    # one row per age: identifying columns, then column i of
                    # the prediction
                    pop = dismod3.neg_binom_model.population_by_age[(a,
                                                                     str(year),
                                                                     sex)]
                    ages = model.parameters['ages']
                    for i, age in enumerate(ages):
                        csv_f.writerow([a, pop[age], rate_type,
                                        str(age)] + list(posterior[:, i]))

        except IOError as e:
            print('WARNING: could not save country level output for %s' % rate_type)
            print(e)
Ejemplo n.º 16
0
def save_country_level_posterior(dm, model, vars, region, sex, year, rate_type_list):
    """ Save country level posteriors in csv files, one file per rate type

    Each file is written to job_working_directory/posterior/ and is named
    dm-<id>-<rate_type>-<region>-<sex>-<year>.csv

    Parameters
    ----------
    dm : disease model; only dm.id is read here
    model : object providing .hierarchy and .parameters
    vars : dict of model variables, keyed by rate type code ('i', 'p', ...)
    region : node of model.hierarchy; every node below it that has no
      successors gets rows in the output
    sex, year : passed on to predict_for and used in the population lookup
    rate_type_list : list of str
      names such as 'incidence', 'prevalence', 'remission',
      'excess-mortality', 'prevalence_x_excess-mortality', 'duration'

    Notes
    -----
    An IOError while handling one rate type is reported on stdout and the
    remaining rate types are still attempted.
    """
    import csv

    # directory to save the files, inside the job working directory
    job_wd = dismod3.settings.JOB_WORKING_DIR % dm.id
    out_dir = job_wd + '/posterior/'

    # rate type name -> short code used as key of vars and model.parameters
    type_codes = {'incidence': 'i', 'prevalence': 'p', 'remission': 'r', 'excess-mortality': 'f',
                  'prevalence_x_excess-mortality': 'pf', 'duration': 'X'}

    for rate_type in rate_type_list:
        try:
            filename = 'dm-%s-%s-%s-%s-%s.csv' % (str(dm.id), rate_type, region, sex, year)

            # open the file in a with-block so that it is closed (and its
            # buffered rows flushed) as soon as this rate type is done,
            # including when an exception interrupts the writing
            with open(out_dir + filename, 'w') as f_file:
                csv_f = csv.writer(f_file)
                print('writing csv file %s' % (out_dir + filename))

                # write header
                csv_f.writerow(['Iso3', 'Population', 'Rate type', 'Age'] + ['Draw%d'%i for i in range(1000)])

                t = type_codes[rate_type]

                if t in vars:
                    # the bounds are the same for every area; look them up once
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower = model.parameters[t]['level_bounds']['lower']
                        upper = model.parameters[t]['level_bounds']['upper']
                    else:
                        lower = 0
                        upper = pl.inf

                    # loop over the areas below region
                    for a in nx.traversal.bfs_tree(model.hierarchy, region):
                        if len(model.hierarchy.successors(a)) != 0:
                            continue # only store hierarchy leaves

                        posterior = covariate_model.predict_for(model,
                                                                model.parameters[t],
                                                                region, sex, year,
                                                                a, sex, year,
                                                                True,  # population weighted averages
                                                                vars[t],
                                                                lower, upper)

                        # one row per age: identifying columns, then column i
                        # of the prediction
                        pop = dismod3.neg_binom_model.population_by_age[(a, str(year), sex)]
                        ages = model.parameters['ages']
                        for i, age in enumerate(ages):
                            csv_f.writerow([a, pop[age],
                                            rate_type, str(age)] +
                                           list(posterior[:,i]))

        except IOError as e:
            print('WARNING: could not save country level output for %s' % rate_type)
            print(e)
Ejemplo n.º 17
0
def fit_world(id, fast_fit=False, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit consistent model for all data in world, then store empirical
    priors, tables, and diagnostic plots in the job working directory

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    fast_fit : bool
      if True, call fit_consistent with the short positional settings
      (105, 0, 1, 100) instead of iter=50000, burn=10000, thin=40,
      tune_interval=1000
    zero_re : bool
      passed through to dismod3.ism.consistent
    alt_prior : bool
      passed through to covariate_model.predict_for as its ninth
      positional argument -- TODO confirm what it selects there
    global_heterogeneity : str
      value assigned to every 'heterogeneity' entry already present in
      model.parameters before the fit

    Results
    -------
    Nothing is returned.  The empirical prior means and standard deviations
    are stored on the disease model with dm.set_mcmc, plots and csv tables
    are written below the job working directory, and the disease model is
    saved with dm.save as the last step.

    Example
    -------
    >>> import fit_world
    >>> fit_world.fit_world(1234)
    """

    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
        dm = dismod3.load_disease_model(id)
    except (IOError, AssertionError):
        # fall back to building the model from the disease model's json,
        # and try to cache it in the new format for the next run
        dm = dismod3.load_disease_model(id)
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        try:
            model.save(dir)
            print 'loaded data from json, saved in new format for next time in %s' % dir
        except IOError:
            print 'loaded data from json, failed to save in new format'


    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to global_heterogeneity ('Slightly' by
    # default) for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    ### For testing:
    ## speed up computation by reducing number of knots
    ## for t in 'irf':
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 100]
    model.vars += dismod3.ism.consistent(model,
                                         reference_area='all',
                                         reference_sex='total',
                                         reference_year='all',
                                         priors={},
                                         zero_re=zero_re)

    ## fit model to data
    if fast_fit:
        # positional settings; presumably iter, burn, thin, tune_interval
        # as in the keyword call below -- TODO confirm
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True)

    dm.model = model

    # borrow strength to inform sigma_alpha between rate types post-hoc
    types_with_re = ['rr', 'f', 'i', 'm', 'smr', 'p', 'r', 'pf', 'm_with', 'X']
    ## first calculate sigma_alpha_bar from posterior draws from each alpha
    alpha_vals = []
    for type in types_with_re:
        if 'alpha' in model.vars[type]:
            for alpha_i in model.vars[type]['alpha']:
                alpha_vals += [a for a in alpha_i.trace() if a != 0]  # remove zeros because areas with no siblings are included for convenience but are pinned to zero
    ## then blend sigma_alpha_i and sigma_alpha_bar for each sigma_alpha_i
    if len(alpha_vals) > 0:
        sigma_alpha_bar = pl.std(alpha_vals)
        for type in types_with_re:
            if 'sigma_alpha' in model.vars[type]:
                for sigma_alpha_i in model.vars[type]['sigma_alpha']:
                    cur_val = sigma_alpha_i.trace()
                    # overwrite the stored trace in place
                    # NOTE(review): the new value is cur_val + sigma_alpha_bar
                    # (a sum, not an average) -- confirm this is the intended blend
                    sigma_alpha_i.trace._trace[0] = (cur_val + sigma_alpha_bar) * pl.ones_like(sigma_alpha_i.trace._trace[0])


    # generate and store empirical priors for each type, for every
    # combination of gbd region, sex, and year
    for t in 'p i r f rr pf m_with'.split():
        param_type = dict(i='incidence', r='remission', f='excess-mortality', p='prevalence', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')[t]
        #graphics.plot_one_type(model, model.vars[t], {}, t)
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    # use the level bounds of this type if they are set,
                    # and [0, inf) otherwise
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower=model.parameters[t]['level_bounds']['lower']
                        upper=model.parameters[t]['level_bounds']['upper']
                    else:
                        lower=0
                        upper=pl.inf
                        
                    # predict for (a, s, y) relative to the (all, total, all) reference
                    emp_priors = covariate_model.predict_for(model,
                                                             model.parameters.get(t, {}),
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior,
                                                             model.vars[t], lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))
                    if 'eta' in model.vars[t]:
                        # when eta is available, add the mean of
                        # emp_priors**2 / exp(eta) to the variance of the draws
                        N,A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(model.vars[t]['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)


        # imported here, inside the loop over types, rather than at module level
        from fit_emp_prior import store_effect_coefficients
        store_effect_coefficients(dm, model.vars[t], param_type)

    
        if 'p_pred' in model.vars[t]:
            graphics.plot_one_ppc(model, t)
            pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

        if 'p_pred' in model.vars[t] or 'lb' in model.vars[t]:
            graphics.plot_one_effects(model, t)
            pl.savefig(dir + '/prior-%s-effects.png'%param_type)


    # save a table of the input data with predicted values and residuals
    # for every type that has both data and a posterior predictive node
    for t in 'i r f p rr pf X m_with smr'.split():
        fname = dir + '/empirical_priors/data-%s.csv'%t
        print 'saving tables for', t, 'to', fname
        if 'data' in model.vars[t] and 'p_pred' in model.vars[t]:
            stats = model.vars[t]['p_pred'].stats(batches=5)
            model.vars[t]['data']['mu_pred'] = stats['mean']
            model.vars[t]['data']['sigma_pred'] = stats['standard deviation']

            stats = model.vars[t]['pi'].stats(batches=5)
            model.vars[t]['data']['mc_error'] = stats['mc error']

            model.vars[t]['data']['residual'] = model.vars[t]['data']['value'] - model.vars[t]['data']['mu_pred']
            model.vars[t]['data']['abs_residual'] = pl.absolute(model.vars[t]['data']['residual'])
            #if 'delta' in model.vars[t]:
            #    model.vars[t]['data']['logp'] = [mc.negative_binomial_like(n*p_obs, n*p_pred, n*p_pred*d) for n, p_obs, p_pred, d \
            #                                  in zip(model.vars[t]['data']['effective_sample_size'],
            #                                         model.vars[t]['data']['value'],
            #                                         model.vars[t]['data']['mu_pred'],
            #                                         pl.atleast_1d(model.vars[t]['delta'].stats()['mean']))]
            model.vars[t]['data'].to_csv(fname)


    # each plot call is followed directly by the savefig that stores it
    graphics.plot_fit(model)
    pl.savefig(dir + '/prior.png')

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-convergence.png')

    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-trace.png')
    
    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (dm.id, 'all'))
    except IOError, e:
        print e
Ejemplo n.º 18
0

# create model for (latin_america_central, male, 2005)
root_area = "latin_america_central"
subtree = nx.traversal.bfs_tree(model.hierarchy, root_area)

# keep only the input rows that are for an area in the subtree, have
# year_end of 1997 or later, and are for males or for both sexes combined
relevant_rows = []
for i, r in model.input_data.T.iteritems():
    if r["area"] not in subtree:
        continue
    if not r["year_end"] >= 1997:
        continue
    if r["sex"] not in ["male", "total"]:
        continue
    relevant_rows.append(i)
model.input_data = model.input_data.ix[relevant_rows]


## create and fit consistent model at gbd region level
vars = consistent_model.consistent_model(
    model, root_area=root_area, root_sex="male", root_year=2005, priors={}
)
posterior_model = fit_model.fit_consistent_model(
    vars, iter=3003, burn=1500, thin=10, tune_interval=100
)


## generate estimates for latin_america_central, male, 2005
predict_area = root_area
posteriors = {}
for t in ["i", "r", "f", "p", "rr", "pf"]:
    draws = covariate_model.predict_for(
        model.output_template,
        model.hierarchy,
        root_area,
        "male",
        2005,
        predict_area,
        "male",
        2005,
        vars[t],
    )
    # summarize the prediction by its median along axis 0
    posteriors[t] = pl.median(draws, axis=0)

graphics.all_plots(model, vars, {}, posteriors)
Ejemplo n.º 19
0
def test_predict_for():
    """ Test predict_for against hand-computed expected values

    Approach:

    1. Create a model from simulated data on the small test hierarchy
    2. Assign NoStepper to every stochastic of the MCMC
    3. Sample, to generate the traces the predictions are built from
    4. Call predict_for under several random effect priors and confirm
       that the results match mu_age scaled by the expected effects

    Returns the ModelData object, with the most recently created variables
    attached as its .vars attribute.
    """

    # generate simulated data
    n_rows = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, pi_age_true, sigma_true)
    d.hierarchy, d.output_template = data_simulation.small_output()

    def fit(ref_area, ref_sex, ref_year):
        # build the rate model for the given reference and sample it with
        # NoStepper assigned to every stochastic
        new_vars = ism.age_specific_rate(d, 'p', ref_area, ref_sex, ref_year,
                                         None, None, None)
        m = mc.MCMC(new_vars)
        for stoch in m.stochastics:
            m.use_step_method(mc.NoStepper, stoch)
        m.sample(3)
        return new_vars

    def predict(reference, area, rate_vars):
        # prediction for (area, male, 1990) relative to reference;
        # d.parameters['p'] is looked up at call time so that changes to the
        # random effect priors made between calls are seen
        ref_area, ref_sex, ref_year = reference
        return covariate_model.predict_for(d, d.parameters['p'],
                                           ref_area, ref_sex, ref_year,
                                           area, 'male', 1990,
                                           0., rate_vars['p'], 0., pl.inf)

    world = ('all', 'total', 'all')

    # create model and priors, and fit model
    world_vars = fit(*world)

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # zero out the REs with (nearly) degenerate Constant priors
    d.parameters['p']['random_effects'] = dict(
        (node, dict(dist='Constant', mu=0, sigma=1.e-9))
        for node in ['USA', 'CAN', 'NAHI', 'super-region-1', 'all'])

    pred = predict(world, 'USA', world_vars)

    # with all effects zero, the prediction should equal the mu_age trace
    fe_usa_1990 = 1.
    re_usa_1990 = 1.
    expected = world_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients

    # RE means become .1 (USA), .2 (NAHI), .3 (super-region-1)
    for i, node in enumerate(['USA', 'NAHI', 'super-region-1']):
        d.parameters['p']['random_effects'][node]['mu'] = (i+1.)/10.

    pred = predict(world, 'USA', world_vars)

    # the three effects are expected to add up on the log scale
    fe_usa_1990 = 1.
    re_usa_1990 = pl.exp(.1+.2+.3)
    expected = world_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Prediction case 3: confirm that changing RE for reference area does not change results

    d.parameters['p']['random_effects']['all']['mu'] = 1.

    pred = predict(world, 'USA', world_vars)

    # same expected value as in case 2; only the RE of 'all' was changed
    fe_usa_1990 = 1.
    re_usa_1990 = pl.exp(.1+.2+.3)
    expected = world_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Prediction case 4: see that prediction of CAN includes region and super-region effect, but not USA effect
    pred = predict(world, 'CAN', world_vars)

    # CAN's own RE mean is still 0; NAHI (.2) and super-region-1 (.3) apply
    fe = 1.
    re = pl.exp(0.+.2+.3)
    expected = world_vars['p']['mu_age'].trace() * fe * re
    assert_almost_equal(pred, expected)

    # create and fit a second model, now with (USA, male, 1990) as reference
    vars = fit('USA', 'male', 1990)

    # predicting for the reference itself should reproduce the mu_age trace
    pi_usa = predict(('USA', 'male', 1990), 'USA', vars)
    assert_almost_equal(pi_usa, vars['p']['mu_age'].trace())

    ### Prediction case 5: confirm that const RE prior with sigma = 0 does not crash

    d.parameters['p']['random_effects']['USA']['sigma'] = 0.
    d.parameters['p']['random_effects']['CAN']['sigma'] = 0.

    # only checks that the call completes; the result is not compared
    pred = predict(world, 'NAHI', vars)

    d.vars = vars
    return d
Ejemplo n.º 20
0
def save_posterior(disease,
                   model,
                   country,
                   sex,
                   year,
                   param_type_list,
                   folder=''):
    ''' Save the posterior for one country in csv files, one per data type

    Each file is written to
    /home/j/Project/dismod/dismod_status/prod/dm-<disease>/posterior/<folder>
    and is named dm-<disease>-<full name of type>-<country>-<sex>-<year>.csv

    disease : int, model number
    model : dataModel
    country : str
      iso country code
    sex : str
      one of 'male', 'female', or 'total'
    year : int
      one of 1990, 2005, 2010
    param_type_list : list
      i.e. ['i', 'p', 'r', 'f', 'X', 'pf']
    folder : str
      appended verbatim between 'posterior/' and the file name, so a
      non-empty value needs its own trailing slash to act as a directory

    An IOError for one data type is reported on stdout and the remaining
    data types are still attempted.
    '''
    max_draws = 1000

    # directory to save the files, inside the job working directory
    job_wd = '/home/j/Project/dismod/dismod_status/prod/dm-%s/' % (disease)
    out_dir = job_wd + 'posterior/' + folder

    # create posteriors for rate types
    for data_type in param_type_list:
        try:
            filename = 'dm-%s-%s-%s-%s-%s.csv' % (
                str(disease), full_name[data_type], country, sex, year)
            out_path = out_dir + filename
            print('writing csv file %s' % (out_path))

            if data_type not in model.vars:
                continue  # nothing fit for this type, so no file is written

            # set prior bounds
            has_bounds = (data_type in model.parameters
                          and 'level_bounds' in model.parameters[data_type])
            if has_bounds:
                bounds = model.parameters[data_type]['level_bounds']
                lower = bounds['lower']
                upper = bounds['upper']
            else:
                lower = 0
                upper = pl.inf

            # the country is both the reference and the predicted area;
            # the result is transposed before use
            prediction = covariate_model.predict_for(
                model,
                model.parameters[data_type],
                country,
                sex,
                year,
                country,
                sex,
                year,
                True,  # population weighted averages
                model.vars[data_type],
                lower,
                upper)
            posterior = prediction.T

            # keep at most the first 1000 columns as draws
            if posterior.shape[1] >= max_draws:
                draws = max_draws
                posterior = posterior[:, 0:draws]
            else:
                draws = posterior.shape[1]
                print('Output file will have fewer than 1000 draws')

            # assemble the table: draw columns first, then the identifying columns
            table = pandas.DataFrame(
                posterior, columns=['Draw%d' % i for i in range(draws)])
            table['Iso3'] = country
            table['Population'] = dismod3.neg_binom_model.population_by_age[
                (country, str(year), sex)]
            table['Rate type'] = data_type
            table['Age'] = model.parameters['ages']

            # save file
            table.to_csv(out_path, index=False)

        except IOError as e:
            print('WARNING: could not save country level output for %s' % data_type)
            print(e)
Ejemplo n.º 21
0
def fit_emp_prior(
    id,
    param_type,
    fast_fit=False,
    generate_emp_priors=True,
    zero_re=True,
    alt_prior=False,
    global_heterogeneity="Slightly",
):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for
    fast_fit : bool
      if True, fit with iter=101, burn=0, thin=1, tune_interval=100 instead
      of iter=50000, burn=10000, thin=40, tune_interval=1000
    generate_emp_priors : bool
      if False, skip the predictions for every gbd region, sex, and year
    zero_re : bool
      passed through to ism.age_specific_rate
    alt_prior : bool
      passed through to covariate_model.predict_for as its ninth
      positional argument -- TODO confirm what it selects there
    global_heterogeneity : str
      value assigned to every 'heterogeneity' entry already present in
      model.parameters before the fit

    Results
    -------
    Returns the disease model, both when there is no data of the requested
    type (in which case nothing is fit) and after a completed fit.  Plots
    are written to the job working directory and the disease model is saved
    with dm.save as the last step.

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    job_dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data

    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(job_dir)
        print("loaded data from new format from %s" % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # model.save(job_dir)
        # saving is disabled above, so the message must not claim that the
        # new format was written
        print("loaded data from json (not saved in new format)")

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith("x_"):
            model.input_data[col] = model.input_data[col].fillna(0.0)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to global_heterogeneity ("Slightly" by
    # default) for the global fit
    for t in model.parameters:
        if "heterogeneity" in model.parameters[t]:
            model.parameters[t]["heterogeneity"] = global_heterogeneity

    # short code of the requested type; from here on only data of this
    # type is kept as input
    t = {
        "incidence": "i",
        "prevalence": "p",
        "remission": "r",
        "excess-mortality": "f",
        "prevalence_x_excess-mortality": "pf",
    }[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print("No data for type %s, exiting" % param_type)
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    # model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    # missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    # model.input_data['effective_sample_size'][missing_ess] = 1.
    # model.input_data['z_overdisperse'] = 1.
    # print model.describe(t)
    # model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    # model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    # model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    # model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    # generate_emp_priors = False

    print("fitting %s" % t)
    model.vars += ism.age_specific_rate(
        model,
        t,
        reference_area="all",
        reference_sex="total",
        reference_year="all",
        mu_age=None,
        mu_age_parent=None,
        sigma_age_parent=None,
        rate_type="log_normal" if t == "rr" else "neg_binom",
        zero_re=zero_re,
    )
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(
            model, t, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True
        )

    # attach predicted values and residuals to the data table
    stats = dm.vars["p_pred"].stats(batches=5)
    dm.vars["data"]["mu_pred"] = stats["mean"]
    dm.vars["data"]["sigma_pred"] = stats["standard deviation"]

    stats = dm.vars["pi"].stats(batches=5)
    dm.vars["data"]["mc_error"] = stats["mc error"]

    dm.vars["data"]["residual"] = dm.vars["data"]["value"] - dm.vars["data"]["mu_pred"]
    dm.vars["data"]["abs_residual"] = pl.absolute(dm.vars["data"]["residual"])

    graphics.plot_fit(model, data_types=[t], ylab=["PY"], plot_config=(1, 1), fig_size=(8, 8))
    if generate_emp_priors:
        # the level bounds do not depend on region, sex, or year, so they
        # are looked up once: the bounds of this type if set, else [0, inf)
        if t in model.parameters and "level_bounds" in model.parameters[t]:
            lower = model.parameters[t]["level_bounds"]["lower"]
            upper = model.parameters[t]["level_bounds"]["upper"]
        else:
            lower = 0
            upper = pl.inf

        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print("generating empirical prior for %s" % a)
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)

                    # predict for (a, s, y) relative to the (all, total, all) reference
                    emp_priors = covariate_model.predict_for(
                        model,
                        model.parameters[t],
                        "all",
                        "total",
                        "all",
                        a,
                        dismod3.utils.clean(s),
                        int(y),
                        alt_prior,
                        vars,
                        lower,
                        upper,
                    )
                    dm.set_mcmc("emp_prior_mean", key, emp_priors.mean(0))

                    if "eta" in vars:
                        # when eta is available, add the mean of
                        # emp_priors**2 / exp(eta) to the variance of the draws
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose(
                            [pl.exp(vars["eta"].trace()) for _ in range(A)]
                        )  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors ** 2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc("emp_prior_std", key, emp_prior_std)

                    # add the empirical prior mean in grey, behind the
                    # plot of the fit drawn above
                    pl.plot(
                        model.parameters["ages"],
                        dm.get_mcmc("emp_prior_mean", key),
                        color="grey",
                        label=a,
                        zorder=-10,
                        alpha=0.5,
                    )
    pl.savefig(job_dir + "/prior-%s.png" % param_type)

    store_effect_coefficients(dm, vars, param_type)

    # graphics.plot_one_ppc(vars, t)
    # pl.savefig(job_dir + '/prior-%s-ppc.png'%param_type)

    # each plot call is followed directly by the savefig that stores it
    graphics.plot_acorr(model)
    pl.savefig(job_dir + "/prior-%s-convergence.png" % param_type)
    graphics.plot_trace(model)
    pl.savefig(job_dir + "/prior-%s-trace.png" % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(job_dir + "/prior-%s-effects.png" % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save("dm-%d-prior-%s.json" % (id, param_type))
    except IOError as e:
        print(e)

    # return the disease model here as well, matching the no-data exit above
    return dm
                             root_year='all',
                             mu_age=None,
                             mu_age_parent=None,
                             sigma_age_parent=None)

# Script fragment: fit the data model held in `vars`, predict for every leaf of
# the area hierarchy, and draw the leaf estimates for one region on a log axis.
# `model` and `vars` are created by code that is not visible in this fragment.
m = fit_model.fit_data_model(vars,
                             iter=4000,
                             burn=2000,
                             thin=20,
                             tune_interval=100)

# generate estimates for all leaves of the hierarchy
# (a node with no successors is treated as a leaf)
est = {}
for n in model.hierarchy:
    if len(model.hierarchy.successors(n)) == 0:
        # median along axis 0 of the prediction -- presumably across posterior
        # draws, leaving one value per age; TODO confirm.
        # NOTE(review): this call passes 11 positional arguments, while other
        # predict_for calls in this file pass 12 (they also pass a parameters
        # dict as the second argument) -- confirm which signature is current.
        est[n] = pl.median(covariate_model.predict_for(model, 'all', 'total',
                                                       'all', n, 'male', 2005,
                                                       0., vars, 0., 1.),
                           axis=0)

graphics.plot_one_type(model, vars, {}, 'p')

# overlay one curve per successor of the region, ordered by the last value of
# its estimate so that the colors follow that ordering
r = 'europe_western'
N = len(model.hierarchy[r])
for i, n in enumerate(sorted(model.hierarchy[r], key=lambda n: est[n][-1])):
    col = pl.cm.Spectral((.1 + i) / N)
    pl.semilogy(est[n], label=n, color=col, linewidth=2)
    # label each curve at x=100, at the height of its last value
    pl.text(100, est[n][-1], n, color=col)

pl.axis([-5, 110, 1e-5, .02])
Ejemplo n.º 23
0
# Script fragment: restrict the input data to one region, fit a consistent
# model for (europe_western, male, 2005) and plot the median predictions.
# `model` is created by code that is not visible in this fragment.

# set the level_value prior for rate type 'r' to value 0. with both age limits at 100
model.parameters['r']['level_value'] = dict(age_before=100, age_after=100, value=0.)

# no covariates
# (drops every input column whose name starts with 'x_')
model.input_data = model.input_data.drop([col for col in model.input_data.columns if col.startswith('x_')], axis=1)

# create model for (europe_western, male, 2005)
root_area = 'europe_western'
subtree = nx.traversal.bfs_tree(model.hierarchy, root_area)
# keep only rows for areas in the subtree, ending in 1997 or later, for sex
# male or total, and of data type 'pf' or 'm'
relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                     if r['area'] in subtree \
                     and r['year_end'] >= 1997 \
                     and r['sex'] in ['male', 'total'] \
                     and r['data_type'] in ['pf', 'm']]
model.input_data = model.input_data.ix[relevant_rows]


## create and fit consistent model at gbd region level
vars = consistent_model.consistent_model(model, root_area=root_area, root_sex='male', root_year=2005, priors={})
posterior_model = fit_model.fit_consistent_model(vars, iter=1030, burn=500, thin=5, tune_interval=100)


## generate estimates
# one median (along axis 0) prediction per rate type, for the root area itself
predict_area = root_area
posteriors = {}
for t in 'i r f p rr pf'.split():
    # NOTE(review): this call passes 9 positional arguments starting with the
    # output template and hierarchy, while other predict_for calls in this file
    # pass 12 starting with the model and a parameters dict -- confirm which
    # signature is current.
    posteriors[t] = pl.median(covariate_model.predict_for(model.output_template, model.hierarchy,
                                                root_area, 'male', 2005,
                                                predict_area, 'male', 2005, vars[t]), axis=0)

graphics.all_plots(model, vars, {}, posteriors)
Ejemplo n.º 24
0
def fit_posterior(dm, region, sex, year, fast_fit=False,
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Loads the model data, installs the empirical priors stored on dm,
    restricts the input data to the rows relevant for the prediction
    target, fits the model (consistently, or one rate at a time),
    generates posterior predictions and saves a plot of the fit.

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
      only 1990, 2005 and 2010 are implemented; any other year fails an assertion

    fast_fit : use a short chain (101 iterations per rate, 105 for the
      consistent fit), don't try for convergence (fast for testing)
    inconsistent_fit : fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently
      (the default list is only iterated, never modified)

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')
    """
    job_dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(job_dir)
        print('loaded data from new format from %s' % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE(review): the message below says the data was saved, but the
        # save call is disabled -- confirm which one is intended
        #model.save(job_dir)
        print('loaded data from json, saved in new format for next time in %s' % job_dir)

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():

        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the stored prior when both curves have the expected 101 entries
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors are applied last, so they override the empirical ones
        expert_priors = model.parameters[t].get('random_effects', {})
        random_effects = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        random_effects.update(expert_priors)
        model.parameters[t]['random_effects'] = random_effects

        # shift random effects to make REs for observed children of predict area have mean zero
        # (skipped when no child has a prior, which avoids taking the mean of an empty list)
        children_with_re = [area for area in model.hierarchy.neighbors(predict_area)
                            if area in random_effects]
        if children_with_re:
            re_mean = pl.mean([random_effects[area]['mu'] for area in children_with_re])
            for area in children_with_re:
                random_effects[area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # setdefault creates the dict when no fixed effects were set, where the
        # previous direct indexing raised a KeyError
        # NOTE(review): unlike the random effects, the empirical values here
        # override existing (expert) entries -- confirm this is intended
        fixed_effects = model.parameters[t].setdefault('fixed_effects', {})
        fixed_effects.update(dm.get_empirical_prior(param_type[t]).get('new_beta', {}))


    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        """ Return True if input data row r should be used for this prediction"""
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # previous implementation of the row filter, kept to cross-check the new one below
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    # NOTE(review): iter, burn and thin are not defined in this function;
    # presumably they are module-level settings -- confirm, since otherwise
    # `iter` resolves to the builtin and burn/thin raise a NameError
    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(model, t,
                                            reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                            mu_age=None,
                                            mu_age_parent=emp_priors.get((t, 'mu')),
                                            sigma_age_parent=emp_priors.get((t, 'sigma')),
                                            rate_type='log_normal' if t == 'rr' else 'neg_binom',
                                            zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                     priors=emp_priors, zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)


    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            # use the level bounds of this rate type when set, otherwise [0, inf)
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf
            posteriors[t] = covariate_model.predict_for(model,
                                                        model.parameters.get(t, {}),
                                                        predict_area, predict_sex, predict_year,
                                                        predict_area, predict_sex, predict_year,
                                                        True,  # population weighted averages
                                                        model.vars[t], lower, upper)
    try:
        # NOTE(review): `vars` is not a local of this function, so it resolves
        # to a module-level name or the builtin -- confirm whether model.vars
        # was intended.  Any failure here is only reported, not raised.
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(job_dir + '/image/posterior-%s+%s+%s.png'%(predict_area, predict_sex, predict_year))
    except Exception as e:
        print('Error generating output graphics')
        print(e)
Ejemplo n.º 25
0
# Script fragment: restrict the input data to the subtree below root_area, fit
# a consistent model for (root_area, male, 2005) and plot the median
# predictions.  `model` and `root_area` are set by code that is not visible in
# this fragment.
subtree = nx.traversal.bfs_tree(model.hierarchy, root_area)
# keep only rows for areas in the subtree, ending in 1997 or later, for sex
# male or total, and of data type 'pf' or 'm'
relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                     if r['area'] in subtree \
                     and r['year_end'] >= 1997 \
                     and r['sex'] in ['male', 'total'] \
                     and r['data_type'] in ['pf', 'm']]
model.input_data = model.input_data.ix[relevant_rows]

## create and fit consistent model at gbd region level
vars = consistent_model.consistent_model(model,
                                         root_area=root_area,
                                         root_sex='male',
                                         root_year=2005,
                                         priors={})
posterior_model = fit_model.fit_consistent_model(vars,
                                                 iter=1030,
                                                 burn=500,
                                                 thin=5,
                                                 tune_interval=100)

## generate estimates
# one median (along axis 0) prediction per rate type, for the root area itself
predict_area = root_area
posteriors = {}
for t in 'i r f p rr pf'.split():
    # NOTE(review): this call passes 9 positional arguments starting with the
    # output template and hierarchy, while other predict_for calls in this file
    # pass 12 starting with the model and a parameters dict -- confirm which
    # signature is current.
    posteriors[t] = pl.median(covariate_model.predict_for(
        model.output_template, model.hierarchy, root_area, 'male', 2005,
        predict_area, 'male', 2005, vars[t]),
                              axis=0)

graphics.all_plots(model, vars, {}, posteriors)
Ejemplo n.º 26
0
def test_predict_for():
    """ Exercise predict_for on the small simulated hierarchy.

    Plan:

    1. Build a model from simulated prevalence data
    2. Assign NoStepper to every stochastic of the MCMC
    3. Sample a few iterations to fill the traces
    4. Call predict_for under several random-effect priors and compare each
       result with the mu_age trace scaled by the expected effects

    Returns the ModelData, with the vars of the last model built attached.
    """

    def sample_without_stepping(model_vars, draws):
        # NoStepper on every stochastic, then sample to fill the traces
        sampler = mc.MCMC(model_vars)
        for stoch in sampler.stochastics:
            sampler.use_step_method(mc.NoStepper, stoch)
        sampler.sample(draws)

    def predict(model_vars, reference, area):
        # prediction for (area, male, 1990) relative to the reference
        # (area, sex, year) triple, with bounds [0, inf)
        ref_area, ref_sex, ref_year = reference
        return covariate_model.predict_for(
            d, d.parameters['p'],
            ref_area, ref_sex, ref_year,
            area, 'male', 1990,
            0., model_vars['p'], 0., pl.inf)

    # generate simulated data
    n_rows = 5
    noise_sd = .025
    ages = pl.arange(0, 100, 1)
    true_pi = .0001 * (ages * (100. - ages) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, true_pi, noise_sd)
    d.hierarchy, d.output_template = data_simulation.small_output()

    # create model and priors, then fill the traces
    world = ('all', 'total', 'all')
    model_vars = ism.age_specific_rate(d, 'p', 'all', 'total', 'all',
                                       None, None, None)
    sample_without_stepping(model_vars, 3)
    mu_age = model_vars['p']['mu_age']

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # zero out REs to see if test passes
    d.parameters['p']['random_effects'] = {}
    for node in ['USA', 'CAN', 'NAHI', 'super-region-1', 'all']:
        zero_prior = dict(dist='Constant', mu=0, sigma=1.e-9)
        d.parameters['p']['random_effects'][node] = zero_prior

    pred = predict(model_vars, world, 'USA')

    fe_expected = 1.
    re_expected = 1.
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients

    # RE means of .1, .2, .3 going up the hierarchy from USA
    for depth, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        d.parameters['p']['random_effects'][node]['mu'] = depth / 10.

    pred = predict(model_vars, world, 'USA')

    fe_expected = 1.
    re_expected = pl.exp(.1 + .2 + .3)
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    ### Prediction case 3: confirm that changing RE for reference area does not change results

    d.parameters['p']['random_effects']['all']['mu'] = 1.

    pred = predict(model_vars, world, 'USA')

    fe_expected = 1.
    # unchanged, since it is alpha_all that is now 1.
    re_expected = pl.exp(.1 + .2 + .3)
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    ### Prediction case 4: see that prediction of CAN includes region and super-region effect, but not USA effect
    pred = predict(model_vars, world, 'CAN')

    fe_expected = 1.
    re_expected = pl.exp(0. + .2 + .3)
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    # rebuild the model with USA/male/1990 as the reference and refill the traces
    model_vars = ism.age_specific_rate(d, 'p', 'USA', 'male', 1990,
                                       None, None, None)
    sample_without_stepping(model_vars, 3)

    # predicting for the reference itself should reproduce the mu_age trace
    pi_usa = predict(model_vars, ('USA', 'male', 1990), 'USA')
    assert_almost_equal(pi_usa, model_vars['p']['mu_age'].trace())

    ### Prediction case 5: confirm that const RE prior with sigma = 0 does not crash

    for node in ['USA', 'CAN']:
        d.parameters['p']['random_effects'][node]['sigma'] = 0.

    pred = predict(model_vars, world, 'NAHI')

    d.vars = model_vars
    return d
Ejemplo n.º 27
0
def test_predict_for_w_region_as_reference():
    """ Exercise predict_for when the reference is a region (NAHI, male, 2005).

    Plan:

    1. Build a model from simulated prevalence data
    2. Assign NoStepper to every stochastic of the MCMC
    3. Sample a few iterations to fill the traces
    4. Call predict_for under several random-effect priors and compare each
       result with the mu_age trace scaled by the expected effects
    """

    def predict(area):
        # prediction for (area, male, 1990) relative to (NAHI, male, 2005),
        # with bounds [0, inf)
        return covariate_model.predict_for(
            d, d.parameters['p'],
            'NAHI', 'male', 2005,
            area, 'male', 1990,
            0., model_vars['p'], 0., pl.inf)

    # generate simulated data
    n_rows = 5
    noise_sd = .025
    ages = pl.arange(0, 100, 1)
    true_pi = .0001 * (ages * (100. - ages) + 100.)

    d = data.ModelData()
    d.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, true_pi, noise_sd)
    d.hierarchy, d.output_template = data_simulation.small_output()

    # create model and priors
    model_vars = ism.age_specific_rate(d, 'p', 'NAHI', 'male', 2005,
                                       None, None, None)

    # NoStepper on every stochastic, then sample to fill the traces
    sampler = mc.MCMC(model_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(10)

    mu_age = model_vars['p']['mu_age']

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # zero out REs to see if test passes
    d.parameters['p']['random_effects'] = {}
    for node in ['USA', 'NAHI', 'super-region-1', 'all']:
        zero_prior = dict(dist='Constant', mu=0, sigma=1.e-9)
        d.parameters['p']['random_effects'][node] = zero_prior

    pred = predict('USA')

    fe_expected = pl.exp(0.)
    re_expected = pl.exp(0.)
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients

    # RE means of .1, .2, .3, .4 going up the hierarchy from USA
    for depth, node in enumerate(['USA', 'NAHI', 'super-region-1', 'all'], 1):
        d.parameters['p']['random_effects'][node]['mu'] = depth / 10.

    pred = predict('USA')

    # only the USA effect is expected to show up in the prediction
    fe_expected = pl.exp(0.)
    re_expected = pl.exp(.1)
    assert_almost_equal(pred, mu_age.trace() * fe_expected * re_expected)

    ### Prediction case 3: random effect not constant, zero fixed effect coefficients

    # set random seed to make randomness reproducible
    pl.np.random.seed(12345)
    pred = predict('CAN')

    # reseed so the draw below repeats the one made inside predict_for
    pl.np.random.seed(12345)
    fe_expected = pl.exp(0.)
    tau = model_vars['p']['sigma_alpha'][3].trace()**-2
    re_expected = pl.exp(mc.rnormal(0., tau))
    expected = (mu_age.trace().T * fe_expected * re_expected).T
    assert_almost_equal(pred.mean(0), expected.mean(0))
Ejemplo n.º 28
0
def save_posterior(disease, model, country, sex, year, param_type_list, folder=''):
    ''' Save country level posterior draws as csv files, one per rate type,
    in the directory job_working_directory/posterior/<folder>

    A file is only written for rate types present in model.vars.  Each file
    holds at most 1000 draws (columns Draw0, Draw1, ...), plus the columns
    Iso3, Population, Rate type and Age.  An IOError while handling one
    rate type is reported and the remaining rate types are still processed.

    disease : int, model number
    model : dataModel
    country : str
      iso country code
    sex : str
      one of 'male', 'female', or 'total'
    year : int
      one of 1990, 2005, 2010
    param_type_list : list
      i.e. ['i', 'p', 'r', 'f', 'X', 'pf']
    folder : str
      optional sub-directory of posterior/, with or without a trailing slash
    '''
    import os  # local import: the module's import block is not visible from this block

    # job working directory
    job_wd = '/home/j/Project/dismod/dismod_status/prod/dm-%s/'%(disease)

    # directory to save the files
    out_dir = job_wd + 'posterior/' + folder

    # create posteriors for rate types
    for data_type in param_type_list:
        try:
            # make an output file name; os.path.join supplies the separator
            # when folder has no trailing slash (plain concatenation used to
            # glue the folder name onto the file name)
            filename = 'dm-%s-%s-%s-%s-%s.csv' % (str(disease), full_name[data_type], country, sex, year)
            path = os.path.join(out_dir, filename)
            print('writing csv file %s' % path)

            if data_type in model.vars:
                # set prior bounds: the level bounds of this rate type when
                # set, otherwise [0, inf)
                if data_type in model.parameters and 'level_bounds' in model.parameters[data_type]:
                    lower = model.parameters[data_type]['level_bounds']['lower']
                    upper = model.parameters[data_type]['level_bounds']['upper']
                else:
                    lower = 0
                    upper = pl.inf

                # transposed so that each draw becomes a column
                posterior = covariate_model.predict_for(model,
                                                        model.parameters[data_type],
                                                        country, sex, year,
                                                        country, sex, year,
                                                        True,  # population weighted averages
                                                        model.vars[data_type],
                                                        lower, upper).T

                # create correct number of draws
                if posterior.shape[1] < 1000:
                    draws = posterior.shape[1]
                    print('Output file will have fewer than 1000 draws')
                else:
                    draws = 1000
                    posterior = posterior[:, 0:draws]

                # create table
                table = pandas.DataFrame(posterior, columns=['Draw%d' % i for i in range(draws)])
                table['Iso3'] = country
                table['Population'] = dismod3.neg_binom_model.population_by_age[(country, str(year), sex)]
                table['Rate type'] = data_type
                table['Age'] = model.parameters['ages']

                # save file
                table.to_csv(path, index=False)

        except IOError as e:
            print('WARNING: could not save country level output for %s' % data_type)
            print(e)