コード例 #1
0
ファイル: test_data_model.py プロジェクト: aflaxman/gbd
def test_data_model_sim():
    """Smoke test: fit a 'p' rate model to simulated data, then rebuild the
    model with the first fit's prediction as the parent prior.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    data_type = 'p'

    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        data_type, n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # both models below share the same reference area/sex/year
    reference = dict(reference_area='all',
                     reference_sex='total',
                     reference_year='all')

    # first model: no prior information on the age pattern
    rate_vars = ism.age_specific_rate(model, data_type,
                                      mu_age=None,
                                      mu_age_parent=None,
                                      sigma_age_parent=None,
                                      **reference)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)

    # predict USA / male / 1990 from the recorded draws
    # NOTE(review): the whole parameters dict is passed here, not
    # parameters[data_type] -- confirm this is intended
    pi_usa = covariate_model.predict_for(
        model, model.parameters,
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars[data_type], -pl.inf, pl.inf)

    # second model: mean and std of the prediction become the parent prior
    parent_mean = pi_usa.mean(0)
    parent_std = pi_usa.std(0)
    rate_vars = ism.age_specific_rate(model, data_type,
                                      mu_age=None,
                                      mu_age_parent=parent_mean,
                                      sigma_age_parent=parent_std,
                                      **reference)
コード例 #2
0
def test_covariate_model_shift_for_root_consistency():
    """Smoke test: predict USA/male/1990 from a 'p' model whose reference is
    'all'/'male'/1990.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # the model is built twice: first with the 'all'/'total'/'all' reference ...
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'total', 'all', None, None, None)

    # ... then with the 'all'/'male'/1990 reference; only this one is sampled
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'male', 1990, None, None, None)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)

    # predict for USA / male / 1990 relative to the second reference
    pi_usa = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'male', 1990,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)
コード例 #3
0
ファイル: test_data_model.py プロジェクト: studentmicky/gbd
def test_data_model_lower_bound():
    """Smoke test: build and briefly sample a 'pf' rate model that is given
    lower_bound='csmr'.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    data_type = 'csmr'

    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()

    # 'csmr' rows first ...
    csmr_rows = data_simulation.simulated_age_intervals(
        data_type, n, ages, pi_age_true, sigma_true)
    model.input_data = csmr_rows

    # ... then 'pf' rows simulated from twice the same age pattern
    pf_rows = data_simulation.simulated_age_intervals(
        'pf', n, ages, pi_age_true * 2., sigma_true)
    model.input_data = model.input_data.append(pf_rows, ignore_index=True)

    model.hierarchy, model.output_template = data_simulation.small_output()

    # model settings collected in one place
    settings = dict(reference_area='all',
                    reference_sex='total',
                    reference_year='all',
                    mu_age=None,
                    mu_age_parent=None,
                    sigma_age_parent=None,
                    lower_bound='csmr')
    rate_vars = ism.age_specific_rate(model, 'pf', **settings)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)
コード例 #4
0
ファイル: test_data_model.py プロジェクト: studentmicky/gbd
def test_data_model_sim():
    """Smoke test: fit a 'p' rate model to simulated data, then rebuild the
    model with the first fit's prediction as the parent prior.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    data_type = 'p'

    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        data_type, n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # both models below share the same reference area/sex/year
    reference = dict(reference_area='all',
                     reference_sex='total',
                     reference_year='all')

    # first model: no prior information on the age pattern
    rate_vars = ism.age_specific_rate(
        model, data_type,
        mu_age=None, mu_age_parent=None, sigma_age_parent=None,
        **reference)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)

    # predict USA / male / 1990 from the recorded draws
    # NOTE(review): the whole parameters dict is passed here, not
    # parameters[data_type] -- confirm this is intended
    pi_usa = covariate_model.predict_for(
        model, model.parameters,
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars[data_type], -pl.inf, pl.inf)

    # second model: mean and std of the prediction become the parent prior
    parent_mean = pi_usa.mean(0)
    parent_std = pi_usa.std(0)
    rate_vars = ism.age_specific_rate(
        model, data_type,
        mu_age=None, mu_age_parent=parent_mean, sigma_age_parent=parent_std,
        **reference)
コード例 #5
0
ファイル: test_covariates.py プロジェクト: aflaxman/gbd
def test_predict_for_wo_data():
    """Exercise covariate_model.predict_for on a model that has no input data.

    Steps:

    1. Build a 'p' rate model from the small output template only
    2. Draw a single MCMC sample (no step methods are set explicitly)
    3. Install constant random-effect priors and predict for USA/male/1990
    4. Compare the prediction with mu_age scaled by the expected effects
    """
    model = data.ModelData()
    model.hierarchy, model.output_template = data_simulation.small_output()

    # model referenced at 'all'/'total'/'all'
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'total', 'all', None, None, None)

    # a single draw is all that is recorded
    sampler = mc.MCMC(rate_vars)
    sampler.sample(1)

    ### Prediction case 1: constant zero random effects

    # one independent Constant prior per node, all centred on zero
    model.parameters['p']['random_effects'] = dict(
        (node, dict(dist='Constant', mu=0, sigma=1.e-9))
        for node in ['USA', 'NAHI', 'super-region-1', 'all'])

    # result is not checked; the call only has to succeed
    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    ### Prediction case 2: constant non-zero random effects
    # FIXME (from original author): PyMC draws beta[0] from its prior here,
    # so the expected fixed effect below is computed from its current value

    # random effect means of .1, .2, .3 for USA, NAHI, super-region-1
    for level, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        model.parameters['p']['random_effects'][node]['mu'] = level / 10.

    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    # expected prediction: mu_age times fixed effect times random effect
    fe_usa_1990 = pl.exp(.5 * rate_vars['p']['beta'][0].value)
    re_usa_1990 = pl.exp(.1 + .2 + .3)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)
コード例 #6
0
def test_predict_for_wo_data():
    """Exercise covariate_model.predict_for on a model that has no input data.

    Steps:

    1. Build a 'p' rate model from the small output template only
    2. Draw a single MCMC sample (no step methods are set explicitly)
    3. Install constant random-effect priors and predict for USA/male/1990
    4. Compare the prediction with mu_age scaled by the expected effects
    """
    model = data.ModelData()
    model.hierarchy, model.output_template = data_simulation.small_output()

    # model referenced at 'all'/'total'/'all'
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'total', 'all', None, None, None)

    # a single draw is all that is recorded
    sampler = mc.MCMC(rate_vars)
    sampler.sample(1)

    ### Prediction case 1: constant zero random effects

    # one independent Constant prior per node, all centred on zero
    model.parameters['p']['random_effects'] = dict(
        (node, dict(dist='Constant', mu=0, sigma=1.e-9))
        for node in ['USA', 'NAHI', 'super-region-1', 'all'])

    # result is not checked; the call only has to succeed
    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    ### Prediction case 2: constant non-zero random effects
    # FIXME (from original author): PyMC draws beta[0] from its prior here,
    # so the expected fixed effect below is computed from its current value

    # random effect means of .1, .2, .3 for USA, NAHI, super-region-1
    for level, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        model.parameters['p']['random_effects'][node]['mu'] = level / 10.

    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'total', 'all',
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    # expected prediction: mu_age times fixed effect times random effect
    fe_usa_1990 = pl.exp(.5 * rate_vars['p']['beta'][0].value)
    re_usa_1990 = pl.exp(.1 + .2 + .3)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)
コード例 #7
0
ファイル: test_covariates.py プロジェクト: aflaxman/gbd
def test_covariate_model_shift_for_root_consistency():
    """Smoke test: predict USA/male/1990 from a 'p' model whose reference is
    'all'/'male'/1990.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # the model is built twice: first with the 'all'/'total'/'all' reference ...
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'total', 'all', None, None, None)

    # ... then with the 'all'/'male'/1990 reference; only this one is sampled
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'male', 1990, None, None, None)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)

    # predict for USA / male / 1990 relative to the second reference
    pi_usa = covariate_model.predict_for(
        model, model.parameters['p'],
        'all', 'male', 1990,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)
コード例 #8
0
def test_predict_for_wo_effects():
    """Check that predict_for returns the mu_age trace for a model built
    with include_covariates=False.

    A 'p' rate model referenced at NAHI/male/2005 is built on simulated
    data, every stochastic is assigned a NoStepper step method, and ten
    samples are recorded. The prediction for USA/male/1990 is then
    compared with the mu_age trace.
    """
    # inputs handed to the data simulator
    n = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # covariates are switched off for this model
    rate_vars = ism.age_specific_rate(
        model, 'p', 'NAHI', 'male', 2005, None, None, None,
        include_covariates=False)

    # assign NoStepper to every stochastic before sampling
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(10)

    ### Prediction case: prediction should match mu age
    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'NAHI', 'male', 2005,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    expected = rate_vars['p']['mu_age'].trace()
    assert_almost_equal(pred, expected)
コード例 #9
0
ファイル: test_covariates.py プロジェクト: aflaxman/gbd
def test_predict_for_wo_effects():
    """Check that predict_for returns the mu_age trace for a model built
    with include_covariates=False.

    A 'p' rate model referenced at NAHI/male/2005 is built on simulated
    data, every stochastic is assigned a NoStepper step method, and ten
    samples are recorded. The prediction for USA/male/1990 is then
    compared with the mu_age trace.
    """
    # inputs handed to the data simulator
    n = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # covariates are switched off for this model
    rate_vars = ism.age_specific_rate(
        model, 'p', 'NAHI', 'male', 2005, None, None, None,
        include_covariates=False)

    # assign NoStepper to every stochastic before sampling
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(10)

    ### Prediction case: prediction should match mu age
    pred = covariate_model.predict_for(
        model, model.parameters['p'],
        'NAHI', 'male', 2005,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)

    expected = rate_vars['p']['mu_age'].trace()
    assert_almost_equal(pred, expected)
コード例 #10
0
ファイル: test_data_model.py プロジェクト: aflaxman/gbd
def test_data_model_lower_bound():
    """Smoke test: build and briefly sample a 'pf' rate model that is given
    lower_bound='csmr'.

    There are no assertions; the test passes when every step runs without
    raising.
    """
    data_type = 'csmr'

    # inputs handed to the data simulator
    n = 50
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()

    # 'csmr' rows first ...
    csmr_rows = data_simulation.simulated_age_intervals(
        data_type, n, ages, pi_age_true, sigma_true)
    model.input_data = csmr_rows

    # ... then 'pf' rows simulated from twice the same age pattern
    pf_rows = data_simulation.simulated_age_intervals(
        'pf', n, ages, pi_age_true * 2., sigma_true)
    model.input_data = model.input_data.append(pf_rows, ignore_index=True)

    model.hierarchy, model.output_template = data_simulation.small_output()

    # model settings collected in one place
    settings = dict(reference_area='all',
                    reference_sex='total',
                    reference_year='all',
                    mu_age=None,
                    mu_age_parent=None,
                    sigma_age_parent=None,
                    lower_bound='csmr')
    rate_vars = ism.age_specific_rate(model, 'pf', **settings)

    # three MCMC draws; just enough to exercise the code path
    sampler = mc.MCMC(rate_vars)
    sampler.sample(3)
コード例 #11
0
def test_predict_for():
    """Check covariate_model.predict_for against hand-computed expectations.

    Approach:

    1. Build a 'p' rate model on simulated data
    2. Assign NoStepper to every stochastic, then record three samples
    3. Install constant random-effect priors with chosen means
    4. Predict, and compare with the mu_age trace scaled by the expected
       fixed-effect and random-effect multipliers

    Returns the ModelData object, with the last model's variables stored
    on its ``vars`` attribute.
    """
    # inputs handed to the data simulator
    n = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    def predict_from_root(area):
        # prediction for area/male/1990 relative to 'all'/'total'/'all';
        # rate_vars is looked up at call time, so this follows the rebinding
        # further down
        return covariate_model.predict_for(
            model, model.parameters['p'],
            'all', 'total', 'all',
            area, 'male', 1990,
            0., rate_vars['p'], 0., pl.inf)

    def check_against_mu_age(pred, fe, re):
        # expected value is the mu_age trace times both effect multipliers
        assert_almost_equal(pred, rate_vars['p']['mu_age'].trace() * fe * re)

    # model referenced at 'all'/'total'/'all'
    rate_vars = ism.age_specific_rate(
        model, 'p', 'all', 'total', 'all', None, None, None)

    # assign NoStepper to every stochastic before sampling
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(3)

    ### Prediction case 1: constant zero random effects, zero fixed effect coefficients

    # one independent Constant prior per node, all centred on zero
    model.parameters['p']['random_effects'] = dict(
        (node, dict(dist='Constant', mu=0, sigma=1.e-9))
        for node in ['USA', 'CAN', 'NAHI', 'super-region-1', 'all'])

    check_against_mu_age(predict_from_root('USA'), 1., 1.)

    ### Prediction case 2: constant non-zero random effects, zero fixed effect coefficients

    # random effect means of .1, .2, .3 for USA, NAHI, super-region-1
    for level, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        model.parameters['p']['random_effects'][node]['mu'] = level / 10.

    check_against_mu_age(predict_from_root('USA'), 1., pl.exp(.1 + .2 + .3))

    ### Prediction case 3: confirm that changing RE for reference area does not change results

    model.parameters['p']['random_effects']['all']['mu'] = 1.

    # same multiplier as case 2: only the prior for 'all' was changed
    check_against_mu_age(predict_from_root('USA'), 1., pl.exp(.1 + .2 + .3))

    ### Prediction case 4: see that prediction of CAN includes region and super-region effect, but not USA effect

    # CAN's own prior mean is still 0, so only .2 and .3 contribute
    check_against_mu_age(predict_from_root('CAN'), 1., pl.exp(0. + .2 + .3))

    # rebuild the model with USA/male/1990 as the reference
    rate_vars = ism.age_specific_rate(
        model, 'p', 'USA', 'male', 1990, None, None, None)

    # again NoStepper for every stochastic
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(3)

    # predicting the reference itself should give back mu_age
    pi_usa = covariate_model.predict_for(
        model, model.parameters['p'],
        'USA', 'male', 1990,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)
    assert_almost_equal(pi_usa, rate_vars['p']['mu_age'].trace())

    ### Prediction case 5: confirm that const RE prior with sigma = 0 does not crash

    for node in ['USA', 'CAN']:
        model.parameters['p']['random_effects'][node]['sigma'] = 0.

    # result is not checked; the call only has to succeed
    pred = predict_from_root('NAHI')

    model.vars = rate_vars
    return model
コード例 #12
0
def fit_emp_prior(id,
                  param_type,
                  fast_fit=False,
                  generate_emp_priors=True,
                  zero_re=True,
                  alt_prior=False,
                  global_heterogeneity='Slightly'):
    """ Fit empirical prior of specified type for specified model

    Loads the disease model, fits one age-specific rate model to the data
    of the requested type with reference 'all'/'total'/'all', stores fit
    statistics on the model's data table, optionally generates empirical
    priors for every GBD region/sex/year, writes diagnostic plots to the
    job working directory and finally saves the disease model.

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for; any other
      value raises KeyError from the lookup table in the body
    fast_fit : bool
      If true, fit with iter=101, burn=0, thin=1, tune_interval=100;
      otherwise iter=50000, burn=10000, thin=40, tune_interval=1000
    generate_emp_priors : bool
      If true, predict for every entry of dismod3.settings.gbd_regions,
      gbd_sexes and gbd_years and store 'emp_prior_mean' and
      'emp_prior_std' on the disease model
    zero_re : bool
      Passed through unchanged to ism.age_specific_rate
    alt_prior : bool
      Passed through unchanged to covariate_model.predict_for
    global_heterogeneity : str
      Value written to the 'heterogeneity' entry of every parameter type
      that already has one

    Returns
    -------
    The loaded disease model if there is no data of the requested type;
    otherwise nothing is returned explicitly (None).

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    # working directory for this job; all plots below are written here
    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE(review): the save call is commented out, so the message printed
        # below ("saved in new format") does not match what actually happens
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to global_heterogeneity (default 'Slightly') for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    # translate the long parameter name into the short data type code;
    # this rebinds t, which was the loop variable above
    t = {
        'incidence': 'i',
        'prevalence': 'p',
        'remission': 'r',
        'excess-mortality': 'f',
        'prevalence_x_excess-mortality': 'pf'
    }[param_type]
    # keep only the rows of the requested type; nothing to fit if there are none
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print 'No data for type %s, exiting' % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    #model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    #missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    #model.input_data['effective_sample_size'][missing_ess] = 1.
    #model.input_data['z_overdisperse'] = 1.
    #print model.describe(t)
    #model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    #model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    #model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    #model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    #generate_emp_priors = False

    print 'fitting', t
    # NOTE(review): t can only be one of 'i', 'p', 'r', 'f', 'pf' (see the
    # lookup table above), so the rate_type expression below always evaluates
    # to 'neg_binom'; the 'rr' branch is unreachable from this function
    # NOTE(review): the result is merged into model.vars with +=; this assumes
    # model.vars supports in-place addition of the returned value -- confirm
    model.vars += ism.age_specific_rate(model,
                                        t,
                                        reference_area='all',
                                        reference_sex='total',
                                        reference_year='all',
                                        mu_age=None,
                                        mu_age_parent=None,
                                        sigma_age_parent=None,
                                        rate_type=(t == 'rr') and 'log_normal'
                                        or 'neg_binom',
                                        zero_re=zero_re)
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    # fit: a short run for testing, or the full-length run
    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=101,
                                              burn=0,
                                              thin=1,
                                              tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=50000,
                                              burn=10000,
                                              thin=40,
                                              tune_interval=1000,
                                              verbose=True)

    # attach mean and standard deviation of p_pred to the data table
    stats = dm.vars['p_pred'].stats(batches=5)
    dm.vars['data']['mu_pred'] = stats['mean']
    dm.vars['data']['sigma_pred'] = stats['standard deviation']

    # the MC error column is taken from pi, not from p_pred
    stats = dm.vars['pi'].stats(batches=5)
    dm.vars['data']['mc_error'] = stats['mc error']

    # residual = observed value minus mean of p_pred, plus its absolute value
    dm.vars['data'][
        'residual'] = dm.vars['data']['value'] - dm.vars['data']['mu_pred']
    dm.vars['data']['abs_residual'] = pl.absolute(dm.vars['data']['residual'])

    graphics.plot_fit(model,
                      data_types=[t],
                      ylab=['PY'],
                      plot_config=(1, 1),
                      fig_size=(8, 8))
    if generate_emp_priors:
        # one prediction per (region, sex, year); each mean curve is also
        # drawn onto the current figure
        for a in [
                dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions
        ]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    # use the configured level bounds when present,
                    # otherwise fall back to [0, inf)
                    if t in model.parameters and 'level_bounds' in model.parameters[
                            t]:
                        lower = model.parameters[t]['level_bounds']['lower']
                        upper = model.parameters[t]['level_bounds']['upper']
                    else:
                        lower = 0
                        upper = pl.inf

                    emp_priors = covariate_model.predict_for(
                        model, model.parameters[t], 'all', 'total', 'all', a,
                        dismod3.utils.clean(s), int(y), alt_prior, vars, lower,
                        upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))

                    # when the model has an 'eta' variable, the std adds the
                    # mean of emp_priors**2 / exp(eta) to the variance across
                    # draws; otherwise it is the plain std across draws
                    if 'eta' in vars:
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([
                            pl.exp(vars['eta'].trace()) for _ in range(A)
                        ])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(
                            emp_priors.var(0) +
                            (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

                    pl.plot(model.parameters['ages'],
                            dm.get_mcmc('emp_prior_mean', key),
                            color='grey',
                            label=a,
                            zorder=-10,
                            alpha=.5)
    # the figure is saved whether or not empirical priors were generated
    pl.savefig(dir + '/prior-%s.png' % param_type)

    store_effect_coefficients(dm, vars, param_type)

    #graphics.plot_one_ppc(vars, t)
    #pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    # diagnostic plots: autocorrelation, traces, effects
    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-%s-convergence.png' % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-%s-trace.png' % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + '/prior-%s-effects.png' % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    # an IOError while saving is printed and otherwise ignored
    try:
        dm.save('dm-%d-prior-%s.json' % (id, param_type))
    except IOError, e:
        print e
コード例 #13
0
def fit_posterior(dm,
                  region,
                  sex,
                  year,
                  fast_fit=False,
                  inconsistent_fit=False,
                  params_to_fit=['p', 'r', 'i'],
                  zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
      Disease model; dm.id selects the job working directory and the
      empirical priors stored in dm are used as priors for this fit
    region : str
      From dismod3.settings.gbd_regions (it is clean()-ed here)
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
      The data selection only implements 1990, 2005 and 2010

    fast_fit : bool, if true take a short run (iter=101 per rate for an
      inconsistent fit, 105 for a consistent fit) without trying for
      convergence (fast for testing)
    inconsistent_fit : bool, if true fit parameters separately
    params_to_fit : list of params to fit, if not fitting all consistently

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and from 2007 on for 2010

    Notes
    -----
    For a consistent fit, the results of the fit are stored in dm.map
    and dm.mcmc.  The input data of the loaded model is narrowed to the
    rows relevant for the requested region/sex/year.

    Example
    -------
    >>> import dismod3
    >>> import fit_posterior
    >>> dm = dismod3.load_disease_model(2552)
    >>> fit_posterior.fit_posterior(dm, 'asia_east', 'male', '2005')
    """
    job_dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(job_dir)
        print('loaded data from new format from %s' % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE: saving in the new format is currently disabled
        # (model.save(job_dir)), so the message below overstates what happens
        print('loaded data from json, saved in new format for next time in %s' % job_dir)

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence',
                      p='prevalence',
                      r='remission',
                      f='excess-mortality',
                      rr='relative-risk',
                      pf='prevalence_x_excess-mortality',
                      m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():
        # To skip the empirical prior for a rate with zero data, `continue`
        # here when pl.all(model.input_data['data_type'] != t).

        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        # only use the stored prior when it covers the full 101-point age grid
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            # (alternative: sigma * pl.sqrt(len(sigma)), so that the joint
            # probability is not a function of the length of the age function)
            emp_priors[t, 'sigma'] = sigma

        ## update model.parameters['random_effects'] if there is information in the disease model
        # expert priors are applied last, so they win over empirical ones
        expert_priors = model.parameters[t].get('random_effects', {})
        random_effects = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'] = random_effects
        random_effects.update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        observed_areas = [area for area in model.hierarchy.neighbors(predict_area)
                          if area in random_effects]
        if observed_areas:  # avoid taking the mean of an empty list (nan + warning)
            re_mean = pl.mean([random_effects[area]['mu'] for area in observed_areas])
            for area in observed_areas:
                random_effects[area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # setdefault: do not fail when no fixed effects were specified for this rate
        model.parameters[t].setdefault('fixed_effects', {}).update(
            dm.get_empirical_prior(param_type[t]).get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        """ Return True if data row r should inform this area/sex/year fit"""
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    # previous implementation of the row selection, kept only to cross-check
    # the new one in the cases where both are expected to agree
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant,
                                                                  axis=1)]

    if predict_year == 1990:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    # NOTE(review): iter, burn and thin are not bound inside this function;
    # presumably they are module-level sampler settings -- confirm, since
    # otherwise `iter` resolves to the builtin and burn/thin are undefined.
    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(
                model,
                t,
                reference_area=predict_area,
                reference_sex=predict_sex,
                reference_year=predict_year,
                mu_age=None,
                mu_age_parent=emp_priors.get((t, 'mu')),
                sigma_age_parent=emp_priors.get((t, 'sigma')),
                rate_type='log_normal' if t == 'rr' else 'neg_binom',
                zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=101,
                                    burn=0,
                                    thin=1,
                                    tune_interval=100)
            else:
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=iter,
                                    burn=burn,
                                    thin=thin,
                                    tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area,
                                     reference_sex=predict_sex,
                                     reference_year=predict_year,
                                     priors=emp_priors,
                                     zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model,
                                                         iter=iter,
                                                         burn=burn,
                                                         thin=thin,
                                                         tune_interval=100,
                                                         verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf
            posteriors[t] = covariate_model.predict_for(
                model,
                model.parameters.get(t, {}),
                predict_area,
                predict_sex,
                predict_year,
                predict_area,
                predict_sex,
                predict_year,
                True,  # population weighted averages
                model.vars[t],
                lower,
                upper)

    # plotting is best-effort: a failure here must not lose the fit
    try:
        # NOTE(review): `vars` is not bound inside this function; unless a
        # module-level `vars` exists this passes the builtin -- confirm.
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(job_dir + '/image/posterior-%s+%s+%s.png' %
                   (predict_area, predict_sex, predict_year))
    except Exception as e:
        print('Error generating output graphics')
        print(e)
コード例 #14
0
ファイル: test_covariates.py プロジェクト: aflaxman/gbd
def test_predict_for_w_region_as_reference():
    """ Check predict_for when the reference area is a region (NAHI)

    Strategy:

    1. Build a rate model on simulated prevalence data
    2. Attach NoStepper to every stochastic of the MCMC
    3. Sample, to generate a trace with known values
    4. Call predict_for under several random effect priors, and confirm
       that the results match values computed from the mu_age trace
    """

    # simulate a handful of prevalence rows from a known age pattern
    n_rows = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    # rate model with NAHI/male/2005 as the reference
    rate_vars = ism.age_specific_rate(model, 'p', 'NAHI', 'male', 2005,
                                      None, None, None)

    # use NoStepper for every stochastic, then draw the trace
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(10)

    def predict(area):
        # male/1990 prediction for `area`, with NAHI/male/2005 as reference
        return covariate_model.predict_for(
            model, model.parameters['p'],
            'NAHI', 'male', 2005,
            area, 'male', 1990,
            0., rate_vars['p'], 0., pl.inf)

    re_nodes = ['USA', 'NAHI', 'super-region-1', 'all']

    ### Case 1: constant random effects at zero, fixed effect coefficients at zero

    constant_re = {}
    for node in re_nodes:
        # zero out REs to see if test passes
        constant_re[node] = dict(dist='Constant', mu=0, sigma=1.e-9)
    model.parameters['p']['random_effects'] = constant_re

    pred = predict('USA')

    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(0.)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Case 2: constant random effects away from zero, fixed effect coefficients at zero

    for rank, node in enumerate(re_nodes, 1):
        model.parameters['p']['random_effects'][node]['mu'] = rank / 10.

    pred = predict('USA')

    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(.1)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Case 3: random effect not constant (CAN has no prior), fixed effect coefficients at zero

    # seed before predicting, so the random draws can be reproduced below
    pl.np.random.seed(12345)
    pred = predict('CAN')

    # reseed and redraw the random effect from the sigma_alpha trace
    pl.np.random.seed(12345)
    fe = pl.exp(0.)
    tau = rate_vars['p']['sigma_alpha'][3].trace()**-2
    re = pl.exp(mc.rnormal(0., tau))
    expected = (rate_vars['p']['mu_age'].trace().T * fe * re).T.mean(0)
    assert_almost_equal(pred.mean(0), expected)
コード例 #15
0
ファイル: test_covariates.py プロジェクト: aflaxman/gbd
def test_predict_for():
    """ Check predict_for against values computed directly from the trace

    Strategy:

    1. Build a rate model on simulated prevalence data
    2. Attach NoStepper to every stochastic of the MCMC
    3. Sample, to generate a trace with known values
    4. Call predict_for under several random effect priors, and confirm
       that the results match values computed from the mu_age trace

    Returns the ModelData object, with the last set of model variables
    attached as its vars attribute.
    """

    # simulate a handful of prevalence rows from a known age pattern
    n_rows = 5
    sigma_true = .025
    ages = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (ages * (100. - ages) + 100.)

    model = data.ModelData()
    model.input_data = data_simulation.simulated_age_intervals(
        'p', n_rows, ages, pi_age_true, sigma_true)
    model.hierarchy, model.output_template = data_simulation.small_output()

    def predict_from_root(model_vars, area):
        # male/1990 prediction for `area`, with all/total/all as reference
        return covariate_model.predict_for(
            model, model.parameters['p'],
            'all', 'total', 'all',
            area, 'male', 1990,
            0., model_vars['p'], 0., pl.inf)

    # rate model with all/total/all as the reference
    rate_vars = ism.age_specific_rate(model, 'p', 'all', 'total', 'all',
                                      None, None, None)

    # use NoStepper for every stochastic, then draw the trace
    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(3)

    ### Case 1: constant random effects at zero, fixed effect coefficients at zero

    constant_re = {}
    for node in ['USA', 'CAN', 'NAHI', 'super-region-1', 'all']:
        # zero out REs to see if test passes
        constant_re[node] = dict(dist='Constant', mu=0, sigma=1.e-9)
    model.parameters['p']['random_effects'] = constant_re

    pred = predict_from_root(rate_vars, 'USA')

    fe_usa_1990 = 1.
    re_usa_1990 = 1.
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Case 2: constant random effects away from zero, fixed effect coefficients at zero

    for rank, node in enumerate(['USA', 'NAHI', 'super-region-1'], 1):
        model.parameters['p']['random_effects'][node]['mu'] = rank / 10.

    pred = predict_from_root(rate_vars, 'USA')

    fe_usa_1990 = 1.
    re_usa_1990 = pl.exp(.1+.2+.3)
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Case 3: changing the RE of the reference area leaves the prediction as it was

    model.parameters['p']['random_effects']['all']['mu'] = 1.

    pred = predict_from_root(rate_vars, 'USA')

    fe_usa_1990 = 1.
    re_usa_1990 = pl.exp(.1+.2+.3)  # same as case 2; only alpha_all was changed
    expected = rate_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, expected)

    ### Case 4: prediction for CAN picks up the region and super-region effects, but not the USA effect

    pred = predict_from_root(rate_vars, 'CAN')

    fe = 1.
    re = pl.exp(0.+.2+.3)  # CAN's own effect is still zero
    expected = rate_vars['p']['mu_age'].trace() * fe * re
    assert_almost_equal(pred, expected)

    # second model: USA/male/1990 is now the reference itself
    rate_vars = ism.age_specific_rate(model, 'p', 'USA', 'male', 1990,
                                      None, None, None)

    sampler = mc.MCMC(rate_vars)
    for stoch in sampler.stochastics:
        sampler.use_step_method(mc.NoStepper, stoch)
    sampler.sample(3)

    # predicting for the reference should give back the mu_age trace
    pi_usa = covariate_model.predict_for(
        model, model.parameters['p'],
        'USA', 'male', 1990,
        'USA', 'male', 1990,
        0., rate_vars['p'], 0., pl.inf)
    assert_almost_equal(pi_usa, rate_vars['p']['mu_age'].trace())

    ### Case 5: constant RE priors with sigma = 0 should not crash

    for node in ['USA', 'CAN']:
        model.parameters['p']['random_effects'][node]['sigma'] = 0.

    pred = predict_from_root(rate_vars, 'NAHI')

    model.vars = rate_vars
    return model
コード例 #16
0
def test_predict_for_w_region_as_reference():
    """ Exercise predict_for with a region (NAHI) as the reference area

    Outline of the test:

    1. Fit-free setup: a rate model built on simulated prevalence data
    2. Every stochastic of the MCMC gets a NoStepper
    3. Sampling then generates a trace with known values
    4. predict_for output is compared with values derived from that trace
    """

    # simulated input: a few prevalence rows drawn around a known age pattern
    sample_size = 5
    sigma_true = .025
    age_grid = pl.arange(0, 100, 1)
    pi_age_true = .0001 * (age_grid * (100. - age_grid) + 100.)

    md = data.ModelData()
    md.input_data = data_simulation.simulated_age_intervals(
        'p', sample_size, age_grid, pi_age_true, sigma_true)
    md.hierarchy, md.output_template = data_simulation.small_output()

    # model and priors, referenced to NAHI/male/2005
    model_vars = ism.age_specific_rate(
        md, 'p', 'NAHI', 'male', 2005, None, None, None)

    # NoStepper on all stochastics, followed by a short sampling run
    mcmc = mc.MCMC(model_vars)
    for node in mcmc.stochastics:
        mcmc.use_step_method(mc.NoStepper, node)
    mcmc.sample(10)

    def predict_male_1990(area):
        # prediction for area/male/1990 relative to the NAHI/male/2005 reference
        return covariate_model.predict_for(
            md, md.parameters['p'], 'NAHI', 'male', 2005,
            area, 'male', 1990, 0., model_vars['p'], 0., pl.inf)

    areas_with_prior = ['USA', 'NAHI', 'super-region-1', 'all']

    ### Prediction 1: zero-valued constant REs and zero fixed effect coefficients

    re_priors = {}
    for area in areas_with_prior:
        # zero out REs to see if test passes
        re_priors[area] = dict(dist='Constant', mu=0, sigma=1.e-9)
    md.parameters['p']['random_effects'] = re_priors

    pred = predict_male_1990('USA')

    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(0.)
    want = model_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, want)

    ### Prediction 2: non-zero constant REs and zero fixed effect coefficients

    for position, area in enumerate(areas_with_prior, 1):
        md.parameters['p']['random_effects'][area]['mu'] = position / 10.

    pred = predict_male_1990('USA')

    fe_usa_1990 = pl.exp(0.)
    re_usa_1990 = pl.exp(.1)
    want = model_vars['p']['mu_age'].trace() * fe_usa_1990 * re_usa_1990
    assert_almost_equal(pred, want)

    ### Prediction 3: non-constant RE (no prior for CAN) and zero fixed effect coefficients

    # fix the seed so the draws inside predict_for can be repeated
    pl.np.random.seed(12345)
    pred = predict_male_1990('CAN')

    # same seed again, then draw the random effect from the sigma_alpha trace
    pl.np.random.seed(12345)
    fe = pl.exp(0.)
    precision = model_vars['p']['sigma_alpha'][3].trace()**-2
    re = pl.exp(mc.rnormal(0., precision))
    want = (model_vars['p']['mu_age'].trace().T * fe * re).T.mean(0)
    assert_almost_equal(pred.mean(0), want)
コード例 #17
0
ファイル: fit_emp_prior.py プロジェクト: aflaxman/gbd
def fit_emp_prior(
    id,
    param_type,
    fast_fit=False,
    generate_emp_priors=True,
    zero_re=True,
    alt_prior=False,
    global_heterogeneity="Slightly",
):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for
    fast_fit : bool
      If true, sample 101 draws without burn-in or thinning (fast for
      testing) instead of the full-length run
    generate_emp_priors : bool
      If true, predict and store empirical prior mean and std for every
      region/sex/year; set to false to speed up output
    zero_re : bool
      Passed through to ism.age_specific_rate
    alt_prior : bool
      Passed through to covariate_model.predict_for
    global_heterogeneity : str
      Value that replaces every existing heterogeneity prior for this
      global fit

    Returns
    -------
    dm : the disease model loaded for `id`.  When there is no data of the
      requested type it is returned unchanged, before any fitting.

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    job_dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data

    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(job_dir)
        print("loaded data from new format from %s" % job_dir)
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE: saving in the new format is currently disabled
        # (model.save(job_dir)), so the message below overstates what happens
        print("loaded data from json, saved in new format for next time in %s" % job_dir)

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith("x_"):
            model.input_data[col] = model.input_data[col].fillna(0.0)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # override all heterogeneity priors with the one requested for the global fit
    for data_type in model.parameters:
        if "heterogeneity" in model.parameters[data_type]:
            model.parameters[data_type]["heterogeneity"] = global_heterogeneity

    # short data type code for the requested parameter (KeyError if unknown)
    t = {
        "incidence": "i",
        "prevalence": "p",
        "remission": "r",
        "excess-mortality": "f",
        "prevalence_x_excess-mortality": "pf",
    }[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print("No data for type %s, exiting" % param_type)
        return dm

    ### For testing, computation can be sped up at this point by
    ## - reducing the number of knots:
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]
    ## - smoothing Slightly, Moderately, or Very:
    ##     model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')
    ## - reducing the data size, e.g. model.input_data = model.input_data[:100]
    ## - not making predictions for empirical priors (generate_emp_priors=False)

    print("fitting %s" % t)
    model.vars += ism.age_specific_rate(
        model,
        t,
        reference_area="all",
        reference_sex="total",
        reference_year="all",
        mu_age=None,
        mu_age_parent=None,
        sigma_age_parent=None,
        rate_type="log_normal" if t == "rr" else "neg_binom",
        zero_re=zero_re,
    )
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    rate_vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(
            model, t, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True
        )

    # attach posterior predictive summaries and residuals to the data table
    stats = rate_vars["p_pred"].stats(batches=5)
    rate_vars["data"]["mu_pred"] = stats["mean"]
    rate_vars["data"]["sigma_pred"] = stats["standard deviation"]

    stats = rate_vars["pi"].stats(batches=5)
    rate_vars["data"]["mc_error"] = stats["mc error"]

    rate_vars["data"]["residual"] = rate_vars["data"]["value"] - rate_vars["data"]["mu_pred"]
    rate_vars["data"]["abs_residual"] = pl.absolute(rate_vars["data"]["residual"])

    graphics.plot_fit(model, data_types=[t], ylab=["PY"], plot_config=(1, 1), fig_size=(8, 8))
    if generate_emp_priors:
        # the level bounds do not depend on area/sex/year, so look them up once
        if t in model.parameters and "level_bounds" in model.parameters[t]:
            lower = model.parameters[t]["level_bounds"]["lower"]
            upper = model.parameters[t]["level_bounds"]["upper"]
        else:
            lower = 0
            upper = pl.inf

        for region in dismod3.settings.gbd_regions:
            area = dismod3.utils.clean(region)
            print("generating empirical prior for %s" % area)
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, area, y, s)

                    emp_priors = covariate_model.predict_for(
                        model,
                        model.parameters[t],
                        "all",
                        "total",
                        "all",
                        area,
                        dismod3.utils.clean(s),
                        int(y),
                        alt_prior,
                        rate_vars,
                        lower,
                        upper,
                    )
                    dm.set_mcmc("emp_prior_mean", key, emp_priors.mean(0))

                    if "eta" in rate_vars:
                        # add a term based on the eta trace to the variance
                        # of the predictions
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose(
                            [pl.exp(rate_vars["eta"].trace()) for _ in range(A)]
                        )  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors ** 2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc("emp_prior_std", key, emp_prior_std)

                    pl.plot(
                        model.parameters["ages"],
                        dm.get_mcmc("emp_prior_mean", key),
                        color="grey",
                        label=area,
                        zorder=-10,
                        alpha=0.5,
                    )
    pl.savefig(job_dir + "/prior-%s.png" % param_type)

    store_effect_coefficients(dm, rate_vars, param_type)

    graphics.plot_acorr(model)
    pl.savefig(job_dir + "/prior-%s-convergence.png" % param_type)
    graphics.plot_trace(model)
    pl.savefig(job_dir + "/prior-%s-trace.png" % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(job_dir + "/prior-%s-effects.png" % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save("dm-%d-prior-%s.json" % (id, param_type))
    except IOError as e:
        print(e)

    # return the disease model on this path too, matching the no-data early exit
    return dm
コード例 #18
0
ファイル: fit_posterior.py プロジェクト: aflaxman/gbd
def fit_posterior(dm, region, sex, year, fast_fit=False, 
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    The function loads the model data, installs the empirical priors
    stored in `dm` into the model parameters, restricts the input data
    to the rows relevant for the requested region/sex/year, builds and
    fits either a consistent model or separate age-specific rate
    models, generates predictions for the requested region/sex/year
    and finally tries to save a plot of the fit.

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
      Converted with int(); only 1990, 2005 and 2010 are handled by the
      data selection, any other year trips an assertion.

    fast_fit : bool, if true use a short fixed-length run instead of
      the iter/burn/thin settings (101 iterations for each separate
      rate fit, 105 iterations for the consistent fit); fast for testing
    inconsistent_fit : bool, if true fit each parameter in
      params_to_fit separately instead of fitting a consistent model
    params_to_fit : list of params to fit, if not fitting all consistently

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if true use data from 1997-2007 for 2005 and
      from 2007 on for 2010 (m_all rows are still accepted from 1997 on
      for 2010)

    Side effects
    ------------
    When fitting consistently, dm.map and dm.mcmc are set to the values
    returned by dismod3.fit.fit_consistent.  A posterior plot is saved
    under the job working directory; failures while plotting are
    printed and otherwise ignored.

    Raises
    ------
    AssertionError if the cleaned region is not in the area hierarchy,
    if the year is not one of the handled years, or if the new and old
    row-selection rules disagree where they are expected to match.

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')

    NOTE(review): the example passes an integer as `dm`, but the body
    reads dm.id and calls methods on dm -- confirm the intended call.
    """
    # working directory for this job, derived from the disease model id
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        # fall back to rebuilding the model data from the disease model json
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # NOTE(review): the save call below is commented out, so nothing
        # is written here even though the message says "saved"
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # normalized prediction target
    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    # maps short rate codes to the long names used in disease model keys
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():

        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)
        
        # only use the stored prior when both curves have exactly 101 values
        # (presumably one value per age 0..100 -- TODO confirm)
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma
            
            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        # start from the empirical 'new_alpha' values, then let any
        # existing (expert) entries override them
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        # NOTE(review): if no neighbor of predict_area has a random effect
        # entry, the mean is taken over an empty list; the loop below then
        # has nothing to shift
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean
            

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        # NOTE(review): expert_fe_priors is assigned but not used in the
        # visible code, so the empirical 'new_beta' values overwrite any
        # existing fixed effect entries (unlike the random effects above);
        # the subscript on the next-but-one line also requires that
        # 'fixed_effects' already exists in model.parameters[t]
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(dm.get_empirical_prior(param_type[t]).get('new_beta', {}))


    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of predict_sex or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    # row predicate for the new selection rule: True when input row r
    # should be kept for this area/year/sex
    def is_relevant(r):
        # keep only rows for areas in the subtree of predict_area, or for 'all'
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False


        # year window depends on the prediction year and on posteriors_only
        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        # keep rows for the requested sex, plus rows for both sexes combined
        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True
    
    # previous selection rule, kept only to cross-check the new one below
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    # the two rules are expected to agree for 1990 and whenever
    # posteriors_only is False
    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation when posteriors_only is False"
    
    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    # NOTE(review): iter, burn and thin are not assigned anywhere in the
    # visible part of this function; they must come from module scope
    # (with no module-level binding, `iter` would resolve to the builtin
    # and burn/thin would raise NameError) -- confirm against the full file
    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            # 'rr' uses the 'log_normal' rate type, every other parameter 'neg_binom'
            model.vars += ism.age_specific_rate(model, t,
                                            reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                            mu_age=None,
                                            mu_age_parent=emp_priors.get((t, 'mu')),
                                            sigma_age_parent=emp_priors.get((t, 'sigma')),
                                            rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                                            zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                     priors=emp_priors, zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            # positional arguments; presumably iter, burn, thin, tune_interval
            # as in the keyword call below -- TODO confirm
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)


    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            # use the level bounds from the model parameters when present,
            # otherwise bound predictions to [0, inf)
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower=model.parameters[t]['level_bounds']['lower']
                upper=model.parameters[t]['level_bounds']['upper']
            else:
                lower=0
                upper=pl.inf
            # reference and prediction area/sex/year are the same here
            posteriors[t] = covariate_model.predict_for(model,
                                                        model.parameters.get(t, {}),
                                                        predict_area, predict_sex, predict_year,
                                                        predict_area, predict_sex, predict_year,
                                                        True,  # population weighted averages
                                                        model.vars[t], lower, upper)
    # plotting is best-effort: any failure is printed and otherwise ignored
    try:
        # NOTE(review): `vars` is not assigned in the visible part of this
        # function, so unless the module defines it this is the builtin
        # vars(); model.vars may have been intended -- confirm
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png'%(predict_area, predict_sex, predict_year))
    except Exception, e:
        print 'Error generating output graphics'
        print e