Beispiel #1
0
def test_consistent_model_sim():
    """ Smoke test: build a consistent model on simulated data and draw one MCMC sample

    Simulates age-interval data of type 'p', relabels the last row as
    type 'r' so that more than one data type is present, builds the
    consistent model, samples once, and returns the model variables.
    """
    model_data = data.ModelData()

    # simulated truth: age grid, age pattern, and noise level
    num_rows = 50
    true_sigma = .025
    ages = pl.arange(0, 100, 1)
    true_pi_age = .0001 * (ages * (100. - ages) + 100.)

    model_data.input_data = data_simulation.simulated_age_intervals(
        'p', num_rows, ages, true_pi_age, true_sigma)

    # relabel the final row so the data set contains multiple data types
    final_row = model_data.input_data.index[-1]
    model_data.input_data.ix[final_row, 'data_type'] = 'r'

    # build model and priors
    model_vars = ism.consistent(model_data, 'all', 'total', 'all', {})

    # fit: a single sample is enough to exercise the machinery
    sampler = mc.MCMC(model_vars)
    sampler.sample(1)

    return model_vars
Beispiel #2
0
def test_consistent_model_forward():
    m = data.ModelData()
    vars = ism.consistent(m, 'all', 'total', 'all', {})
    def set_mu_age(vars, x):
        for n in vars['gamma']:
            n.value = pl.log(x)

    set_mu_age(vars['i'], .01)
    set_mu_age(vars['r'], .0001)
    set_mu_age(vars['f'], .0001)
    print vars['p']['mu_age'].value[::10].round(3)

    set_mu_age(vars['i'], .02)
    set_mu_age(vars['r'], .0001)
    set_mu_age(vars['f'], .0001)
    print vars['p']['mu_age'].value[::10].round(3)

    set_mu_age(vars['i'], 2.)
    set_mu_age(vars['r'], 20.)
    set_mu_age(vars['f'], .0001)
    print vars['p']['mu_age'].value[::10].round(3)
Beispiel #3
0
def fit_posterior(dm,
                  region,
                  sex,
                  year,
                  fast_fit=False,
                  inconsistent_fit=False,
                  params_to_fit=['p', 'r', 'i'],
                  zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years

    fast_fit : sample 101 draws from posterior, don't try for convergence (fast for testing)
    inconsistent_fit : fit parameters  separately
    params_to_fit : list of params to fit, if not fitting all consistently

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if tru use data from 1997-2007 for 2005 and from 2007 on for 2010

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')
    """
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence',
                      p='prevalence',
                      r='remission',
                      f='excess-mortality',
                      rr='relative-risk',
                      pf='prevalence_x_excess-mortality',
                      m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():

        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)

        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma

            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(
            param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(
            dm.get_empirical_prior(param_type[t]).get('new_beta', {}))

    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False

        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True

    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant,
                                                                  axis=1)]

    if predict_year == 1990:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(
            relevant_rows == old_relevant_rows
        ), "relevant rows should be the same in new and old implementation when posteriors_only is False"

    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(
                model,
                t,
                reference_area=predict_area,
                reference_sex=predict_sex,
                reference_year=predict_year,
                mu_age=None,
                mu_age_parent=emp_priors.get((t, 'mu')),
                sigma_age_parent=emp_priors.get((t, 'sigma')),
                rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=101,
                                    burn=0,
                                    thin=1,
                                    tune_interval=100)
            else:
                dismod3.fit.fit_asr(model,
                                    t,
                                    iter=iter,
                                    burn=burn,
                                    thin=thin,
                                    tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area,
                                     reference_sex=predict_sex,
                                     reference_year=predict_year,
                                     priors=emp_priors,
                                     zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model,
                                                         iter=iter,
                                                         burn=burn,
                                                         thin=thin,
                                                         tune_interval=100,
                                                         verbose=True)

    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower = model.parameters[t]['level_bounds']['lower']
                upper = model.parameters[t]['level_bounds']['upper']
            else:
                lower = 0
                upper = pl.inf
            posteriors[t] = covariate_model.predict_for(
                model,
                model.parameters.get(t, {}),
                predict_area,
                predict_sex,
                predict_year,
                predict_area,
                predict_sex,
                predict_year,
                True,  # population weighted averages
                model.vars[t],
                lower,
                upper)
    try:
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png' %
                   (predict_area, predict_sex, predict_year))
    except Exception, e:
        print 'Error generating output graphics'
        print e
Beispiel #4
0
def fit_posterior(dm, region, sex, year, fast_fit=False, 
                  inconsistent_fit=False, params_to_fit=['p', 'r', 'i'], zero_re=True,
                  posteriors_only=False):
    """ Fit posterior of specified region/sex/year for specified model

    Parameters
    ----------
    dm : DiseaseJson
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years

    fast_fit : sample 101 draws from posterior, don't try for convergence (fast for testing)
    inconsistent_fit : fit parameters  separately
    params_to_fit : list of params to fit, if not fitting all consistently

    zero_re : bool, if true, enforce constraint that sibling area REs sum to zero
    posteriors_only : bool, if tru use data from 1997-2007 for 2005 and from 2007 on for 2010

    Example
    -------
    >>> import fit_posterior
    >>> fit_posterior.fit_posterior(2552, 'asia_east', 'male', '2005')
    """
    dir = dismod3.settings.JOB_WORKING_DIR % dm.id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    # TODO: check for missing covariates, and have them fixed, instead of filling them with zeros

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    predict_area = dismod3.utils.clean(region)
    predict_sex = dismod3.utils.clean(sex)
    predict_year = int(year)

    ## load emp_priors dict from dm.params
    param_type = dict(i='incidence', p='prevalence', r='remission', f='excess-mortality', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')
    emp_priors = {}
    for t in 'i r p f'.split():

        # uncomment below to not use empirical prior for rate with zero data
        # if pl.all(model.input_data['data_type'] != t):
        #     continue

        #key = dismod3.utils.gbd_key_for(param_type[t], model.hierarchy.predecessors(predict_area)[0], year, sex)
        key = dismod3.utils.gbd_key_for(param_type[t], predict_area, year, sex)
        mu = dm.get_mcmc('emp_prior_mean', key)
        #mu = dm.get_mcmc('emp_prior_median', key)
        sigma = dm.get_mcmc('emp_prior_std', key)
        
        if len(mu) == 101 and len(sigma) == 101:
            emp_priors[t, 'mu'] = mu

            # TODO: determine best way to propagate prior on function
            emp_priors[t, 'sigma'] = sigma
            
            # ALT 1: scale so that the joint probability is not a
            # function of the length of the age function
            # emp_priors[t, 'sigma'] = sigma * pl.sqrt(len(sigma))

        ## update model.parameters['random_effects'] if there is information in the disease model
        expert_priors = model.parameters[t].get('random_effects', {})
        model.parameters[t]['random_effects'] = dm.get_empirical_prior(param_type[t]).get('new_alpha', {})
        model.parameters[t]['random_effects'].update(expert_priors)

        # shift random effects to make REs for observed children of predict area have mean zero
        re_mean = pl.mean([model.parameters[t]['random_effects'][area]['mu'] \
                           for area in model.hierarchy.neighbors(predict_area) \
                           if area in model.parameters[t]['random_effects']])
        for area in model.hierarchy.neighbors(predict_area):
            if area in model.parameters[t]['random_effects']:
                model.parameters[t]['random_effects'][area]['mu'] -= re_mean
            

        ## update model.parameters['fixed_effects'] if there is information in the disease model
        expert_fe_priors = model.parameters[t].get('fixed_effects', {})
        model.parameters[t]['fixed_effects'].update(dm.get_empirical_prior(param_type[t]).get('new_beta', {}))


    ## create model and priors for region/sex/year
    # select data that is about areas in this region, recent years, and sex of male or total only
    assert predict_area in model.hierarchy, 'region %s not found in area hierarchy' % predict_area
    subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)

    def is_relevant(r):
        if (r['area'] not in subtree) and r['area'] != 'all':
            return False


        if predict_year == 1990:
            if r['year_start'] > 1997:
                return False
        elif predict_year == 2005:
            if posteriors_only:
                if r['year_end'] < 1997 or r['year_start'] > 2007:
                    return False
            else:
                if r['year_end'] < 1997:
                    return False
        elif predict_year == 2010:
            if posteriors_only:
                if r['data_type'] == 'm_all':
                    # include m_all data from 2005, since 2010 is not loaded
                    if r['year_end'] < 1997:
                        return False
                else:
                    if r['year_end'] < 2007:
                        return False
            else:
                if r['year_end'] < 1997:
                    return False
        else:
            assert 0, 'Predictions for year %d not yet implemented' % predict_year

        if r['sex'] not in [predict_sex, 'total']:
            return False

        return True
    
    old_relevant_rows = [i for i, r in model.input_data.T.iteritems() \
                         if (r['area'] in subtree or r['area'] == 'all')\
                         and ((predict_year >= 1997 and r['year_end'] >= 1997) or
                              (predict_year <= 1997 and r['year_start'] <= 1997)) \
                         and r['sex'] in [predict_sex, 'total']]

    relevant_rows = model.input_data.index[model.input_data.apply(is_relevant, axis=1)]

    if predict_year == 1990:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation for 1990"

    if not posteriors_only:
        assert pl.all(relevant_rows == old_relevant_rows), "relevant rows should be the same in new and old implementation when posteriors_only is False"
    
    model.input_data = model.input_data.ix[relevant_rows]

    # replace area 'all' with predict_area
    model.input_data['area'][model.input_data['area'] == 'all'] = predict_area

    if inconsistent_fit:
        # generate fits for requested parameters inconsistently
        for t in params_to_fit:
            model.vars += ism.age_specific_rate(model, t,
                                            reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                            mu_age=None,
                                            mu_age_parent=emp_priors.get((t, 'mu')),
                                            sigma_age_parent=emp_priors.get((t, 'sigma')),
                                            rate_type=(t == 'rr') and 'log_normal' or 'neg_binom',
                                            zero_re=zero_re)
            if fast_fit:
                dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
            else:
                dismod3.fit.fit_asr(model, t, iter=iter, burn=burn, thin=thin, tune_interval=100)

    else:
        model.vars += ism.consistent(model,
                                     reference_area=predict_area, reference_sex=predict_sex, reference_year=predict_year,
                                     priors=emp_priors, zero_re=zero_re)

        ## fit model to data
        if fast_fit:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
        else:
            dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=iter, burn=burn, thin=thin, tune_interval=100, verbose=True)


    # generate estimates
    posteriors = {}
    for t in 'i r f p rr pf m_with X'.split():
        if t in model.vars:
            if t in model.parameters and 'level_bounds' in model.parameters[t]:
                lower=model.parameters[t]['level_bounds']['lower']
                upper=model.parameters[t]['level_bounds']['upper']
            else:
                lower=0
                upper=pl.inf
            posteriors[t] = covariate_model.predict_for(model,
                                                        model.parameters.get(t, {}),
                                                        predict_area, predict_sex, predict_year,
                                                        predict_area, predict_sex, predict_year,
                                                        True,  # population weighted averages
                                                        model.vars[t], lower, upper)
    try:
        graphics.plot_fit(model, vars, emp_priors, {})
        pl.savefig(dir + '/image/posterior-%s+%s+%s.png'%(predict_area, predict_sex, predict_year))
    except Exception, e:
        print 'Error generating output graphics'
        print e