Ejemplo n.º 1
0
def fit_emp_prior(
    id,
    param_type,
    fast_fit=False,
    generate_emp_priors=True,
    zero_re=True,
    alt_prior=False,
    global_heterogeneity="Slightly",
):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data

    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(dir)
        print "loaded data from new format from %s" % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        # model.save(dir)
        print "loaded data from json, saved in new format for next time in %s" % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith("x_"):
            model.input_data[col] = model.input_data[col].fillna(0.0)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if "heterogeneity" in model.parameters[t]:
            model.parameters[t]["heterogeneity"] = global_heterogeneity

    t = {
        "incidence": "i",
        "prevalence": "p",
        "remission": "r",
        "excess-mortality": "f",
        "prevalence_x_excess-mortality": "pf",
    }[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print "No data for type %s, exiting" % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    # model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    # missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    # model.input_data['effective_sample_size'][missing_ess] = 1.
    # model.input_data['z_overdisperse'] = 1.
    # print model.describe(t)
    # model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    # model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    # model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    # model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    # generate_emp_priors = False

    print "fitting", t
    model.vars += ism.age_specific_rate(
        model,
        t,
        reference_area="all",
        reference_sex="total",
        reference_year="all",
        mu_age=None,
        mu_age_parent=None,
        sigma_age_parent=None,
        rate_type=(t == "rr") and "log_normal" or "neg_binom",
        zero_re=zero_re,
    )
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model, t, iter=101, burn=0, thin=1, tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(
            model, t, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True
        )

    stats = dm.vars["p_pred"].stats(batches=5)
    dm.vars["data"]["mu_pred"] = stats["mean"]
    dm.vars["data"]["sigma_pred"] = stats["standard deviation"]

    stats = dm.vars["pi"].stats(batches=5)
    dm.vars["data"]["mc_error"] = stats["mc error"]

    dm.vars["data"]["residual"] = dm.vars["data"]["value"] - dm.vars["data"]["mu_pred"]
    dm.vars["data"]["abs_residual"] = pl.absolute(dm.vars["data"]["residual"])

    graphics.plot_fit(model, data_types=[t], ylab=["PY"], plot_config=(1, 1), fig_size=(8, 8))
    if generate_emp_priors:
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print "generating empirical prior for %s" % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and "level_bounds" in model.parameters[t]:
                        lower = model.parameters[t]["level_bounds"]["lower"]
                        upper = model.parameters[t]["level_bounds"]["upper"]
                    else:
                        lower = 0
                        upper = pl.inf

                    emp_priors = covariate_model.predict_for(
                        model,
                        model.parameters[t],
                        "all",
                        "total",
                        "all",
                        a,
                        dismod3.utils.clean(s),
                        int(y),
                        alt_prior,
                        vars,
                        lower,
                        upper,
                    )
                    dm.set_mcmc("emp_prior_mean", key, emp_priors.mean(0))

                    if "eta" in vars:
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose(
                            [pl.exp(vars["eta"].trace()) for _ in range(A)]
                        )  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors ** 2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc("emp_prior_std", key, emp_prior_std)

                    pl.plot(
                        model.parameters["ages"],
                        dm.get_mcmc("emp_prior_mean", key),
                        color="grey",
                        label=a,
                        zorder=-10,
                        alpha=0.5,
                    )
    pl.savefig(dir + "/prior-%s.png" % param_type)

    store_effect_coefficients(dm, vars, param_type)

    # graphics.plot_one_ppc(vars, t)
    # pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    graphics.plot_acorr(model)
    pl.savefig(dir + "/prior-%s-convergence.png" % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + "/prior-%s-trace.png" % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + "/prior-%s-effects.png" % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save("dm-%d-prior-%s.json" % (id, param_type))
    except IOError, e:
        print e
Ejemplo n.º 2
0
def fit_emp_prior(id,
                  param_type,
                  fast_fit=False,
                  generate_emp_priors=True,
                  zero_re=True,
                  alt_prior=False,
                  global_heterogeneity='Slightly'):
    """ Fit empirical prior of specified type for specified model

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    param_type : str, one of incidence, prevalence, remission, excess-mortality, prevalence_x_excess-mortality
      The disease parameter to generate empirical priors for

    Example
    -------
    >>> import fit_emp_prior
    >>> fit_emp_prior.fit_emp_prior(2552, 'incidence')
    """

    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    dm = dismod3.load_disease_model(id)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
    except (IOError, AssertionError):
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        #model.save(dir)
        print 'loaded data from json, saved in new format for next time in %s' % dir

    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    t = {
        'incidence': 'i',
        'prevalence': 'p',
        'remission': 'r',
        'excess-mortality': 'f',
        'prevalence_x_excess-mortality': 'pf'
    }[param_type]
    model.input_data = model.get_data(t)
    if len(model.input_data) == 0:
        print 'No data for type %s, exiting' % param_type
        return dm

    ### For testing:
    ## speed up computation by reducing number of knots
    ## model.parameters[t]['parameter_age_mesh'] = [0, 10, 20, 40, 60, 100]

    ## smooth Slightly, Moderately, or Very
    ## model.parameters[t]['smoothness'] = dict(age_start=0, age_end=100, amount='Very')

    ## speed up computation be reducing data size
    ## predict_area = 'super-region_0'
    ## predict_year=2005
    ## predict_sex='total'
    ## subtree = nx.traversal.bfs_tree(model.hierarchy, predict_area)
    ## relevant_rows = [i for i, r in model.input_data.T.iteritems() \
    ##                      if (r['area'] in subtree or r['area'] == 'all')\
    ##                      and (r['year_end'] >= 1997) \
    ##                      and r['sex'] in [predict_sex, 'total']]
    ## model.input_data = model.input_data.ix[relevant_rows]

    # testing changes
    #model.input_data['effective_sample_size'] = pl.minimum(1.e3, model.input_data['effective_sample_size'])
    #missing_ess = pl.isnan(model.input_data['effective_sample_size'])
    #model.input_data['effective_sample_size'][missing_ess] = 1.
    #model.input_data['z_overdisperse'] = 1.
    #print model.describe(t)
    #model.input_data = model.input_data[model.input_data['area'].map(lambda x: x in nx.bfs_tree(model.hierarchy, 'super-region_5'))]
    #model.input_data = model.input_data = model.input_data.drop(['x_LDI_id_Updated_7July2011'], axis=1)
    #model.input_data = model.input_data.filter([model.input_data['x_nottroponinuse'] == 0.]
    #model.input_data = model.input_data[:100]

    ## speed up output by not making predictions for empirical priors
    #generate_emp_priors = False

    print 'fitting', t
    model.vars += ism.age_specific_rate(model,
                                        t,
                                        reference_area='all',
                                        reference_sex='total',
                                        reference_year='all',
                                        mu_age=None,
                                        mu_age_parent=None,
                                        sigma_age_parent=None,
                                        rate_type=(t == 'rr') and 'log_normal'
                                        or 'neg_binom',
                                        zero_re=zero_re)
    # for backwards compatibility, should be removed eventually
    dm.model = model
    dm.vars = model.vars[t]
    vars = dm.vars

    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=101,
                                              burn=0,
                                              thin=1,
                                              tune_interval=100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_asr(model,
                                              t,
                                              iter=50000,
                                              burn=10000,
                                              thin=40,
                                              tune_interval=1000,
                                              verbose=True)

    stats = dm.vars['p_pred'].stats(batches=5)
    dm.vars['data']['mu_pred'] = stats['mean']
    dm.vars['data']['sigma_pred'] = stats['standard deviation']

    stats = dm.vars['pi'].stats(batches=5)
    dm.vars['data']['mc_error'] = stats['mc error']

    dm.vars['data'][
        'residual'] = dm.vars['data']['value'] - dm.vars['data']['mu_pred']
    dm.vars['data']['abs_residual'] = pl.absolute(dm.vars['data']['residual'])

    graphics.plot_fit(model,
                      data_types=[t],
                      ylab=['PY'],
                      plot_config=(1, 1),
                      fig_size=(8, 8))
    if generate_emp_priors:
        for a in [
                dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions
        ]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and 'level_bounds' in model.parameters[
                            t]:
                        lower = model.parameters[t]['level_bounds']['lower']
                        upper = model.parameters[t]['level_bounds']['upper']
                    else:
                        lower = 0
                        upper = pl.inf

                    emp_priors = covariate_model.predict_for(
                        model, model.parameters[t], 'all', 'total', 'all', a,
                        dismod3.utils.clean(s), int(y), alt_prior, vars, lower,
                        upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))

                    if 'eta' in vars:
                        N, A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([
                            pl.exp(vars['eta'].trace()) for _ in range(A)
                        ])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(
                            emp_priors.var(0) +
                            (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)

                    pl.plot(model.parameters['ages'],
                            dm.get_mcmc('emp_prior_mean', key),
                            color='grey',
                            label=a,
                            zorder=-10,
                            alpha=.5)
    pl.savefig(dir + '/prior-%s.png' % param_type)

    store_effect_coefficients(dm, vars, param_type)

    #graphics.plot_one_ppc(vars, t)
    #pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-%s-convergence.png' % param_type)
    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-%s-trace.png' % param_type)

    graphics.plot_one_effects(model, t)
    pl.savefig(dir + '/prior-%s-effects.png' % param_type)

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (id, param_type))
    except IOError, e:
        print e
Ejemplo n.º 3
0
def fit_world(id, fast_fit=False, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit consistent for all data in world

    Parameters
    ----------
    id : int
      The model id number for the job to fit

    Example
    -------
    >>> import fit_world
    >>> dm = fit_world.dismod3.load_disease_model(1234)
    >>> fit_world.fit_world(dm)
    """

    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
        dm = dismod3.load_disease_model(id)
    except (IOError, AssertionError):
        dm = dismod3.load_disease_model(id)
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        try:
            model.save(dir)
            print 'loaded data from json, saved in new format for next time in %s' % dir
        except IOError:
            print 'loaded data from json, failed to save in new format'


    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    ### For testing:
    ## speed up computation by reducing number of knots
    ## for t in 'irf':
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 100]
    model.vars += dismod3.ism.consistent(model,
                                         reference_area='all',
                                         reference_sex='total',
                                         reference_year='all',
                                         priors={},
                                         zero_re=zero_re)

    ## fit model to data
    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True)

    dm.model = model

    # borrow strength to inform sigma_alpha between rate types post-hoc
    types_with_re = ['rr', 'f', 'i', 'm', 'smr', 'p', 'r', 'pf', 'm_with', 'X']
    ## first calculate sigma_alpha_bar from posterior draws from each alpha
    alpha_vals = []
    for type in types_with_re:
        if 'alpha' in model.vars[type]:
            for alpha_i in model.vars[type]['alpha']:
                alpha_vals += [a for a in alpha_i.trace() if a != 0]  # remove zeros because areas with no siblings are included for convenience but are pinned to zero
    ## then blend sigma_alpha_i and sigma_alpha_bar for each sigma_alpha_i
    if len(alpha_vals) > 0:
        sigma_alpha_bar = pl.std(alpha_vals)
        for type in types_with_re:
            if 'sigma_alpha' in model.vars[type]:
                for sigma_alpha_i in model.vars[type]['sigma_alpha']:
                    cur_val = sigma_alpha_i.trace()
                    sigma_alpha_i.trace._trace[0] = (cur_val + sigma_alpha_bar) * pl.ones_like(sigma_alpha_i.trace._trace[0])


    for t in 'p i r f rr pf m_with'.split():
        param_type = dict(i='incidence', r='remission', f='excess-mortality', p='prevalence', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')[t]
        #graphics.plot_one_type(model, model.vars[t], {}, t)
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower=model.parameters[t]['level_bounds']['lower']
                        upper=model.parameters[t]['level_bounds']['upper']
                    else:
                        lower=0
                        upper=pl.inf
                        
                    emp_priors = covariate_model.predict_for(model,
                                                             model.parameters.get(t, {}),
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior,
                                                             model.vars[t], lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))
                    if 'eta' in model.vars[t]:
                        N,A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(model.vars[t]['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)


        from fit_emp_prior import store_effect_coefficients
        store_effect_coefficients(dm, model.vars[t], param_type)

    
        if 'p_pred' in model.vars[t]:
            graphics.plot_one_ppc(model, t)
            pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

        if 'p_pred' in model.vars[t] or 'lb' in model.vars[t]:
            graphics.plot_one_effects(model, t)
            pl.savefig(dir + '/prior-%s-effects.png'%param_type)


    for t in 'i r f p rr pf X m_with smr'.split():
        fname = dir + '/empirical_priors/data-%s.csv'%t
        print 'saving tables for', t, 'to', fname
        if 'data' in model.vars[t] and 'p_pred' in model.vars[t]:
            stats = model.vars[t]['p_pred'].stats(batches=5)
            model.vars[t]['data']['mu_pred'] = stats['mean']
            model.vars[t]['data']['sigma_pred'] = stats['standard deviation']

            stats = model.vars[t]['pi'].stats(batches=5)
            model.vars[t]['data']['mc_error'] = stats['mc error']

            model.vars[t]['data']['residual'] = model.vars[t]['data']['value'] - model.vars[t]['data']['mu_pred']
            model.vars[t]['data']['abs_residual'] = pl.absolute(model.vars[t]['data']['residual'])
            #if 'delta' in model.vars[t]:
            #    model.vars[t]['data']['logp'] = [mc.negative_binomial_like(n*p_obs, n*p_pred, n*p_pred*d) for n, p_obs, p_pred, d \
            #                                  in zip(model.vars[t]['data']['effective_sample_size'],
            #                                         model.vars[t]['data']['value'],
            #                                         model.vars[t]['data']['mu_pred'],
            #                                         pl.atleast_1d(model.vars[t]['delta'].stats()['mean']))]
            model.vars[t]['data'].to_csv(fname)


    graphics.plot_fit(model)
    pl.savefig(dir + '/prior.png')

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-convergence.png')

    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-trace.png')
    
    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (dm.id, 'all'))
    except IOError, e:
        print e