Ejemplo n.º 1
0
def predict(type, dm, d):
    """ Estimate the rate over the age interval of a single data row

    Parameters
    ----------
    type : str
      name of the stored estimate to read; passed unchanged to
      ``dm.get_mcmc`` and used in the name of the output entry of ``d``

    dm : object providing ``get_mcmc(type, key)``
      presumably a dismod3 disease model -- TODO confirm against callers

    d : dict
      one data row.  After key cleaning it must contain 'parameter',
      'region', 'year_start', 'sex', 'age_start', 'age_end' and
      'country_iso3_code'.  It is modified in place: a cleaned copy of
      every key is added, and on success the estimate is stored under
      ``'estimate %s' % type``.

    Returns
    -------
    est
      the value returned by ``dismod3.utils.rate_for_range``, or -99
      when ``dm.get_mcmc`` returns an empty estimate (in which case no
      estimate entry is added to ``d``)
    """
    # snapshot the keys first, because the loop inserts new keys into d
    for k in list(d.keys()):
        d[dismod3.utils.clean(k)] = d[k]

    t = d['parameter'].replace(' data', '').replace(' ', '-')
    r = d['region']
    y = int(d['year_start'])
    s = d['sex']
    key = dismod3.gbd_key_for(t, r, y, s)

    a0 = int(d['age_start'])
    a1 = int(d['age_end'])
    est_by_age = dm.get_mcmc(type, key)

    if len(est_by_age) == 0:
        return -99

    ages = range(a0, a1 + 1)
    c = d['country_iso3_code']

    if t == 'incidence_x_duration':
        # unit weight for every age (weights are not normalized here)
        pop = 1. * np.ones_like(ages)
    else:
        # build a float array explicitly, so the normalization below is a
        # true division whatever numeric type the population table holds
        pop = np.array([population_by_age[(c, str(y), s)][a] for a in ages],
                       dtype=float)
        pop /= np.sum(pop)  # normalize the pop weights to sum to 1

    est = dismod3.utils.rate_for_range(est_by_age, ages, pop)
    d['estimate %s' % type] = est

    return est
Ejemplo n.º 2
0
def predict(type, dm, d):
    """ Estimate the rate over the age interval of a single data row

    Parameters
    ----------
    type : str
      name of the stored estimate to read; passed unchanged to
      ``dm.get_mcmc`` and used in the name of the output entry of ``d``

    dm : object providing ``get_mcmc(type, key)``
      presumably a dismod3 disease model -- TODO confirm against callers

    d : dict
      one data row.  After key cleaning it must contain 'parameter',
      'region', 'year_start', 'sex', 'age_start', 'age_end' and
      'country_iso3_code'.  It is modified in place: a cleaned copy of
      every key is added, and on success the estimate is stored under
      ``'estimate %s' % type``.

    Returns
    -------
    est
      the value returned by ``dismod3.utils.rate_for_range``, or -99
      when ``dm.get_mcmc`` returns an empty estimate (in which case no
      estimate entry is added to ``d``)
    """
    # snapshot the keys first, because the loop inserts new keys into d
    for k in list(d.keys()):
        d[dismod3.utils.clean(k)] = d[k]

    t = d['parameter'].replace(' data', '').replace(' ', '-')
    r = d['region']
    y = int(d['year_start'])
    s = d['sex']
    key = dismod3.gbd_key_for(t, r, y, s)

    a0 = int(d['age_start'])
    a1 = int(d['age_end'])
    est_by_age = dm.get_mcmc(type, key)

    if len(est_by_age) == 0:
        return -99

    ages = range(a0, a1 + 1)
    c = d['country_iso3_code']

    if t == 'incidence_x_duration':
        # unit weight for every age (weights are not normalized here)
        pop = 1. * np.ones_like(ages)
    else:
        # build a float array explicitly, so the normalization below is a
        # true division whatever numeric type the population table holds
        pop = np.array([population_by_age[(c, str(y), s)][a] for a in ages],
                       dtype=float)
        pop /= np.sum(pop)  # normalize the pop weights to sum to 1

    est = dismod3.utils.rate_for_range(est_by_age, ages, pop)
    d['estimate %s' % type] = est

    return est
Ejemplo n.º 3
0
def fit_emp_prior(dm, param_type, prior_str=None):
    """ Generate an empirical prior distribution for a single disease parameter

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, (hyper)-priors, and additional
      information (like input and output age-mesh).

    param_type : str, one of 'incidence', 'prevalence', 'remission', 'excess-mortality'
      The disease parameter to work with

    prior_str : str, optional
      The (hyper)-prior for this disease parameter; see
      utils.generate_prior_potentials for format

    Notes
    -----
    The results of this fit are stored in the disease model's params
    hash for use when fitting multiple parameter types together.

    The same fitted rate and the same symmetric interval
    (mu +/- 1.96 * se) are stored for every region/year/sex key.

    Example
    -------
    >>> import dismod3
    >>> import dismod3.beta_binomial_model as model
    >>> dm = dismod3.get_disease_model(1)
    >>> model.fit_emp_prior(dm, 'incidence', 'zero 0 4, smooth 25')
    >>> assert 'emp_prior' in dm.params
    >>> assert 'incidence' in dm.params['emp_prior']
    >>> dismod3.post_disease_model(dm)
    """
    if prior_str:
        dm.set_priors(param_type, prior_str)

    # remove the old PyMC model, if it exists
    for attr in ("vars", "map"):
        if hasattr(dm, attr):
            delattr(dm, attr)
    dm.set_empirical_prior(param_type, {})

    # fit the model
    fit(dm, method="map", param_type=param_type)

    # save the results in the param_hash
    mu = dm.vars["rate_stoch"].value
    dispersion = dm.vars["dispersion"].value
    se = mu * (1 - mu) * np.sqrt(dispersion)
    dm.set_empirical_prior(
        param_type, {"mu": list(mu), "se": list(se), "dispersion": float(dispersion)}
    )

    for r in dismod3.gbd_regions:
        for y in dismod3.gbd_years:
            for s in dismod3.gbd_sexes:
                key = dismod3.gbd_key_for(param_type, r, y, s)
                dm.set_map(key, mu)
                # symmetric interval: the same 1.96 multiplier on both sides
                dm.set_mcmc("lower_ui", key, mu - 1.96 * se)
                dm.set_mcmc("upper_ui", key, mu + 1.96 * se)
Ejemplo n.º 4
0
Archivo: models.py Proyecto: jjdu/gbd
    def to_djson(self, region='*'):
        """ Return a dismod_dataset json corresponding to this model object

        See ``dismod_data_json.html`` for details.

        region : str
          which regions to load parameters for.  '*' loads all params;
          any other value is passed to a ``region__contains`` filter
          (a substring match in Django, not a regex -- TODO confirm the
          intended semantics), and params with an empty region are
          then added as well.

        Returns a ``dismod3.disease_json.DiseaseJson`` built from the
        params, the data rows and the id of this model.

        Example
        -------
        >> dm = DiseaseModel.objects.get(id=1)
        >> dm.to_djson(region='none')
        """
        param_dict = {}

        if region != '*':
            param_filter = self.params.filter(region__contains=region)
        else:
            param_filter = self.params.all()

        for p in param_filter:
            if p.type and p.region and p.sex and p.year:
                # params specific to a type/region/year/sex are nested one
                # level down, under their gbd key
                gbd_key = dismod3.gbd_key_for(p.type, p.region, p.year, p.sex)
                param_dict.setdefault(p.key, {})[gbd_key] = json.loads(p.json)
            else:
                try:
                    param_dict[p.key] = json.loads(p.json)
                except ValueError:
                    # skip bad json, it sometimes happens, for unknown reasons (HTTP glitches?)
                    pass

        # include params for all regions as well, if params were filtered above
        if region != '*':
            for p in self.params.filter(region=''):
                # never overwrite a param that was loaded above
                if p.key in param_dict:
                    continue
                try:
                    param_dict[p.key] = json.loads(p.json)
                except ValueError:
                    # skip bad json, it sometimes happens, for unknown reasons (HTTP glitches?)
                    pass

        param_dict.update(id=self.id,
                          condition=self.condition,
                          sex=self.sex,
                          region=self.region,
                          year=self.year)

        from dismod3.disease_json import DiseaseJson
        dj = DiseaseJson(json.dumps({'params': param_dict,
                                     'data': [d.params for d in self.data.all()],
                                     'id': self.id}))
        #if region != 'none':
        #    dj.merge_posteriors(region)

        return dj
Ejemplo n.º 5
0
def setup(dm, keys):
    """ Build the PyMC variables of the generic disease model for every
    region/year/sex combination that was requested.

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)

    keys : container of str
      a region/year/sex combination is included only when its
      prevalence key (as built by ``dismod3.gbd_key_for``) is in ``keys``

    Results
    -------
    vars : dict of PyMC stochs
      the union of the dictionaries returned by ``submodel.setup`` for
      each included region/year/sex combination
    """
    rate_types = ('incidence', 'remission', 'excess-mortality')
    all_vars = {}

    for region in dismod3.gbd_regions:
        for year in dismod3.gbd_years:
            for sex in dismod3.gbd_sexes:
                # key template with a '%s' placeholder for the parameter type
                key_template = dismod3.gbd_key_for('%s', region, year, sex)
                if (key_template % 'prevalence') not in keys:
                    continue

                dm.set_units(key_template % 'prevalence', '(per person)')
                dm.set_units(key_template % 'duration', '(years)')
                for rate_type in rate_types:
                    dm.set_units(key_template % rate_type, '(per person-year)')

                # all data rows relevant to this region/year/sex
                relevant_data = [row for row in dm.data
                                 if relevant_to(row, 'all', region, year, sex)]
                all_vars.update(submodel.setup(dm, key_template, relevant_data))

    return all_vars
Ejemplo n.º 6
0
def fit_emp_prior(dm, param_type, iter=30000, thin=20, burn=10000, dbname='/dev/null'):
    """ Generate an empirical prior distribution for a single disease parameter

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, (hyper)-priors, and additional
      information (like input and output age-mesh).

    param_type : str, one of 'incidence', 'prevalence', 'remission', 'excess-mortality'
      The disease parameter to work with

    iter, thin, burn : int, optional
      passed unchanged to the ``sample`` method of the PyMC MCMC object

    dbname : str, optional
      passed to ``mc.MCMC`` together with ``db='pickle'``

    Notes
    -----
    The results of this fit are stored in the disease model's params
    hash for use when fitting multiple parameter types together.

    Nothing is fitted or stored when the model set up for this
    parameter type contains no data.

    Example
    -------
    $ python2.5 gbd_fit.py 231 -t incidence
    """

    data = [d for d in dm.data if clean(d['data_type']).find(param_type) != -1 and d.get('ignore') != -1]
    dm.calc_effective_sample_size(data)

    lower_bound_data = []
    if param_type == 'excess-mortality':
        lower_bound_data = [d for d in dm.data if d['data_type'] == 'cause-specific mortality data']
        dm.calc_effective_sample_size(lower_bound_data)

    dm.clear_empirical_prior()
    dm.fit_initial_estimate(param_type, data)

    dm.vars = setup(dm, param_type, data, lower_bound_data=lower_bound_data)

    # don't do anything if there is no data for this parameter type
    if len(dm.vars['data']) == 0:
        return

    debug('i: %s' % ', '.join(['%.2f' % x for x in dm.vars['rate_stoch'].value[::10]]))
    sys.stdout.flush()

    # find initial values for MCMC with a MAP fit
    log_dispersion = dm.vars.pop('log_dispersion')  # remove the dispersion term while finding initial values for MCMC
    dm.map = mc.MAP(dm.vars)
    dm.vars.update(log_dispersion=log_dispersion)

    try:
        dm.map.fit(method='fmin_powell', iterlim=500, verbose=1)
    except KeyboardInterrupt:
        debug('User halted optimization routine before optimal value found')
    sys.stdout.flush()

    # make pymc warnings go to stdout
    mc.warnings.warn = sys.stdout.write
    dm.mcmc = mc.MCMC(dm.vars, db='pickle', dbname=dbname)
    dm.mcmc.use_step_method(mc.Metropolis, dm.vars['log_dispersion'],
                            proposal_sd=dm.vars['dispersion_step_sd'])
    dm.mcmc.use_step_method(mc.AdaptiveMetropolis, dm.vars['age_coeffs_mesh'],
                            cov=dm.vars['age_coeffs_mesh_step_cov'], verbose=0)
    dm.mcmc.sample(iter=iter, burn=burn, thin=thin, verbose=1)
    dm.mcmc.db.commit()

    # summarize each trace once and reuse the summaries below
    region_stats = dm.vars['region_coeffs'].stats()
    study_stats = dm.vars['study_coeffs'].stats()
    mesh_stats = dm.vars['age_coeffs_mesh'].stats()
    age_stats = dm.vars['age_coeffs'].stats()
    dispersion_stats = dm.vars['dispersion'].stats()

    alpha = region_stats['mean']
    beta = study_stats['mean']
    gamma_mesh = mesh_stats['mean']

    # set the stochs to their posterior means
    dm.vars['region_coeffs'].value = alpha
    dm.vars['study_coeffs'].value = beta
    dm.vars['age_coeffs_mesh'].value = gamma_mesh
    dm.vars['log_dispersion'].value = dm.vars['log_dispersion'].stats()['mean']

    debug('a: %s' % ', '.join(['%.2f' % x for x in alpha]))
    debug('b: %s' % ', '.join(['%.2f' % x for x in beta]))
    debug('g: %s' % ', '.join(['%.2f' % x for x in gamma_mesh]))
    debug('d: %.2f' % dispersion_stats['mean'])
    debug('m: %s' % ', '.join(['%.2f' % x for x in dm.vars['rate_stoch'].stats()['mean'][::10]]))
    covariates_dict = dm.get_covariates()
    X = covariates(data[0], covariates_dict)
    debug('p: %s' % ', '.join(['%.2f' % x for x in predict_rate(X, alpha, beta, gamma_mesh, dm.vars['bounds_func'], dm.get_param_age_mesh())]))

    # save the results in the param_hash
    prior_vals = dict(
        alpha=list(alpha),
        beta=list(beta),
        gamma=list(age_stats['mean']),
        delta=float(dispersion_stats['mean']))

    prior_vals.update(
        sigma_alpha=list(region_stats['standard deviation']),
        sigma_beta=list(study_stats['standard deviation']),
        sigma_gamma=list(age_stats['standard deviation']),
        sigma_delta=float(dispersion_stats['standard deviation']))
    # save the goodness-of-fit statistics for the empirical prior
    prior_vals.update(
        aic=dm.map.AIC,
        bic=dm.map.BIC,
        dic=dm.mcmc.dic()
        )
    dm.set_empirical_prior(param_type, prior_vals)

    median_sample_size = np.median([values_from(dm, d)[3] for d in dm.vars['data']] + [1000])
    debug('median effective sample size: %.1f' % median_sample_size)

    param_mesh = dm.get_param_age_mesh()
    age_mesh = dm.get_estimate_age_mesh()

    # every 5th joint draw of the coefficients; list() so that the result
    # can be sliced and iterated repeatedly
    trace = list(zip(dm.vars['region_coeffs'].trace(),
                     dm.vars['study_coeffs'].trace(),
                     dm.vars['age_coeffs'].trace()))[::5]

    for r in dismod3.gbd_regions:
        print('predicting rates for %s' % r)
        for y in dismod3.gbd_years:
            for s in dismod3.gbd_sexes:
                key = dismod3.gbd_key_for(param_type, r, y, s)
                rate_trace = []
                for a, b, g in trace:
                    rate_trace.append(predict_region_rate(key,
                                                          alpha=a,
                                                          beta=b,
                                                          gamma=g,
                                                          covariates_dict=covariates_dict,
                                                          bounds_func=dm.vars['bounds_func'],
                                                          ages=age_mesh))
                mu = dismod3.utils.interpolate(param_mesh, np.mean(rate_trace, axis=0)[param_mesh], age_mesh)
                dm.set_initial_value(key, mu)
                dm.set_mcmc('emp_prior_mean', key, mu)

                # similar to saving upper_ui and lower_ui in function store_mcmc_fit below
                rate_trace = np.sort(rate_trace, axis=0)
                # row indices must be integers; int() truncates toward zero
                upper_row = int(.975 * len(rate_trace))
                lower_row = int(.025 * len(rate_trace))
                dm.set_mcmc('emp_prior_upper_ui', key, dismod3.utils.interpolate(param_mesh, rate_trace[upper_row, :][param_mesh], age_mesh))
                dm.set_mcmc('emp_prior_lower_ui', key, dismod3.utils.interpolate(param_mesh, rate_trace[lower_row, :][param_mesh], age_mesh))
Ejemplo n.º 7
0
def _write_tsv(path, col_names, rows):
    """ Write rows to path as a tab-separated file with a header row;
    each row is converted with data_dict_for_csv first
    """
    with open(path, 'w') as f_file:
        csv_f = csv.writer(f_file, dialect='excel-tab')
        csv_f.writerow(col_names)
        for d in rows:
            dd = data_dict_for_csv(d)
            csv_f.writerow([dd[c] for c in col_names])


def generate_disease_data(condition, cov):
    """ Generate csv files with gold-standard disease data,
    and somewhat good, somewhat dense disease data, as might be expected from a
    condition that is carefully studied in the literature

    Parameters
    ----------
    condition : str
      name of the condition; stored in every generated row and used as
      the prefix of the output file names

    cov : not used by this function (the 'cov' entry of the noisy data
      rows is hard-coded to 0.)

    Notes
    -----
    Writes ``<condition>_gold.tsv`` and ``<condition>_data.tsv`` under
    OUTPUT_PATH, then submits the contents of the second file through
    the dismod server upload form.  A failure of the final submit is
    printed, not raised.
    """
    # imported once here, rather than on every pass of the loops below
    import scipy.linalg
    from dismod3 import NEARLY_ZERO
    from dismod3.utils import trim

    age_len = dismod3.MAX_AGE
    ages = np.arange(age_len, dtype='float')

    # incidence rate
    i0 = .005 + .02 * mc.invlogit((ages - 44) / 3)

    # remission rate
    r = .1 * np.ones_like(ages)

    # standardized mortality ratio, used to derive the excess-mortality rate
    SMR = 3. * np.ones_like(ages) - ages / age_len

    # all-cause mortality-rate
    mort = dismod3.get_disease_model('all-cause_mortality')

    age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)]

    # TODO:  take age structure from real data
    # NOTE(review): neither dict is used below; they are kept because
    # random.sample advances the state of the random module -- confirm
    # whether they can be dropped
    sparse_intervals = dict(
        [[region,
          random.sample(age_intervals,
                        (ii**3 * len(age_intervals)) // len(countries_for)**3 // 1)]
         for ii, region in enumerate(countries_for)])
    dense_intervals = dict(
        [[region, random.sample(age_intervals, len(age_intervals) // 2)]
         for region in countries_for])

    gold_data = []
    noisy_data = []

    for region in sorted(countries_for):
        if region == 'world':
            continue

        print(region)
        sys.stdout.flush()

        # no unexplained regional variation: same incidence in every region
        i = i0

        for year in [1990, 2005]:
            for sex in ['male', 'female']:

                param_type = 'all-cause_mortality'
                key = dismod3.gbd_key_for(param_type, region, year, sex)
                m_all_cause = mort.mortality(key, mort.data)

                # calculate excess-mortality rate from smr
                f = (SMR - 1.) * m_all_cause

                ## compartmental model (bins S, C, D, M)
                SCDM = np.zeros([4, age_len])
                p = np.zeros(age_len)
                m = np.zeros(age_len)

                # everybody starts in bin S
                SCDM[0, 0] = 1.
                SCDM[1, 0] = 0.
                SCDM[2, 0] = 0.
                SCDM[3, 0] = 0.

                p[0] = SCDM[1, 0] / (SCDM[0, 0] + SCDM[1, 0] + NEARLY_ZERO)
                m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO,
                            1 - NEARLY_ZERO)

                for a in range(age_len - 1):
                    # rate matrix for the step from age a to age a+1
                    A = [[-i[a]-m[a],  r[a]          , 0., 0.],
                         [ i[a]     , -r[a]-m[a]-f[a], 0., 0.],
                         [      m[a],       m[a]     , 0., 0.],
                         [        0.,            f[a], 0., 0.]]

                    SCDM[:, a + 1] = np.dot(scipy.linalg.expm(A), SCDM[:, a])

                    p[a + 1] = SCDM[1, a + 1] / (SCDM[0, a + 1] +
                                                 SCDM[1, a + 1] + NEARLY_ZERO)
                    m[a + 1] = m_all_cause[a + 1] - f[a + 1] * p[a + 1]

                # duration = E[time in bin C], filled in from the oldest age down
                hazard = r + m + f
                pr_not_exit = np.exp(-hazard)
                X = np.empty(len(hazard))
                X[-1] = 1 / hazard[-1]
                for a in reversed(range(len(X) - 1)):
                    X[a] = (pr_not_exit[a] * (X[a + 1] + 1)) + (
                        1 / hazard[a] * (1 - pr_not_exit[a]) - pr_not_exit[a])

                country = countries_for[region][0]
                params = dict(age_intervals=age_intervals,
                              condition=condition,
                              gbd_region=region,
                              country=country,
                              year=year,
                              sex=sex,
                              effective_sample_size=1000)

                # gold data: a single all-ages interval
                params['age_intervals'] = [[0, 99]]
                generate_and_append_data(gold_data, 'prevalence data', p, **params)
                generate_and_append_data(gold_data, 'incidence data', i, **params)
                generate_and_append_data(gold_data, 'excess-mortality data', f, **params)
                generate_and_append_data(gold_data, 'remission data', r, **params)
                generate_and_append_data(gold_data, 'duration data', X, **params)

                # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum
                params['age_intervals'] = [[0, 99]]
                iX = i * X * (1 - p) * regional_population(key)
                generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params)

                # noisy data: one interval per single year of age
                params['effective_sample_size'] = 1000
                params['cov'] = 0.
                params['age_intervals'] = age_intervals
                generate_and_append_data(noisy_data, 'prevalence data', p, **params)
                generate_and_append_data(noisy_data, 'excess-mortality data', f, **params)
                generate_and_append_data(noisy_data, 'remission data', r, **params)
                generate_and_append_data(noisy_data, 'incidence data', i, **params)

    col_names = sorted(data_dict_for_csv(gold_data[0]).keys())

    _write_tsv(OUTPUT_PATH + '%s_gold.tsv' % condition, col_names, gold_data)

    f_name = OUTPUT_PATH + '%s_data.tsv' % condition
    _write_tsv(f_name, col_names, noisy_data)

    # upload data file
    from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL
    dismod_server_login()
    twc.go(DISMOD_BASE_URL + 'dismod/data/upload/')
    with open(f_name) as f_file:
        twc.formvalue(1, 'tab_separated_values', f_file.read())

    # TODO: find or set the model number for this model, set the
    # expert priors and covariates, merge the covariate data into the
    # model, and add the "ground truth" to the disease json

    try:
        twc.submit()
    except Exception as e:
        # best effort: report the failed upload and carry on
        print(e)
Ejemplo n.º 8
0
    r = 'asia_southeast'
    for year in [1990]:
        for sex in ['male']:
            for dm3_type, dm4_type in [['remission', 'remission'],
                                       ['excess-mortality', 'excess'],
                                       ['incidence', 'incidence'],
                                       ['mrr', 'risk'],
                                       ['prevalence', 'prevalence'],
                                       ]:
                x = [0]
                y = [0]
                for age in age_mesh:
                    x.append(age)
                    y.append(measure_out.model[index_dict[(dm4_type, year, age)]])

                key = dismod3.gbd_key_for(dm3_type, r, year, sex)
                est = dismod3.utils.interpolate(x, y, dm.get_estimate_age_mesh())
                dm.set_truth(key, est)

                dismod3.tile_plot_disease_model(dm, [key], defaults={})
                try:
                    pl.savefig(dismod3.settings.JOB_WORKING_DIR % id + '/dm-%d-posterior-%s-%s-%s.png' % (id, dm3_type, sex, year))   # TODO: refactor naming into its own function
                except IOError, e:
                    print 'Warning: could not create png.  Maybe it exists already?\n%s' % e

    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    dismod3.try_posting_disease_model(dm, ntries=5)

    print
    print '********************'
    print 'computation complete'
Ejemplo n.º 9
0
def fit_emp_prior(dm, param_type):
    """ Fit an empirical prior for one disease parameter and store it in dm

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      The object containing all the data, (hyper)-priors, and additional
      information (like input and output age-mesh).

    param_type : str, one of 'incidence', 'prevalence', 'remission', 'excess-mortality'
      The disease parameter to work with

    Notes
    -----
    The fitted coefficients are stored in the disease model's params
    hash for use when fitting multiple parameter types together, and a
    predicted mean with lower and upper bounds is stored for every
    region/year/sex key and for the ('world', 1997, 'total') key.

    Returns without doing anything when dm has no data whose cleaned
    data type contains param_type.

    Example
    -------
    $ python2.5 gbd_fit.py 175 -t incidence -p 'zero 0 4, zero 41 100, smooth 25' # takes 7m to run
    """
    relevant_data = [row for row in dm.data if param_type in clean(row['data_type'])]

    # nothing to fit without data for this parameter type
    if not relevant_data:
        return

    dm.fit_initial_estimate(param_type, relevant_data)
    dm.vars = setup(dm, param_type, relevant_data)

    # find the maximum a posteriori point; the user may interrupt the search
    dm.map = mc.MAP(dm.vars)
    try:
        dm.map.fit(method='fmin_powell', iterlim=500, tol=.00001, verbose=1)
    except KeyboardInterrupt:
        print('User halted optimization routine before optimal value found')

    # replace any previous empirical prior with the fitted values
    dm.clear_empirical_prior()
    prior_vals = dict(
        alpha=list(dm.vars['region_coeffs'].value),
        beta=list(dm.vars['study_coeffs'].value),
        gamma=list(dm.vars['age_coeffs'].value),
        sigma=float(dm.vars['dispersion'].value))
    dm.set_empirical_prior(param_type, prior_vals)

    dispersion = prior_vals['sigma']

    def store_prediction(key):
        # predicted rate for key, with bounds 1.96*dispersion either side
        # of the prediction in logit space
        logit_mu = predict_logit_rate(regional_covariates(key), **prior_vals)
        mu = mc.invlogit(logit_mu)
        dm.set_initial_value(key, mu)
        dm.set_mcmc('emp_prior_mean', key, mu)
        dm.set_mcmc('emp_prior_lower_ui', key, mc.invlogit(logit_mu - 1.96*dispersion))
        dm.set_mcmc('emp_prior_upper_ui', key, mc.invlogit(logit_mu + 1.96*dispersion))

    for region in dismod3.gbd_regions:
        for year in dismod3.gbd_years:
            for sex in dismod3.gbd_sexes:
                store_prediction(dismod3.gbd_key_for(param_type, region, year, sex))

    store_prediction(dismod3.gbd_key_for(param_type, 'world', 1997, 'total'))
Ejemplo n.º 10
0
def _write_tsv(path, col_names, rows):
    """ Write rows to path as a tab-separated file with a header row;
    each row is converted with data_dict_for_csv first
    """
    with open(path, 'w') as f_file:
        csv_f = csv.writer(f_file, dialect='excel-tab')
        csv_f.writerow(col_names)
        for d in rows:
            dd = data_dict_for_csv(d)
            csv_f.writerow([dd[c] for c in col_names])


def generate_disease_data(condition, cov):
    """ Generate csv files with gold-standard disease data,
    and somewhat good, somewhat dense disease data, as might be expected from a
    condition that is carefully studied in the literature

    Parameters
    ----------
    condition : str
      name of the condition; stored in every generated row and used as
      the prefix of the output file names

    cov : not used by this function (the 'cov' entry of the noisy data
      rows is hard-coded to 0.)

    Notes
    -----
    Writes ``<condition>_gold.tsv`` and ``<condition>_data.tsv`` under
    OUTPUT_PATH, then submits the contents of the second file through
    the dismod server upload form.  A failure of the final submit is
    printed, not raised.
    """
    # imported once here, rather than on every pass of the loops below
    import scipy.linalg
    from dismod3 import NEARLY_ZERO
    from dismod3.utils import trim

    age_len = dismod3.MAX_AGE
    ages = np.arange(age_len, dtype='float')

    # incidence rate
    i0 = .005 + .02 * mc.invlogit((ages - 44) / 3)

    # remission rate
    r = .1 * np.ones_like(ages)

    # standardized mortality ratio, used to derive the excess-mortality rate
    SMR = 3. * np.ones_like(ages) - ages / age_len

    # all-cause mortality-rate
    mort = dismod3.get_disease_model('all-cause_mortality')

    age_intervals = [[a, a] for a in range(0, dismod3.MAX_AGE, 1)]

    # TODO:  take age structure from real data
    # NOTE(review): neither dict is used below; they are kept because
    # random.sample advances the state of the random module -- confirm
    # whether they can be dropped
    sparse_intervals = dict(
        [[region,
          random.sample(age_intervals,
                        (ii**3 * len(age_intervals)) // len(countries_for)**3 // 1)]
         for ii, region in enumerate(countries_for)])
    dense_intervals = dict(
        [[region, random.sample(age_intervals, len(age_intervals) // 2)]
         for region in countries_for])

    gold_data = []
    noisy_data = []

    for region in sorted(countries_for):
        if region == 'world':
            continue

        print(region)
        sys.stdout.flush()

        # no unexplained regional variation: same incidence in every region
        i = i0

        for year in [1990, 2005]:
            for sex in ['male', 'female']:

                param_type = 'all-cause_mortality'
                key = dismod3.gbd_key_for(param_type, region, year, sex)
                m_all_cause = mort.mortality(key, mort.data)

                # calculate excess-mortality rate from smr
                f = (SMR - 1.) * m_all_cause

                ## compartmental model (bins S, C, D, M)
                SCDM = np.zeros([4, age_len])
                p = np.zeros(age_len)
                m = np.zeros(age_len)

                # everybody starts in bin S
                SCDM[0, 0] = 1.
                SCDM[1, 0] = 0.
                SCDM[2, 0] = 0.
                SCDM[3, 0] = 0.

                p[0] = SCDM[1, 0] / (SCDM[0, 0] + SCDM[1, 0] + NEARLY_ZERO)
                m[0] = trim(m_all_cause[0] - f[0] * p[0], NEARLY_ZERO,
                            1 - NEARLY_ZERO)

                for a in range(age_len - 1):
                    # rate matrix for the step from age a to age a+1
                    A = [[-i[a]-m[a],  r[a]          , 0., 0.],
                         [ i[a]     , -r[a]-m[a]-f[a], 0., 0.],
                         [      m[a],       m[a]     , 0., 0.],
                         [        0.,            f[a], 0., 0.]]

                    SCDM[:, a + 1] = np.dot(scipy.linalg.expm(A), SCDM[:, a])

                    p[a + 1] = SCDM[1, a + 1] / (SCDM[0, a + 1] +
                                                 SCDM[1, a + 1] + NEARLY_ZERO)
                    m[a + 1] = m_all_cause[a + 1] - f[a + 1] * p[a + 1]

                # duration = E[time in bin C], filled in from the oldest age down
                hazard = r + m + f
                pr_not_exit = np.exp(-hazard)
                X = np.empty(len(hazard))
                X[-1] = 1 / hazard[-1]
                for a in reversed(range(len(X) - 1)):
                    X[a] = (pr_not_exit[a] * (X[a + 1] + 1)) + (
                        1 / hazard[a] * (1 - pr_not_exit[a]) - pr_not_exit[a])

                country = countries_for[region][0]
                params = dict(age_intervals=age_intervals,
                              condition=condition,
                              gbd_region=region,
                              country=country,
                              year=year,
                              sex=sex,
                              effective_sample_size=1000)

                # gold data: a single all-ages interval
                params['age_intervals'] = [[0, 99]]
                generate_and_append_data(gold_data, 'prevalence data', p, **params)
                generate_and_append_data(gold_data, 'incidence data', i, **params)
                generate_and_append_data(gold_data, 'excess-mortality data', f, **params)
                generate_and_append_data(gold_data, 'remission data', r, **params)
                generate_and_append_data(gold_data, 'duration data', X, **params)

                # TODO: use this approach to age standardize all gold data, and then change it to get iX as a direct sum
                params['age_intervals'] = [[0, 99]]
                iX = i * X * (1 - p) * regional_population(key)
                generate_and_append_data(gold_data, 'incidence_x_duration', iX, **params)

                # noisy data: one interval per single year of age
                params['effective_sample_size'] = 1000
                params['cov'] = 0.
                params['age_intervals'] = age_intervals
                generate_and_append_data(noisy_data, 'prevalence data', p, **params)
                generate_and_append_data(noisy_data, 'excess-mortality data', f, **params)
                generate_and_append_data(noisy_data, 'remission data', r, **params)
                generate_and_append_data(noisy_data, 'incidence data', i, **params)

    col_names = sorted(data_dict_for_csv(gold_data[0]).keys())

    _write_tsv(OUTPUT_PATH + '%s_gold.tsv' % condition, col_names, gold_data)

    f_name = OUTPUT_PATH + '%s_data.tsv' % condition
    _write_tsv(f_name, col_names, noisy_data)

    # upload data file
    from dismod3.disease_json import dismod_server_login, twc, DISMOD_BASE_URL
    dismod_server_login()
    twc.go(DISMOD_BASE_URL + 'dismod/data/upload/')
    with open(f_name) as f_file:
        twc.formvalue(1, 'tab_separated_values', f_file.read())

    # TODO: find or set the model number for this model, set the
    # expert priors and covariates, merge the covariate data into the
    # model, and add the "ground truth" to the disease json

    try:
        twc.submit()
    except Exception as e:
        # best effort: report the failed upload and carry on
        print(e)
Ejemplo n.º 11
0
    for r in prediction_regions:
        r = dismod3.utils.clean(r)
        for t in [1990, 2005]:
            x = []
            y = []
            yl = []
            yu = []
            for a in age_mesh:
                x.append(a)
                y.append(param_predicted_stats['mean'][index_dict[(r, t, a)]])
                yl.append(param_predicted_stats['95% HPD interval'][index_dict[(r, t, a)],0])
                yu.append(param_predicted_stats['95% HPD interval'][index_dict[(r, t, a)],1])

            print r, t, zip(x,y)

            key = dismod3.gbd_key_for(param_type, r, t, 'all')
            est = dismod3.utils.interpolate(x, y, dm.get_estimate_age_mesh())
            dm.set_mcmc('mean', key, est)

            est = dismod3.utils.interpolate(x, yl, dm.get_estimate_age_mesh())
            dm.set_mcmc('lower_ui', key, est)

            est = dismod3.utils.interpolate(x, yu, dm.get_estimate_age_mesh())
            dm.set_mcmc('upper_ui', key, est)

            dismod3.tile_plot_disease_model(dm, [key], defaults={})
            try:
                pl.savefig(dismod3.settings.JOB_WORKING_DIR % id + '/dm-%d-posterior-%s-%s-%s.png' % (id, dismod3.utils.clean(r), 'all', t))   # TODO: refactor naming into its own function
            except IOError, e:
                print 'Warning: could not create png.  Maybe it exists already?\n%s' % e
Ejemplo n.º 12
0
      0.00924804,  0.01004529,  0.01089158,  0.01178793,  0.01274115,
      0.0137633 ,  0.01487031,  0.01608018,  0.01740874,  0.01886325,
      0.02044349,  0.02214463,  0.02396039,  0.02589065,  0.0279525 ,
      0.03017836,  0.03261135,  0.03530052,  0.03828981,  0.04160153,
      0.04523777,  0.04918468,  0.05341633,  0.05790466,  0.06263516,
      0.06760523,  0.07281963,  0.07828758,  0.08401736,  0.09000903,
      0.09625542,  0.10274424,  0.10945923,  0.11638187,  0.1234935 ,
      0.13077522,  0.13820759,  0.14577067,  0.15344416,  0.16120755,
      0.16904026,  0.17692176,  0.18483165,  0.19274966,  0.20065553,
      0.20852876,  0.2163489 ,  0.22409584,  0.23174999,  0.23929245,
      0.2467051 ])

for region in dismod3.gbd_regions:
    for year in dismod3.gbd_years:
        for sex in dismod3.gbd_sexes:
            key = dismod3.gbd_key_for('%s', region, year, sex)

            if clean(region) == 'north_america_high_income':
                regional_offset = 0.
            else:
                regional_offset = -.5

            time_offset = (int(year)-1997)/10.

            if clean(sex) == 'male':
                sex_offset = .1
            else:
                sex_offset = 0.
            
            # incidence rate
            i = mc.invlogit(mc.logit(.012 * mc.invlogit((ages - 44) / 3)) + regional_offset + time_offset + sex_offset)