import os

import pandas
import pylab as pl

import model_utilities as mu
import c_data
import c_prior


def ds_initialize(model_num, data_type, area, thin, iter, replicate,
                  ds_loc='/homes/peterhm/dismod_spline-20130115/build',
                  save_loc='/homes/peterhm/dismod_spline-20130115/build/fit',
                  bare_bones=False):
    '''
    Parameters
    ----------
    model_num : int
      dismod model number
    data_type : str
      one of the allowed epidemiologic parameters:
      'p', 'i', 'r', 'f', 'pf', 'csmr', 'rr', 'smr', 'X'
    area : str
      level of the hierarchy to keep
    thin : int
      thinning interval for the MCMC
    iter : int
      number of MCMC iterations
    replicate : int
      seed for the random number generator
    ds_loc : str, optional
      path to the dismod_spline build directory
    save_loc : str, optional
      directory in which the generated files are saved
    bare_bones : bool
      True creates minimalist files; False uses DisMod-MR values (default: False)
    Results
    -------
    Retrieves the data and builds the files needed for fitting.
    .. note:: If bare_bones is False, the parameter files must be filled in.
    '''
    cwd = os.getcwd()
    os.chdir(ds_loc)
    # creates necessary files
    if bare_bones:
        # creates data
        os.system('bin/get_data.py %s' %model_num)
        os.system('bin/fit.sh %s %s' %(model_num, c_data.convert_data_type(data_type)))
    else:
        # load data structure
        dm3 = mu.load_new_model(model_num, area, data_type)
        # create required files
        data_in = c_data.build_data_in(dm3, data_type, model_num)
        prior_in = c_prior.build_prior_in(dm3, data_type, model_num)
        parameter_in = build_param_in(dm3, thin, iter, replicate)
        # save files
        if not os.path.exists(save_loc): os.makedirs(save_loc)
        data_in.to_csv(save_loc + '/data_in.csv',index=False)
        prior_in.to_csv(save_loc + '/prior_in.csv',index=False)
        parameter_in.to_csv(save_loc + '/parameter_in.csv',index=False)
    # return to working directory
    os.chdir(cwd)
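
# Usage sketch (hedged): the thin/iter/replicate values below are illustrative
# assumptions, not from an original run; model 40418, area 'europe_western',
# and data_type 'p' are taken from the test script at the end of this page.
#
#     ds_initialize(40418, 'p', 'europe_western', thin=10, iter=10000, replicate=0)
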
def build_data_in(dm3, data_type, model_num):
    '''build the dismod_spline data_in table from a DisMod-MR model'''
    # find standard error and use it for standard deviation
    dm3 = mu.create_uncertainty(dm3, "log_normal")
    # create data file
    data_in = empty_data_in(dm3.input_data.index)
    # add covariate columns (x_*, and z_* if present)
    cov = dm3.input_data.filter(like="x_")
    data_in = data_in.join(cov)
    cov_z = dm3.input_data.filter(like="z_")
    if len(cov_z.columns) != 0:
        data_in = data_in.join(cov_z)
    # add data
    data_in["integrand"] = convert_data_type(data_type)
    data_in["meas_value"] = dm3.input_data["value"]
    data_in["meas_stdev"] = dm3.input_data["standard_error"]
    data_in["sex"] = dm3.input_data["sex"]
    data_in["age_lower"] = dm3.input_data["age_start"]
    data_in["age_upper"] = dm3.input_data["age_end"] + 1.0
    data_in["time_lower"] = dm3.input_data["year_start"]
    data_in["time_upper"] = dm3.input_data["year_end"] + 1.0
    data_in["x_sex"] = dm3.input_data["sex"].map(dict(male=0.5, female=-0.5, total=0))
    # create data hierarchy
    model = mu.load_new_model(model_num, "all", data_type)
    superregion = set(model.hierarchy.neighbors("all"))
    region = set(pl.flatten([model.hierarchy.neighbors(sr) for sr in model.hierarchy.neighbors("all")]))
    country = set(
        pl.flatten(
            [
                [model.hierarchy.neighbors(r) for r in model.hierarchy.neighbors(sr)]
                for sr in model.hierarchy.neighbors("all")
            ]
        )
    )
    # assign each row's area to its level in the hierarchy
    for i in dm3.input_data.index:
        area_i = dm3.input_data.ix[i, "area"]
        if area_i in country:
            data_in.ix[i, "m_sub"] = area_i
            data_in.ix[i, "m_region"] = model.hierarchy.in_edges(area_i)[0][0]
            data_in.ix[i, "m_super"] = model.hierarchy.in_edges(
                model.hierarchy.in_edges(area_i)[0][0]
            )[0][0]
        elif area_i in region:
            data_in.ix[i, "m_region"] = area_i
            data_in.ix[i, "m_super"] = model.hierarchy.in_edges(area_i)[0][0]
        elif area_i in superregion:
            data_in.ix[i, "m_super"] = area_i
    return data_in
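
# empty_data_in is called above but not defined in this snippet. A minimal
# sketch, assuming it only pre-allocates the columns build_data_in fills in;
# the original may add other columns or defaults not shown here.
def empty_data_in(index):
    return pandas.DataFrame(index=index,
                            columns=["integrand", "meas_value", "meas_stdev",
                                     "sex", "age_lower", "age_upper",
                                     "time_lower", "time_upper", "x_sex",
                                     "m_sub", "m_region", "m_super"])
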
def prior_m_area(dm3, model_num, data_type):
    '''build area random-effect prior rows from unique input_data areas'''
    # create 'm_sub'/'m_region'/'m_super' priors from unique input_data['area']
    areas = pl.unique(dm3.input_data['area'])
    prior_in = empty_prior_in(range(len(areas)))
    prior_in['name'] = areas
    prior_in['mean'] = 0.
    prior_in['std'] = 1.
    prior_in['lower'] = '-inf'
    prior_in['upper'] = 'inf'
    # create hierarchy
    model = mu.load_new_model(model_num, 'all', data_type)
    superregion = set(model.hierarchy.neighbors('all'))
    region = set(pl.flatten([model.hierarchy.neighbors(sr) for sr in model.hierarchy.neighbors('all')]))
    country = set(
        pl.flatten(
            [
                [model.hierarchy.neighbors(r) for r in model.hierarchy.neighbors(sr)]
                for sr in model.hierarchy.neighbors('all')
            ]
        )
    )
    # assign each unique area to its level in the hierarchy
    for i in prior_in.index:
        if prior_in.ix[i, 'name'] in country:
            prior_in.ix[i, 'type'] = 'm_sub'
        elif prior_in.ix[i, 'name'] in region:
            prior_in.ix[i, 'type'] = 'm_region'
        elif prior_in.ix[i, 'name'] in superregion:
            prior_in.ix[i, 'type'] = 'm_super'
    return prior_in
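
# empty_prior_in is likewise not defined in this snippet. A minimal sketch
# under the same assumption, covering only the fields prior_m_area fills in:
def empty_prior_in(index):
    return pandas.DataFrame(index=index,
                            columns=['type', 'name', 'mean', 'std',
                                     'lower', 'upper'])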
# create output structures; the held-out fit statistics that fill these
# columns are sketched below
stats = [
    "seed",
    "bias_" + rate_type,
    "rmse_" + rate_type,
    "mae_" + rate_type,
    "mare_" + rate_type,
    "pc_" + rate_type,
    "time_" + rate_type,
]
output = pandas.DataFrame(pl.zeros((1, len(stats))), columns=stats)
output["seed"] = replicate
failure = []
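
# Sketch (hedged) of the usual definitions behind the stats columns above; the
# exact computations in model_utilities may differ (e.g. mean vs. median error).
def ooss_stats_sketch(obs, pred_mean, pred_lb, pred_ub):
    err = pred_mean - obs
    return {'bias': pl.mean(err),                       # mean error
            'rmse': pl.sqrt(pl.mean(err**2)),           # root mean squared error
            'mae': pl.mean(pl.absolute(err)),           # mean absolute error
            'mare': pl.median(pl.absolute(err / obs)),  # median absolute relative error
            'pc': pl.mean((obs >= pred_lb) & (obs <= pred_ub))}  # posterior interval coverage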

# load new model
model = mu.load_new_model(model_num, area, data_type)
# replace missing/invalid uncertainty with the 10th percentile of the data set
model = mu.create_uncertainty(model, rate_type)
# withhold 25% of data
model.input_data, test_ix = mu.test_train(model.input_data, data_type, replicate)

try:
    # create pymc nodes for model and fit the model
    model.vars += dismod3.ism.age_specific_rate(model, data_type, area, "male", 2005, rate_type=rate_type)
    # fit the model, using a hill-climbing alg to find an initial value
    # and then sampling from the posterior with MCMC
    start = time.clock()
    dismod3.fit.fit_asr(model, data_type, iter=iter, thin=thin, burn=burn)
    elapsed = time.clock() - start
    # extract posterior predicted values for data
    pred = pandas.DataFrame(
Example no. 5
import sys
sys.path += ['.', '..', '/homes/peterhm/gbd/', '/homes/peterhm/gbd/book']

import pylab as pl

import model_utilities as mu
reload(mu)

import dismod3
reload(dismod3)

model_num = 40418
test_area = 'europe_western'
data_type = 'p'
rate_type = 'binom'

# example model0, to test vars and test-train
model = mu.load_new_model(model_num, test_area, data_type)
# rows with missing effective sample size
nan_ix = list(model.input_data['effective_sample_size'][pl.isnan(model.input_data['effective_sample_size'])].index)
model = mu.create_uncertainty(model, 'binom')
for cv in list(model.input_data.filter(like='x_').columns):  # fill missing covariate values with 0
    model.input_data[cv] = model.input_data[cv].fillna(0)
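
# quick check (sketch): after the fillna above, no x_ covariate should be missing
assert model.input_data.filter(like='x_').isnull().sum().sum() == 0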

# example model1, to test test-train
model1 = mu.load_new_model(model_num, test_area, data_type)
model1 = mu.create_uncertainty(model1, 'normal')

# example model2, to test loading and uncertainty
model2 = mu.load_new_model(model_num, test_area, data_type)
# rows with non-missing effective sample size
non_nan_ix2 = list(model2.input_data['effective_sample_size'][pl.isnan(model2.input_data['effective_sample_size'])==0].index)
ten_percent = pl.percentile(model2.input_data.ix[non_nan_ix2, 'effective_sample_size'], 10.)
model2 = mu.create_uncertainty(model2, 'normal')
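
# hedged check: ten_percent is presumably computed above because
# create_uncertainty fills missing effective sample sizes with the 10th
# percentile of the observed values; if that assumption holds, the filled
# rows should now equal ten_percent
nan_ix2 = list(set(model2.input_data.index) - set(non_nan_ix2))
assert (model2.input_data.ix[nan_ix2, 'effective_sample_size'] == ten_percent).all()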