Beispiel #1
0
 def get_subtype_draws(self):
     meid_measures = self.identify_subtypes()
     df = []
     for mm in meid_measures:
         meid, msid = mm
         has_draws = False
         try:
             thisdf = gopher.draws(
                 {'modelable_entity_ids': [meid]},
                 location_ids=self.location_id,
                 year_ids=self.year_id,
                 sex_ids=self.sex_id,
                 age_group_ids=self.age_group_id,
                 measure_ids=msid,
                 source='dismod',
                 gbd_round_id={GBD ROUND ID})
             if len(thisdf) > 0:
                 has_draws = True
                 print 'Retrieved measure_id %s for meid %s' % (msid, meid)
         except:
             try:
                 thisdf = gopher.draws(
                     {'modelable_entity_ids': [meid]},
                     location_ids=self.location_id,
                     year_ids=self.year_id,
                     sex_ids=self.sex_id,
                     age_group_ids=self.age_group_id,
                     measure_ids=msid,
                     source='dismod',
                     status='latest')
                 if len(thisdf) > 0:
                     has_draws = True
                     print 'Retrieved measure_id %s for meid %s' % (
                             msid, meid)
             except:
                 pass
         if has_draws:
             df.append(thisdf)
         else:
             print 'meid %s draws not found. filling with zeros.' % meid
             dummy_draws = {
                 'modelable_entity_id': meid,
                 'model_version_id': 0,
                 'location_id': self.location_id,
                 'year_id': self.year_id,
                 'age_group_id': self.age_group_id,
                 'sex_id': self.sex_id}
             dummy_draws.update({'draw_%s' % d: 0
                                for d in range(1000)})
             df.append(pd.DataFrame([dummy_draws]))
     df = pd.concat(df)
     reqd_cols = ['modelable_entity_id']
     reqd_cols.extend(draw_cols)
     self.model_version_map = df[[
         'modelable_entity_id', 'model_version_id']]
     self.prevalence = df[reqd_cols].merge(
         self.subin, on='modelable_entity_id', how='left')
     return self.prevalence
def get_unsqueezed(sequelae_map, drawcols, location_id, year_id, sex_id):
    """Pull measure 5 'dismod' draws for every ME listed in ``sequelae_map``.

    MEs whose retrieval fails are represented by a zero-filled copy of the
    first successfully retrieved frame, kept in ``sequelae_map`` row order.

    Args:
        sequelae_map: DataFrame with at least ``me_id`` and ``acause``
            columns; it is also merged back onto the result on ``me_id``.
        drawcols: list of draw column names.
        location_id, year_id, sex_id: passed straight to ``gopher.draws``.

    Returns:
        DataFrame of draws merged with ``sequelae_map`` and restricted to
        age groups 2-20, 30, 31, 32 and 235.

    Raises:
        ValueError: if no ME at all could be retrieved, so there is no
            frame to use as a template for zero filling.
    """
    # Get all causes with epilepsy, ID, and blindness
    retrieved = []  # (me_id, frame or None), in sequelae_map order
    for _, seqrow in sequelae_map.iterrows():
        me_id = int(seqrow['me_id'])
        a = seqrow['acause']

        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            df = gopher.draws(gbd_ids,
                              'dismod',
                              location_ids=location_id,
                              year_ids=year_id,
                              sex_ids=sex_id,
                              measure_ids=5)
            df['me_id'] = me_id
        except Exception:
            print('Failed retrieving %s. Filling with zeros' % (a))
            df = None
        retrieved.append((me_id, df))

    # Zero filling is deferred until after the loop so that a failure on
    # the very first ME no longer crashes on an empty list.
    template = next((df for _, df in retrieved if df is not None), None)
    if template is None:
        raise ValueError(
            'No draws could be retrieved; nothing to zero-fill from')

    frames = []
    for me_id, df in retrieved:
        if df is None:
            df = template.copy()
            df['me_id'] = me_id
            df.loc[:, drawcols] = 0
        frames.append(df)

    unsqueezed = pd.concat(frames)
    unsqueezed = unsqueezed[
        ['me_id', 'location_id', 'year_id', 'age_group_id', 'sex_id'] +
        drawcols]
    unsqueezed = unsqueezed.merge(sequelae_map, on='me_id')
    age_range = list(range(2, 21)) + [30, 31, 32, 235]
    unsqueezed = unsqueezed[unsqueezed['age_group_id'].isin(age_range)]

    return unsqueezed
Beispiel #3
0
def import_cod_model_draws(model_version_id, location_id, cause_id, sex_id,
                           required_columns, filter_years=None):
    """Import model draws from CODEm/custom models.

    Pulls 'codem' draws for one cause, location and sex through ``draws``,
    keeps age groups 2-21, 30, 31, 32 and 235, and validates the result
    with ``check_data_format``.

    Args:
        model_version_id: passed to ``draws`` as ``status``.
        location_id: location to pull (cast to int).
        cause_id: cause to pull.
        sex_id: sex to pull (cast to int).
        required_columns: columns checked by ``check_data_format`` and
            kept in the returned frame.
        filter_years: passed to ``draws`` as ``year_ids``; may be None.

    Returns:
        The draws restricted to ``required_columns``, or None when
        ``check_data_format`` returns a falsy value.

    If reading the draws raises, the error is logged and the process
    exits through ``sys.exit()``.
    """
    logger = logging.getLogger('io.import_cod_model_draws')
    try:
        data = draws(gbd_ids={'cause_ids': [cause_id]}, source='codem',
                     location_ids=[int(location_id)], sex_ids=[int(sex_id)],
                     year_ids=filter_years, status=model_version_id)
        keep_ages = list(range(2, 22)) + [30, 31, 32, 235]
        data = data.loc[data.age_group_id.isin(keep_ages)]
    except Exception:
        # filter_years defaults to None; joining over None would raise a
        # TypeError here and hide the real failure.
        if filter_years is None:
            years_str = 'None'
        else:
            years_str = ','.join(str(y) for y in filter_years)
        logger.exception('Failed to read\n'
                         'Problem demographics were mvid {} cause {}, '
                         'location {}, sex {}, and years {}'
                         .format(model_version_id, cause_id, location_id,
                                 sex_id, years_str))
        # NOTE(review): sys.exit() with no argument exits with status 0
        # even though this is a failure path - confirm that is intended.
        sys.exit()
    r = check_data_format(data, required_columns)
    if not r:
        print('%s %s' % (model_version_id, r))
        return None
    data = data.loc[:, required_columns]
    return data
def process_location_daly_draws(location_id, test=False):
    """Pull 'dalynator' draws for one location, validate and write them.

    Two cause sets are pulled with different age ranges
    (``dw.DALY_ALL_AGE_CAUSE_IDS`` with age groups 2-21 and
    ``dw.DALY_THIRTY_SEVENTY_CAUSE_IDS`` with age groups 11-18), filtered
    to ``metric_id == 1 & measure_id == 1`` and concatenated.  Rows are
    kept from 1990 onwards, or from 1985 onwards for causes in
    ``dw.PRE_1990_CAUSES``.  The result is sanity-checked with asserts,
    age-standardized and written with ``write_output``.

    Args:
        location_id: the single location to pull.
        test: when true only 2015 is requested and the year checks are
            skipped; otherwise an empty year list is passed to gopher.

    Returns:
        The age-standardized DataFrame that was written.
    """
    all_ages = list(range(2, 22))
    cause_age_sets = [
        (dw.DALY_ALL_AGE_CAUSE_IDS, all_ages),
        (dw.DALY_THIRTY_SEVENTY_CAUSE_IDS, list(range(11, 19))),
    ]
    years = [2015] if test else []

    dfs = []
    for causes, ages in cause_age_sets:
        df = gopher.draws({'cause_ids': causes},
                          'dalynator',
                          location_ids=[location_id],
                          year_ids=years,
                          age_group_ids=ages,
                          sex_ids=[3],
                          verbose=True,
                          num_workers=5,
                          version=113)
        # without this here, it can give a too many inputs error
        df = df.query('metric_id == 1 & measure_id == 1')
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)

    # Boolean-mask selection: .loc replaces the removed .ix indexer.
    df = df.loc[(df['year_id'] >= 1990) |
                ((df['cause_id'].isin(dw.PRE_1990_CAUSES)) &
                 (df['year_id'] >= 1985))]

    # make sure it looks like we expect
    assert set(df.age_group_id) == set(all_ages), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([3]), \
        'unexpected sex ids found'
    if not test:
        is_pre_1990 = df['cause_id'].isin(dw.PRE_1990_CAUSES)
        assert set(df.loc[is_pre_1990].year_id) == \
            set(range(1985, 2016, 1)), \
            'unexpected year ids found'
        assert set(df.loc[~is_pre_1990].year_id) == \
            set(range(1990, 2016, 1)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # age standardize
    df = age_standardize(df, 'dalynator')

    # write the output
    write_output(df, 'dalynator', location_id)

    return df
Beispiel #5
0
 def pull_codcorrect_draws(self):
     """Pull measure 1 'codcorrect' draws for this instance's demographics.

     The result is limited to the age groups in ``self.age_group_ids``
     plus the keys of ``self.aggregated_age_group_ids``.

     Returns:
         DataFrame with ``self.index_cols``, ``cause_id`` and
         ``self.draw_cols``.
     """
     codcorrect_df = draws(gbd_ids={'cause_ids': self.cause_id},
                           year_ids=self.year_id,
                           source='codcorrect',
                           sex_ids=self.sex_id,
                           measure_ids=[1])
     # list() keeps the concatenation working when keys() is a view.
     wanted_ages = (self.age_group_ids +
                    list(self.aggregated_age_group_ids.keys()))
     codcorrect_df = codcorrect_df.loc[
         codcorrect_df.age_group_id.isin(wanted_ages)]
     return codcorrect_df[self.index_cols + ['cause_id'] + self.draw_cols]
 def get_draws(self, measure_id=6):
     '''Pull 'epi' draws of this instance's input ME via gopher.draws.

     Requests sex 2, age groups 7-15 and ``self.year_id`` with an empty
     location list, then inner-merges the result with
     ``self.get_locations(35)`` and removes the ``most_detailed`` column.
     '''
     me_draws = gopher.draws(
         gbd_ids={'modelable_entity_ids': [self.input_me]},
         source='epi',
         measure_ids=[measure_id],
         location_ids=[],
         year_ids=[self.year_id],
         age_group_ids=list(range(7, 16)),
         sex_ids=[2])
     locations = self.get_locations(35)
     merged = me_draws.merge(locations, on='location_id', how='inner')
     return merged.drop('most_detailed', axis=1)
Beispiel #7
0
def grab_prevalence_draws(me_id, year, locations):
    """Fetch 'epi' draws with status 'best' for one modelable entity.

    Args:
        me_id: modelable entity id to pull.
        year: passed to ``draws`` as ``year_ids``.
        locations: passed to ``draws`` as ``location_ids``.

    Returns:
        Whatever ``draws`` returns for the request.
    """
    # grabs prevalence draws for the given year, me_id, and locations
    # NOTE(review): the brace-wrapped values below look like redacted
    # placeholders rather than valid Python; real ids must be filled in
    # before this can run.
    gbd_round = {GBD ROUND ID}
    measure = {MEASURE ID}
    sexes = [{SEX ID}]
    ages = [{AGE GROUP IDS}]
    df = draws(source='epi', gbd_ids={"modelable_entity_ids": [me_id]},
               location_ids=locations, year_ids=year,
               age_group_ids=ages, sex_ids=sexes,
               status='best', measure_ids=[measure],
               gbd_round_id=gbd_round)
    return df
Beispiel #8
0
def create_env(location_id, year, sex):
    """Pull the measure 5 'dismod' draws for each envelope ME.

    Returns:
        dict mapping an envelope label ('epi', 'blind', 'id_bord', ...)
        to a copy of the frame returned by ``gopher.draws`` for that ME.
    """
    env_ids = {
        'epi': 2403, 'blind': 9805, 'id_bord': 9423, 'id_mild': 9424,
        'id_mod': 9425, 'id_sev': 9426, 'id_prof': 9427}
    envelope_dict = {}
    for envlab in env_ids:
        me_id = env_ids[envlab]
        env_draws = gopher.draws(
            {'modelable_entity_ids': [me_id]},
            'dismod',
            location_ids=location_id,
            year_ids=year,
            sex_ids=sex,
            measure_ids=5)
        envelope_dict[envlab] = env_draws.copy()
    return envelope_dict
def interp_loc(modelable_entity_id, measure_id, location_id, outpath):
    """Interpolate 'dismod' draws to annual values and write them to CSV.

    Draws are pulled every five years from 1990 to 2015, interpolated
    between consecutive pulls with ``maths.interpolate`` (ranked against
    the 2005 draws), and the 1990 values are copied back to each year
    1980-1989.  The combined frame is written to ``outpath``.

    Args:
        modelable_entity_id: ME to pull.
        measure_id: measure to pull.
        location_id: single location to pull.
        outpath: destination passed to ``DataFrame.to_csv``.
    """
    start_year = 1980
    epi_start_year = 1990
    end_year = 2015
    rank_year = 2005
    draw_cols = ['draw_%s' % i for i in range(1000)]

    # Retrieve epi draws and interpolate
    epi_draws = []
    for y in range(epi_start_year, end_year + 1, 5):
        d = gopher.draws({'modelable_entity_ids': [modelable_entity_id]},
                         year_ids=[y],
                         location_ids=[location_id],
                         measure_ids=[measure_id],
                         verbose=False,
                         source="dismod",
                         age_group_ids=list(range(2, 22)))
        assert len(d) > 0, (
            "Uh oh, couldn't find epi draws. Make sure you have "
            "proportion estimates for the supplied meids")
        epi_draws.append(d)
    epi_draws = pd.concat(epi_draws)

    ip_epi_draws = []
    for sy in range(epi_start_year, end_year, 5):
        ey = sy + 5
        ip_draws = maths.interpolate(
            epi_draws.query('year_id==%s' % sy),
            epi_draws.query('year_id==%s' % ey),
            ['age_group_id', 'model_version_id', 'sex_id'],
            'year_id', draw_cols,
            sy,
            ey,
            rank_df=epi_draws.query('year_id==%s' % rank_year))
        # Each window's end year is the next window's start year; keep it
        # only for the final window to avoid duplicated years.
        if ey != end_year:
            ip_draws = ip_draws[ip_draws.year_id != ey]
        ip_epi_draws.append(ip_draws)
    ip_epi_draws = pd.concat(ip_epi_draws)

    # Back-fill the years before the first epi year with that year's
    # draws.  The query is loop-invariant, and an explicit copy avoids
    # assigning a column on a slice of ip_epi_draws.
    first_year_draws = ip_epi_draws.query('year_id==%s' % epi_start_year)
    extrap_draws = []
    for y in range(start_year, epi_start_year):
        back_filled = first_year_draws.copy()
        back_filled['year_id'] = y
        extrap_draws.append(back_filled)
    epi_draws = pd.concat([ip_epi_draws] + extrap_draws)
    epi_draws.to_csv(outpath)
def get_props(args):
    """Compute mean scaled draws per location, year and ME.

    Args:
        args: a ``(location, year)`` pair.

    Returns:
        DataFrame with ``location_id``, ``year_id``,
        ``modelable_entity_id`` and the draw columns, holding the group
        means of the output of ``maths.scale``.
    """
    location, year = args
    print('%s %s' % (location, year))
    me_ids = [1951, 1952, 1953]
    prev_dfs = gopher.draws({'modelable_entity_ids': me_ids},
                            source='dismod',
                            location_ids=location,
                            year_ids=year)
    for id_col in ('location_id', 'year_id'):
        prev_dfs[id_col] = prev_dfs[id_col].astype(int)

    # Extract proportions
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    group_cols = ['location_id', 'year_id', 'modelable_entity_id']
    scaled = maths.scale(prev_dfs, drawcols, index_cols, scalar=1)
    props = scaled.groupby(group_cols).mean().reset_index()
    return props[group_cols + drawcols]
Beispiel #11
0
def collect_risk_attrib_burden(rei_ids, measure_id, locs=None):
    ''' Pull risk-attributable 'dalynator' draws and save one file per risk.

    Draws are requested for the given rei_ids against cause 294, age
    group 27, sex 3, metric 1 and the years 1990-2015 in five-year steps.
    When ``locs`` is falsy the location ids are read from the
    ``locations`` table (level 3).  Each rei_id found in the result is
    written to ``<dw.RISK_BURDEN_OUTDIR>/<rei_id>.h5``.

    Note: run this with a big qlogin because extra cores are used to read
    more files in parallel.

    Returns the full, unsplit DataFrame.
    '''
    # note -- untested since I don't have permission to create new directories
    if not locs:
        # Only 188 countries
        query = "select location_id from locations where level = 3"
        engine = sql.create_engine('strConnection')
        locs = set(pd.read_sql_query(query, engine).location_id.values)

    gbd_ids = {"rei_ids": rei_ids, "cause_ids": [294]}
    estimation_years = [1990, 1995, 2000, 2005, 2010, 2015]
    df = gopher.draws(gbd_ids=gbd_ids,
                      source='dalynator',
                      version=dw.RISK_BURDEN_DALY_VERS,
                      location_ids=locs,
                      age_group_ids=[27],
                      sex_ids=[3],
                      year_ids=estimation_years,
                      measure_ids=[measure_id],
                      metric_ids=[1],
                      verbose=True,
                      num_workers=10)

    out_dir = dw.RISK_BURDEN_OUTDIR
    # everything is already formatted perfectly so it can just be saved
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    for rei_id in df.rei_id.unique():
        print(rei_id)
        # the query string refers to the loop variable by name
        odf = df.query("rei_id == @rei_id")
        out_path = out_dir + "/{}.h5".format(int(rei_id))
        odf.to_hdf(out_path,
                   key="data",
                   format="table",
                   data_columns=["location_id", "year_id"])
    return df
Beispiel #12
0
    def import_square(self, gopher_what, source, filler=None, **kwargs):
        """Get draws for the specified modelable entity by dimensions.

        Args:
            gopher_what: the gbd id dict passed to ``gopher.draws``.
            source: the ``source`` argument for ``gopher.draws``.
            filler: value used to fill missing cells (defaults to 0).
            **kwargs: demographic filters for ``gopher.draws``.  When none
                are given, ``self.idx_dmnsns`` is used with each key
                pluralised (e.g. ``year_id`` -> ``year_ids``).

        Returns:
            ``self.index_df`` concatenated column-wise with the draw
            columns, with missing values replaced by ``filler``.
        """
        if not kwargs:
            # Build a new dict instead of popping and inserting while
            # iterating over keys(), which raises on Python 3.
            kwargs = {k + "s": v for k, v in self.idx_dmnsns.items()}

        if filler is None:
            filler = 0

        index_names = list(self.idx_dmnsns.keys())
        df = gopher.draws(gopher_what, source=source, verbose=False, **kwargs)
        for c in index_names:
            df[c] = pd.to_numeric(df[c])
        df = df.set_index(index_names)
        df = df[self.draw_cols]
        df = pd.concat([self.index_df, df], axis=1)
        df.fillna(value=filler, inplace=True)
        return df
Beispiel #13
0
def split_location(location_id, gbd_round):
    """Split the parent ME's draws for one location into child MEs.

    Pulls measure 5 and 6 'dismod' draws for ``ss.parent_meid``, limits
    the proportions from ``ss.gbdize_proportions`` to the measures, age
    groups and sexes present in those draws, and applies
    ``maths.merge_split``.

    Returns:
        The split frame with ``modelable_entity_id`` set from its
        ``child_meid`` column.
    """
    parent_draws = gopher.draws(
        {'modelable_entity_ids': [ss.parent_meid]},
        source='dismod',
        location_ids=location_id,
        measure_ids=[5, 6],
        gbd_round_id=year_map[gbd_round])
    parent_draws['measure_id'] = parent_draws.measure_id.astype(int)

    gprops = ss.gbdize_proportions(location_id)
    # keep only proportions whose demographics exist in the draws
    for dim in ('measure_id', 'age_group_id', 'sex_id'):
        gprops = gprops[gprops[dim].isin(parent_draws[dim].unique())]

    group_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id']
    dcs = ['draw_%s' % i for i in range(1000)]
    splits = maths.merge_split(parent_draws,
                               gprops,
                               group_cols=group_cols,
                               value_cols=dcs)
    return splits.assign(modelable_entity_id=splits['child_meid'])
Beispiel #14
0
def get_unsqueezed(sequelae_map, drawcols, location_id, year, sex):
    """Pull measure 5 'dismod' draws for every ME listed in ``sequelae_map``.

    MEs whose retrieval fails are represented by a zero-filled copy of the
    first successfully retrieved frame, kept in ``sequelae_map`` row order.

    Args:
        sequelae_map: DataFrame with at least ``me_id``, ``acause``,
            ``grouping`` and ``healthstate`` columns; it is also merged
            back onto the result on ``me_id``.
        drawcols: list of draw column names.
        location_id, year, sex: passed straight to ``gopher.draws``.

    Returns:
        DataFrame of draws merged with ``sequelae_map`` and restricted to
        age group ids strictly between 1 and 22.

    Raises:
        ValueError: if no ME at all could be retrieved, so there is no
            frame to use as a template for zero filling.
    """
    # Get all causes with epilepsy, ID, and blindness
    retrieved = []  # (me_id, frame or None), in sequelae_map order
    for _, seqrow in sequelae_map.iterrows():
        me_id = int(seqrow['me_id'])
        a = seqrow['acause']
        g = seqrow['grouping']
        h = seqrow['healthstate']

        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            df = gopher.draws(
                gbd_ids,
                'dismod',
                location_ids=location_id,
                year_ids=year,
                sex_ids=sex,
                measure_ids=5)
            df['me_id'] = me_id
        except Exception:
            print('Failed retrieving %s %s %s' %
                  (a, g, h))
            df = None
        retrieved.append((me_id, df))

    # Zero filling is deferred until after the loop so that a failure on
    # the very first ME no longer crashes on an empty list.
    template = next((df for _, df in retrieved if df is not None), None)
    if template is None:
        raise ValueError(
            'No draws could be retrieved; nothing to zero-fill from')

    frames = []
    for me_id, df in retrieved:
        if df is None:
            df = template.copy()
            df['me_id'] = me_id
            df.loc[:, drawcols] = 0
        frames.append(df)

    unsqueezed = pd.concat(frames)
    unsqueezed = unsqueezed[[
        'me_id', 'location_id', 'year_id', 'age_group_id', 'sex_id']+drawcols]
    unsqueezed = unsqueezed.merge(sequelae_map, on='me_id')
    unsqueezed = unsqueezed[unsqueezed.age_group_id < 22]
    unsqueezed = unsqueezed[unsqueezed.age_group_id > 1]

    return unsqueezed
Beispiel #15
0
    def test_diff_input(self, gbd_id_dict, measure_ids, location_ids,
                        start_year, end_year, sex_ids, status, source, version,
                        metric_ids, age_group_ids, change_type):
        ''' Run pct_change for one combination of inputs.

        Draws for the start and end year are pulled, rows with metric_id 1
        are transformed to metric_id 3 (rate space), and the requested
        change is computed with ``pct_change``.  The result is returned;
        no validation against expected values is performed here yet.
        '''
        # Get draws
        year_pair = [start_year, end_year]
        pulled = draws(
            gbd_id_dict,
            measure_ids=measure_ids,
            location_ids=location_ids,
            year_ids=year_pair,
            age_group_ids=age_group_ids,
            sex_ids=sex_ids,
            status=status,
            source=source,
            include_risks=True,
            version=version)
        df = pulled.reset_index(drop=True)

        # standardize all inputs by transforming everything to rate space
        df = define_metric(df, source)
        if 1 in df.metric_id.unique():
            in_number_space = df.metric_id == 1
            df.loc[in_number_space] = transform_metric(
                df.loc[in_number_space], to_id=3, from_id=1)

        # calculate pct_change
        # drop any 2's. transform only 3's.
        if change_type == 'pct_change_num':
            rates_only = df[df.metric_id == 3]
            df = transform_metric(rates_only, to_id=1, from_id=3)
        if change_type in ('pct_change_rate', 'pct_change_num'):
            change_type = 'pct_change'

        # validate change_df result here...
        return pct_change(df, start_year, end_year, change_type)
Beispiel #16
0
# Script section: compute late cause fractions as the ratio of the late
# cause's codcorrect draws to the envelope cause's draws for one year.
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# NOTE(review): 'FILEPATH.log' looks like a redacted path; it has no format
# placeholders, so applying % with a tuple raises TypeError as written.
rlog.open('FILEPATH.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
codcorrect_df = codcorrect_df[
    codcorrect_df.age_group_id.isin(list(range(7, 16)))]

envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]

# we only want index_cols and draws as columns
# sort_index() replaces the removed DataFrame.sort(), which sorted by the
# index when called without arguments.
envelope_df = envelope_df[columns].set_index(index_cols).sort_index()
late_df = late_df[columns].set_index(index_cols).sort_index()

# calculate late cause fractions
rlog.log('Calculating late cfs for year %s' % year)
late_cfs = late_df / envelope_df
Beispiel #17
0
def _pull_codem_model_draws(gbd_ids, model_version_id, years, ages,
                            locations):
    """Pull measure 1 'codem' draws for one model version and strip the
    bookkeeping columns that are not needed downstream."""
    df = draws(gbd_ids,
               source='codem',
               measure_ids=[1],
               year_ids=years,
               age_group_ids=ages,
               location_ids=locations,
               status=model_version_id)
    df.drop('model_version_id', axis=1, inplace=True)
    # These columns are optional.  A missing label raises ValueError or
    # KeyError depending on the pandas version; as before, if either of
    # 'envelope'/'pop' is absent neither of them is dropped.
    for optional_cols in ('measure_id', ['envelope', 'pop']):
        try:
            df.drop(optional_cols, axis=1, inplace=True)
        except (KeyError, ValueError):
            pass
    return df


def get_models(cause_set, ages, years, locations):
    """Pull in active oldCorrect models for a cause set.

    Args:
        cause_set: passed to ``get_cause_ids`` to obtain the source and
            target cause ids.
        ages: age group ids to pull.
        years: year ids to pull.
        locations: location ids to pull.

    Returns:
        Tuple ``(source_codem, source_dismod, target_df)`` of
        concatenated draw frames.

    Raises:
        AssertionError: if any pulled frame contains null values.
    """
    sources, targets = get_cause_ids(cause_set)

    hybrid_scale_input_mvt = 8
    custom_mvt = 4
    hybrid_mvt = 3

    source_model_codem_dfs = []
    source_model_dismod_dfs = []
    for cause_id in sources:
        source_models_codem = pull_mvid(cause_id, hybrid_scale_input_mvt)
        source_models_dismod = pull_mvid(cause_id, custom_mvt)
        source_ids = {'cause_ids': [cause_id]}
        # Pull in codem models
        for model_version in source_models_codem:
            source_model_codem_dfs.append(_pull_codem_model_draws(
                source_ids, model_version, years, ages, locations))
        # Pull in custom/dismod models
        # NOTE(review): these are also read with source='codem', exactly
        # as in the original code - confirm that is intended.
        for model_version in source_models_dismod:
            source_model_dismod_dfs.append(_pull_codem_model_draws(
                source_ids, model_version, years, ages, locations))

    source_codem = pd.concat(source_model_codem_dfs)
    source_dismod = pd.concat(source_model_dismod_dfs)

    target_dfs = []
    target_nulls = []
    for cause_id in targets:
        target_models = pull_mvid(cause_id, hybrid_mvt)
        target_ids = {'cause_ids': [cause_id]}
        # Pull in target models
        for model_version in target_models:
            target = _pull_codem_model_draws(
                target_ids, model_version, years, ages, locations)
            if len(target[target.isnull().any(axis=1)]) != 0:
                target_nulls.append(cause_id)
            target_dfs.append(target)

    assert len(target_nulls) == 0, "Nulls target: %s" % target_nulls
    target_df = pd.concat(target_dfs)

    assert len(source_dismod[source_dismod.isnull().any(axis=1)]) ==\
        0, "Nulls dismod"
    assert len(source_codem[source_codem.isnull().any(axis=1)]) ==\
        0, "Nulls codem"

    return source_codem, source_dismod, target_df
    # NOTE(review): fragment of a separate snippet. Its enclosing function
    # header is not visible and the final call below is cut off mid-argument
    # list. Names such as mean_parameters, cov_matrix, savefile and
    # output_dir are defined outside this view.
    # set demographic data
    me_id = 8691
    location_set = 35
    gbd_round = 4
    locations = get_most_detailed(location_set, gbd_round)
    years = [1990, 1995, 2000, 2005, 2010, 2016]
    ages = [164]
    sexes = [1, 2]
    measure = 5
    upload_me = 15803

    # grab the u_2500 birth prevalence
    prev_df = draws(source='dismod',
                    gbd_ids={"modelable_entity_ids": [me_id]},
                    location_ids=locations, year_ids=years,
                    age_group_ids=ages, sex_ids=sexes,
                    status='best', measure_ids=[measure],
                    gbd_round_id=gbd_round)
    prev_df = index_draws_by_demographics(prev_df)

    # element-wise prediction applied to every cell of prev_df
    def mapping(x):
        y = predict_for_simple_ols(x, mean_parameters, cov_matrix)
        return y

    mean_weight_df = prev_df.applymap(mapping)
    mean_weight_df['age_group_id'] = 2
    save_to_hdf(mean_weight_df, savefile)
    # NOTE(review): the two adjacent literals below join without a space,
    # producing "linearregression" in the description - confirm and fix.
    description = ('Estimate of mean birth weight from simple linear'
                   'regression. Units in grams')
    al.save_custom_results(meid=upload_me, description=description,
                           input_dir=output_dir,
Beispiel #19
0
# Script section: read the maternal disorders envelope draws and the cause
# fraction draws, each limited to the requested locations.
# NOTE(review): log_dir, jobname, out_dir, locations, env_model_vers and
# source_id are defined outside this view.
# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('out_dir is %s' % out_dir)

# set up columns we want to subset
columns = maternal_fns.filter_cols()
columns.remove('measure_id')
index_cols = [col for col in columns if not col.startswith('draw_')]

# read maternal disorders envelope
# CAUSES get multiplied by the Late corrected env from codem
# TIMINGS get multiplied by the CoDcorrect env
rlog.log("reading in envelope draws")
if 'timing' in jobname:
    env = draws(gbd_ids={'cause_ids': [366]}, source='codcorrect',
                measure_ids=[1], sex_ids=[2], location_ids=locations)
else:
    env = draws(gbd_ids={'cause_ids': [366]}, source='codem', sex_ids=[2],
                status=int(env_model_vers))
# the codem pull is not restricted by location, so filter here
env = env[env.location_id.isin(locations)]
# we only want maternal age groups
env = env[env.age_group_id.isin(range(7, 16))]
# we only want index cols & draws as columns, w multiindex
env = env[columns].set_index(index_cols).sort_index()

# read cfs
rlog.log("reading in cfs")
cfs = draws(gbd_ids={'modelable_entity_ids': [source_id]}, source='dismod',
            measure_ids=[18], sex_ids=[2])
cfs = cfs[cfs.location_id.isin(locations)]
# we only want maternal age groups
Beispiel #20
0
    # NOTE(review): this indented assignment belongs to a preceding block
    # whose header is not visible in this view.
    held_constant_me = 9015

#######################################################################
# STEP 1: FOR EACH CAUSE, EXTRACT FILES, GET SUM BY GROUP + TOTAL SUM
#######################################################################
print 'getting data'
rlog.log('getting data')
all_data = {}
summed_idx = 0

# One pass per row of step_df: load that row's source ME draws and file them
# under its target id (or keep them aside for the held-constant ME).
for index, row in step_df.iterrows():
    target_id = row['target_id']
    try:
        subtype_df = draws(
            gbd_ids={'modelable_entity_ids': [row['source_id']]},
            source='dismod',
            measure_ids=[18],
            sex_ids=[2],
            year_ids=[year])
    except (ValueError, OSError):  # pull data from where interp saves it
        subtype_df = pd.read_hdf(
            '%s/%s/%s_2.h5' % (cluster_dir, row['source_id'], year), 'draws')
    # NOTE(review): .ix is a deprecated pandas indexer; with a boolean mask
    # .loc is the equivalent.
    subtype_df = subtype_df.ix[(subtype_df.location_id.isin(locs))
                               & (subtype_df.age_group_id.isin(range(7, 16)))]
    subtype_df = subtype_df[columns].set_index(index_cols).sort_index()

    # NOTE(review): source_id is compared against a string here, so it is
    # presumably stored as text in step_df - confirm.
    if row['source_id'] == str(held_constant_me):
        held_constant_df = subtype_df.copy(deep=True)
    else:
        # save this dataframe, and also sum it to all other subtypes
        all_data[target_id] = subtype_df
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull COMO rate draws for one location and convert them to cases.

    Draws for ``dw.COMO_INC_CAUSE_IDS`` (measure 6) or
    ``dw.COMO_PREV_CAUSE_IDS`` (measure 5) are pulled from the 'como'
    source through gopher, checked with asserts, multiplied by
    population to obtain counts, summed over sexes, age-standardized and
    written with ``write_output``.

    Args:
        location_id: the single location to pull.
        measure_id: 6 or 5; anything else raises ValueError.
        test: when true only 2015 is requested and the year check is
            skipped; otherwise an empty year list is passed to gopher.

    Returns:
        The age-standardized DataFrame that was written.
    """
    db_pops = qry.get_pops()

    if measure_id == 6:
        cause_ids = dw.COMO_INC_CAUSE_IDS
    elif measure_id == 5:
        cause_ids = dw.COMO_PREV_CAUSE_IDS
    else:
        raise ValueError("bad measure_id: {}".format(measure_id))
    gbd_ids = {'cause_ids': cause_ids}

    years = [2015] if test else []
    df = gopher.draws(gbd_ids,
                      'como',
                      measure_ids=[measure_id],
                      location_ids=[location_id],
                      year_ids=years,
                      age_group_ids=[],
                      sex_ids=[],
                      verbose=True,
                      num_workers=5,
                      version=dw.COMO_VERS)

    # make sure it looks like we expect
    assert set(df.age_group_id) == set(range(2, 22)), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([1, 2]), \
        'unexpected sex ids found'
    if not test:
        assert set(df.year_id) == set(range(1990, 2016, 5)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # these pull in as rates
    df['metric_id'] = 3

    # merge with pops to transform to cases
    df = df.merge(db_pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with populations failed'

    # scale every draw column row-wise by mean_pop and put the metadata
    # columns back in front
    case_draws = df[dw.DRAW_COLS].mul(df['mean_pop'], axis=0)
    df = pd.concat([df[dw.COMO_GROUP_COLS], case_draws], axis=1)

    # now its numbers (this line is for readability)
    df['metric_id'] = 1

    # aggregate sexes: relabel as sex 3, then collapse sexes together
    df['sex_id'] = 3
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()

    # age standardize
    df = age_standardize(df, 'como')

    write_output(df, 'como', location_id)
    return df
Beispiel #22
0
    # NOTE(review): body fragment of a function whose header is not visible
    # in this view; args, status, gbd_id_dict, age_group_ids and change_type
    # are defined there.
    assert (end_year > start_year), "Yr end must be more recent than yr start"
    assert (source != 'risk'), "Risk as a source is not supported."

    # convert kwargs from a list of single key dicts to one dict with
    # multiple keys, if any specified from get_pct_change.ado
    for d in args.pop('kwargs'):
        for k, v in d.iteritems():
            args[k] = v

    # Get draws
    # Named arguments are popped out of args so that only the remaining
    # entries are forwarded through **args.
    try:
        df = draws(gbd_id_dict,
                   measure_ids=args.pop('measure_ids'),
                   location_ids=args.pop('location_ids'),
                   year_ids=[start_year, end_year],
                   age_group_ids=args.pop('age_group_ids'),
                   sex_ids=args.pop('sex_ids'),
                   status=status,
                   source=args.pop('source'),
                   include_risks=args.pop('include_risks'),
                   **args).reset_index(drop=True)
    except Exception as e:
        # catch all exceptions, because we need to write something to stdout
        # no matter what error. Get_pct_change.ado creates a pipe and reads
        # from it -- if nothing is written to the pipe, stata hangs
        print "Encountered error while reading draws: {}".format(e)
        raise

    # If they want age-std, make sure that's possible
    if 27 in age_group_ids:
        assert change_type != 'pct_change_num', ('Cant calc pct_change_num '
                                                 'for age-std')
Beispiel #23
0
def allocate_residuals(usqzd, sqzd, location_id, year_id, sex_id, map_file,
                       drawcols):
    """Allocate squeeze residuals to their residual-target MEs.

    The residual is ``max(unsqueezed - squeezed, 0)`` per draw, computed
    for rows whose unsqueezed ``resid_target_me`` is not null and summed by
    ``resid_target_me`` and ``age_group_id``. For every residual target,
    measure 5 'dismod' draws are pulled through gopher, the residual is
    added draw-by-draw (matched on ``age_group_id``) and the result is
    written to an HDF5 file.

    Arguments:
        usqzd: unsqueezed draws. Needs location_id, year_id, age_group_id,
            sex_id, me_id, resid_target_me and the draw columns.
        sqzd: squeezed draws with the same key columns, resid_target_me
            and draw columns.
        location_id, year_id, sex_id: demographics passed to gopher, used
            in the output file name and stamped on the written rows.
        map_file: path or buffer readable by ``pd.read_csv`` holding the
            columns modelable_entity_id_source and
            modelable_entity_id_target.
        drawcols: iterable of draw column names (e.g. 'draw_0', ...).

    Returns:
        DataFrame with columns resid_target_me, age_group_id and the draw
        columns, holding the summed residuals.

    Raises:
        OSError: if the output directory cannot be created.
    """
    tmap = pd.read_csv(map_file)
    draw_names = list(drawcols)

    resids = usqzd.merge(
        sqzd,
        on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
        suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    # Derive the suffixed names from drawcols (instead of a hard-coded
    # range(1000)) so the inputs and the output labels always agree.
    dscols = ['%s.sqzd' % c for c in draw_names]
    ducols = ['%s.usqzd' % c for c in draw_names]
    # Only positive differences are allocated; negatives are floored at 0.
    toalloc = (resids[ducols].values - resids[dscols].values).clip(min=0)
    resids = resids.join(
        pd.DataFrame(data=toalloc, index=resids.index, columns=draw_names))
    resids = resids.rename(
        columns={'resid_target_me.usqzd': 'resid_target_me'})
    resids = resids.groupby(
        ['resid_target_me', 'age_group_id'])[draw_names].sum().reset_index()

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except (TypeError, ValueError, OverflowError):
            # No single integer target in the map; t_meid is only used in
            # the log message below, so keep whatever squeeze() returned.
            pass

        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            t_df = gopher.draws(gbd_ids,
                                'dismod',
                                location_ids=location_id,
                                year_ids=year_id,
                                sex_ids=sex_id,
                                measure_ids=5)
        except ValueError:
            print('ME ID %s missing' % me_id)
            continue

        t_df = t_df.merge(resid_df,
                          on='age_group_id',
                          suffixes=('#base', '#resid'))
        # Select by name so each base draw is added to its own residual
        # draw regardless of the column order gopher returned.
        newvals = (t_df[['%s#base' % c for c in draw_names]].values +
                   t_df[['%s#resid' % c for c in draw_names]].values)
        t_df = t_df.join(
            pd.DataFrame(data=newvals, index=t_df.index, columns=draw_names))

        print('Writing residual %s to file' % t_meid)
        drawsdir = "/FILEPATH"
        # NOTE(review): fn does not vary with the ME and the file is opened
        # with mode='w', so every iteration overwrites the previous one's
        # output. Presumably the real directory includes t_meid - confirm.
        fn = "%s/%s_%s_%s.h5" % (drawsdir, location_id, year_id, sex_id)
        try:
            os.makedirs(drawsdir)
        except OSError as e:
            # An already existing directory is fine; anything else is not.
            if e.errno != errno.EEXIST:
                raise
        t_df['location_id'] = int(float(location_id))
        t_df['year_id'] = int(float(year_id))
        t_df['sex_id'] = int(float(sex_id))
        t_df['measure_id'] = 5
        t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
        datacols = [
            'location_id', 'year_id', 'age_group_id', 'sex_id',
            'measure_id'
        ]
        t_df[datacols + draw_names].to_hdf(fn,
                                           'draws',
                                           mode='w',
                                           format='table',
                                           data_columns=datacols)

    return resids
def process_risk_exposure_draws(location_id, test=False):
    """Return yearly age-standardized exposure draws for each rei_id.

    1. Use gopher to pull exposure draws for every rei_id in
        dw.RISK_EXPOSURE_REI_IDS and dw.RISK_EXPOSURE_REI_IDS_MALN for
        the given location and both sexes.
    2. Keep age groups 2-21, years from 1990 on and the appropriate
        exposure categories for the given rei_id.
    3. Draws only come with male/female in rates -
        change to cases and make both sexes aggregate
        (rei_id 167 is kept as females only).
    4. Revert back to rates and age standardize using custom weights.

    Side effects: writes the model versions that were used to a csv and
    hands the final frame to write_output.

    Arguments:
        location_id: the location_id to process
        test: if True only year 2015 is requested from gopher, otherwise
            an empty year list is passed (presumably meaning all years -
            confirm against gopher)

    Returns:
        pandas dataframe like so:
        [dw.RISK_EXPOSURE_GROUP_COLS] : [dw.DRAW_COLS]
    """
    all_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    years = [2015] if test else []

    dfs = []
    version_dfs = []
    for rei_id in all_ids:
        print("pulling {r}".format(r=rei_id))
        df = gopher.draws({"rei_ids": [rei_id]},
                          source='risk',
                          draw_type='exposure',
                          location_ids=[location_id],
                          year_ids=years,
                          age_group_ids=[],
                          sex_ids=[1, 2],
                          num_workers=5)
        # remove any other ages besides gbd ages
        df = df.query('age_group_id >= 2 & age_group_id <= 21')
        # only reporting since 1990
        df = df.query('year_id>=1990')

        if rei_id == 167:
            # change IPV to just women
            df = df.query('sex_id == 2')

        if rei_id in dw.RISK_EXPOSURE_REI_IDS_MALN:
            # these are childhood stunting - cat1 + cat2 equals <-2 std dev
            df = df.query('parameter=="cat1" | parameter=="cat2"')
        else:
            # cat1 represents the prevalence in these cases (can't test this?)
            df = df.query('parameter=="cat1"')

        # rei_id isn't in the gopher pull; metric 3 / measure 5 mark these
        # as prevalence rates. assign() builds a new frame, so nothing is
        # written into a filtered view of the gopher result.
        df = df.assign(rei_id=rei_id, metric_id=3, measure_id=5)

        # keep track of what model versions were used
        version_dfs.append(
            df[['rei_id', 'modelable_entity_id',
                'model_version_id']].drop_duplicates())

        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)
    # one concat instead of DataFrame.append per iteration (append was
    # removed in pandas 2.0 and copies the whole frame every call)
    version_df = pd.concat(version_dfs, ignore_index=True)

    # note the versions used by risk exposure vers (manufactured by me)
    version_df.to_csv(
        "/home/j/WORK/10_gbd/04_journals/"
        "gbd2015_capstone_lancet_SDG/02_inputs/"
        "risk_exposure_versions_{v}.csv".format(v=dw.RISK_EXPOSURE_VERS),
        index=False)

    # fetched once and reused for both conversions below
    # (assumes qry.get_pops() is a pure lookup)
    pops = qry.get_pops()

    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing: everything becomes both
    # sexes (3) except rei_id 167, which stays female (2)
    df['sex_id'] = 3
    df.loc[df.rei_id == 167, 'sex_id'] = 2

    # rates * population = cases
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].mul(df['mean_pop'], axis=0)
    ], axis=1)
    # so unnecessary programmatically but good for documentation -
    #  these are now prev cases
    df['metric_id'] = 1
    # now that its in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].div(df['mean_pop'], axis=0)
    ], axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    # the MALN rei_ids get weights over age groups 2-5, all others 2-21
    df['is_0_5'] = df.rei_id.isin(dw.RISK_EXPOSURE_REI_IDS_MALN).astype(int)
    wgts = pd.concat([
        custom_age_weights(2, 21).assign(is_0_5=0),
        custom_age_weights(2, 5).assign(is_0_5=1)
    ], ignore_index=True)
    df = df.merge(wgts, on=['is_0_5', 'age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].mul(df['age_group_weight_value'], axis=0)
    ], axis=1)
    # 27 labels the age-standardized aggregate
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    write_output(df, 'risk_exposure', location_id)
    return df
Beispiel #25
0
def allocate_residuals(usqzd, sqzd):
    """Allocate squeeze residuals to their residual-target MEs.

    The residual is ``max(unsqueezed - squeezed, 0)`` per draw, computed
    for rows whose unsqueezed ``resid_target_me`` is not null and summed by
    ``resid_target_me`` and ``age_group_id``. For every residual target,
    measure 5 'dismod' draws are pulled through gopher, the residual is
    added draw-by-draw (matched on ``age_group_id``) and the result is
    written to ``strOutDir/<target me>/<location>_<year>_<sex>.h5``.

    NOTE(review): ``location_id``, ``year`` and ``sex`` are not parameters;
    they are read from the enclosing (presumably module-level) scope and
    must be defined before this function is called.

    Arguments:
        usqzd: unsqueezed draws. Needs location_id, year_id, age_group_id,
            sex_id, me_id, resid_target_me and draw_0..draw_999.
        sqzd: squeezed draws with the same key columns, resid_target_me
            and draw columns.

    Returns:
        DataFrame with columns resid_target_me, age_group_id and
        draw_0..draw_999, holding the summed residuals.

    Raises:
        OSError: if an output directory cannot be created.
    """
    tmap = pd.read_excel(
            "strCodeDir/map_pre_pos_mes.xlsx")

    resids = usqzd.merge(
            sqzd,
            on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
            suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    dcols = ['draw_%s' % d for d in range(1000)]
    dscols = ['%s.sqzd' % c for c in dcols]
    ducols = ['%s.usqzd' % c for c in dcols]
    # Only positive differences are allocated; negatives are floored at 0.
    toalloc = (resids[ducols].values - resids[dscols].values).clip(min=0)
    resids = resids.join(pd.DataFrame(
        data=toalloc, index=resids.index, columns=dcols))
    resids = resids.rename(
            columns={'resid_target_me.usqzd': 'resid_target_me'})
    resids = resids.groupby(
            ['resid_target_me', 'age_group_id'])[dcols].sum().reset_index()

    base_cols = ['%s#base' % c for c in dcols]
    resid_cols = ['%s#resid' % c for c in dcols]
    datacols = [
            'location_id', 'year_id', 'age_group_id', 'sex_id',
            'measure_id']

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except (TypeError, ValueError, OverflowError):
            # No single integer target in the map; keep whatever squeeze()
            # returned (it still ends up in the message and path below).
            pass

        gbd_ids = {'modelable_entity_ids': [me_id]}
        t_df = gopher.draws(
                gbd_ids,
                'dismod',
                location_ids=location_id,
                year_ids=year,
                sex_ids=sex,
                measure_ids=5)
        t_df = t_df.merge(
                resid_df, on='age_group_id', suffixes=('#base', '#resid'))
        # Select by name so each base draw is added to its own residual
        # draw regardless of the column order gopher returned.
        newvals = t_df[base_cols].values + t_df[resid_cols].values
        t_df = t_df.join(pd.DataFrame(
            data=newvals, index=t_df.index, columns=dcols))

        # Single-argument call form prints the same text on Python 2 and 3.
        print('Writing residual %s to file' % t_meid)
        drawsdir = "strOutDir/%s" % t_meid
        fn = "%s/%s_%s_%s.h5" % (drawsdir, location_id, year, sex)
        try:
            os.makedirs(drawsdir)
        except OSError:
            # An already existing directory is fine; any other failure
            # would only resurface as a less clear error in to_hdf.
            if not os.path.isdir(drawsdir):
                raise
        t_df['location_id'] = int(float(location_id))
        t_df['year_id'] = int(float(year))
        t_df['sex_id'] = int(float(sex))
        t_df['measure_id'] = 5
        t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
        t_df[datacols+dcols].to_hdf(
                fn,
                'draws',
                mode='w',
                format='table',
                data_columns=datacols)

    return resids