Example 1
def add_china_aggregate(df):
    """Append china aggregate to the dataframe.

    Take the pop-weighted mean, by draw, of each china subnational.
    """

    # subset to just china
    chn = df.loc[df['ihme_loc_id'].str.startswith("CHN")]

    # merge with populations
    pops = qry.get_pops()
    pops = pops.query('sex_id==3 & age_group_id == 22')
    pops = pops[['location_id', 'year_id', 'mean_pop']]
    chn = chn.merge(pops, how='left')
    assert chn.mean_pop.notnull().values.all(), 'merge with pops failed'

    # calculate the pop-weighted average of each draw column
    # nested lambdas: the outer one receives each year's sub-frame x,
    # the inner one receives a draw column y and weights it by x['mean_pop'].
    # Afterwards 'chn' only has year_id & draws, so re-add ihme_loc_id
    # and location_id below.
    g = chn.groupby(['year_id'])
    chn = g.apply(lambda x: x[dw.DRAW_COLS].apply(lambda y: np.average(
        y, weights=x['mean_pop']))).reset_index()
    chn['ihme_loc_id'] = "CHN"
    chn['location_id'] = 6

    # add the national observation to df
    df = df.append(chn, ignore_index=True)
    return df
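
The nested groupby/apply above is terse, so here is a minimal, self-contained sketch of the same pop-weighted-mean pattern on toy data (toy column names, not the real dw.DRAW_COLS):

import numpy as np
import pandas as pd

# toy frame: two subnationals per year, two draw columns
toy = pd.DataFrame({
    'year_id':  [2000, 2000, 2001, 2001],
    'draw_0':   [0.10, 0.30, 0.20, 0.40],
    'draw_1':   [0.15, 0.25, 0.10, 0.50],
    'mean_pop': [100., 300., 100., 300.],
})
draw_cols = ['draw_0', 'draw_1']

# the outer lambda receives each year's sub-frame x; the inner lambda
# receives one draw column y and weights it by x['mean_pop']
out = toy.groupby('year_id').apply(
    lambda x: x[draw_cols].apply(
        lambda y: np.average(y, weights=x['mean_pop']))).reset_index()

# draw_0 for 2000 is (0.10 * 100 + 0.30 * 300) / 400 == 0.25
print(out)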
def age_standardize(df, indicator_type):
    """Make each draw in the dataframe a rate, then age standardize.
    """

    if indicator_type == 'como':
        group_cols = dw.COMO_GROUP_COLS
    elif indicator_type == 'codcorrect':
        group_cols = dw.CC_GROUP_COLS
    else:
        raise ValueError("bad type: {}".format(indicator_type))

    assert set(df.sex_id.unique()) == {3}, \
        'falsely assuming only both sexes included'
    assert set(df.metric_id.unique()) == {1}, \
        'falsely assuming df is all numbers'
    db_pops = qry.get_pops(both_sexes=True)
    db_pops = db_pops[[
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'population'
    ]]

    # do special things for the 30-70 causes
    # merge special age weights on these cause ids using is_30_70 indicator
    df['is_30_70'] = df.cause_id.apply(
        lambda x: 1 if x in dw.CC_THIRTY_SEVENTY_CAUSE_IDS else 0)

    # get age weights with is_30_70 special weights
    age_weights = custom_age_weights(0, 125)
    age_weights['is_30_70'] = 0
    age_weights_30_70 = custom_age_weights(30, 70)
    age_weights_30_70['is_30_70'] = 1
    age_weights = age_weights.append(age_weights_30_70, ignore_index=True)
    all_age = pd.DataFrame(
        {
            'age_group_id': 22,
            'age_group_weight_value': 1,
            'is_30_70': 0
        },
        index=[0])
    age_weights = age_weights.append(all_age, ignore_index=True)

    df = df.merge(db_pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops failed'
    df = df[df.population.notnull()]
    df = df.merge(age_weights, on=['age_group_id', 'is_30_70'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge with age weights failed'

    # concatenate the metadata with the draw cols times the pop
    # this multiplies each draw column by the population column
    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(
            lambda x: (x / df['population']) * df['age_group_weight_value'])
    ],
                   axis=1)

    # now a rate, age standardized
    df['metric_id'] = 3
    df.loc[df.age_group_id != 22, 'age_group_id'] = 27

    df = df.groupby(group_cols, as_index=False)[dw.DRAW_COLS].sum()
    return df
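
For reference, the per-draw arithmetic in age_standardize is: number / population = rate, rate * standard weight, then sum over age groups. A toy check with made-up numbers (weights sum to 1):

import pandas as pd

toy = pd.DataFrame({
    'age_group_id':           [10, 11],
    'deaths':                 [50., 200.],     # numbers (metric_id 1)
    'population':             [1000., 2000.],
    'age_group_weight_value': [0.6, 0.4],      # standard weights
})

toy['weighted_rate'] = (toy['deaths'] / toy['population']) \
    * toy['age_group_weight_value']

# age-standardized rate: 0.05 * 0.6 + 0.10 * 0.4 == 0.07
print(toy['weighted_rate'].sum())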
Example 3
def add_england_aggregate(df, locsdf):
    """Append england aggregate to the dataframe.

    Take the pop-weighted mean, by draw, of each china subnational.
    """

    # subset to just UTLAs
    utlas = (locsdf[locsdf['path_to_top_parent'].str.contains(',4749,')]
             .query('most_detailed == 1')['location_id'].values)
    eng = df[df['location_id'].isin(utlas)]

    # merge with populations
    pops = qry.get_pops(both_sexes=True)
    pops = pops.query('sex_id==3 & age_group_id == 22')
    eng = eng.merge(pops[['location_id', 'year_id', 'population']])
    assert eng.population.notnull().values.all(), 'merge with pops failed'

    # calculate the pop-weighted average of each draw column
    g = eng.groupby(['year_id'])
    eng = g.apply(lambda x: x[dw.DRAW_COLS].apply(
                    lambda y: np.average(y, weights=x['population'])
                )
    ).reset_index()
    eng['location_id'] = 4749

    # add the national observation to df
    df = df.append(eng, ignore_index=True)
    return df
Example 4
def load_age_disagg_births():

    df = qry.get_asfr()
    db_pops = qry.get_pops()
    df = df.merge(db_pops, how='left')
    assert df['population'].notnull().values.all(
    ), 'merge with population failed'

    df['asfr'] = df['asfr'] * df['population']
    df.drop('population', axis=1, inplace=True)
    df.rename(columns={'asfr': 'births'}, inplace=True)

    return df
def age_location_aggregate(past_future, group_cols, version):
    """multiply by population column and aggregate ages"""
    if past_future == 'past':
        path = '{dd}dismod/{v}'.format(dd=dw.INPUT_DATA_DIR, v=version)
        pops = qry.get_pops()
    elif past_future == 'future':
        path = '{dd}dismod/{v}'.format(dd=dw.FORECAST_DATA_DIR, v=version)
        pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    print("reading file")
    df = pd.read_feather(path + '/' + '1064_age_disagg.feather')

    df = df.merge(pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'

    print("aggregatings ages")
    df.loc[:, 'age_group_id'] = 202

    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)

    # sums the rows by sex
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()

    # return to appropriate metric
    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x / df['population']),
        df['population']
    ],
                   axis=1)

    df_global = df.copy(deep=True)

    print("outputting feather")
    df.drop('population', axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_feather('{p}/1064.feather'.format(p=path))

    # global
    df_global = aggregate_locations_to_global(df_global, group_cols=group_cols)
    df_global.to_feather('{p}/1064_global.feather'.format(p=path))

    return df
Example 6
def convert_to_rates(df):
    """Convert back to rates by merging on pop"""
    pops = qry.get_pops(both_sexes=True)
    df = df.merge(pops, how='inner')  # inner join drops rows with no matching population
    assert df.mean_pop.notnull().values.all(), 'pop merge failed'
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    draws = [col for col in df.columns if 'draw_' in col]
    df = pd.concat([
        df[id_cols],
        df[draws].apply(lambda x: x / df['mean_pop'])
    ], axis=1
    )
    df['metric_id'] = 3
    return df
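
This pd.concat pattern recurs throughout these examples: DataFrame.apply over the draw columns passes each column in as a Series, which divides element-wise (index-aligned) by the population Series. A sketch with toy names, alongside the equivalent vectorized .div form:

import pandas as pd

toy = pd.DataFrame({
    'location_id': [1, 2],
    'draw_0':      [10., 40.],
    'draw_1':      [20., 80.],
    'mean_pop':    [100., 400.],
})
draws = ['draw_0', 'draw_1']

# column-by-column, as in the examples above
rates = pd.concat([toy[['location_id']],
                   toy[draws].apply(lambda x: x / toy['mean_pop'])], axis=1)

# equivalent vectorized form
rates_div = toy[draws].div(toy['mean_pop'], axis=0)

print(rates)  # draw_0: 0.1, 0.1; draw_1: 0.2, 0.2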
Example 7
def process_uhc_intervention_draws(version=dw.COV_VERS):
    data_dir = dw.INPUT_DATA_DIR + 'covariate' + '/' + str(version)
    component_ids = dw.UHC_INTERVENTION_COMP_IDS

    # first rename components that don't need further prep
    rename_ids = [str(rid) for rid in component_ids if rid not in [206, 209]]

    for rid in rename_ids:
        if path.isfile(data_dir + '/' + rid + '.feather'):  # path/rename from os
            print('renaming ' + rid)
            rename(data_dir + '/' + rid + '.feather',
                   data_dir + '/' + rid + '_prepped' + '.feather')

    dfs = []
    # these are the interventions that require aggregation
    for component_id in [206, 209]:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        dfs.append(df)

    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)

    # merge populations
    db_pops = qry.get_pops()
    df = df.merge(db_pops, how='left')

    # age/sex aggregate
    df.loc[df.indicator_component_id == 209, 'sex_id'] = 3
    df.loc[df.indicator_component_id == 209, 'age_group_id'] = 29  # 15+
    df.loc[df.indicator_component_id == 206, 'age_group_id'] = 24  # 15-49
    df = agg.age_sex_aggregate(df,
                               group_cols=dw.COV_GROUP_COLS,
                               denominator='population')

    # output
    df = df[dw.COV_GROUP_COLS + dw.DRAW_COLS]

    for component_id in [206, 209]:
        print("outputting " + str(component_id))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(data_dir + '/' + str(component_id) + '_prepped' +
                         '.feather')
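
agg.age_sex_aggregate is never shown in these excerpts. Judging from its call sites (rate draws in, age_group_id/sex_id already overwritten with the target aggregate ids, a denominator column), a plausible sketch of what it might do, hypothetical rather than the real helper:

import pandas as pd

def age_sex_aggregate(df, group_cols, draw_cols, denominator='population'):
    """Hypothetical reimplementation for illustration only."""
    df = df.copy()
    # rates -> numbers
    for col in draw_cols:
        df[col] = df[col] * df[denominator]
    # sum numbers and the denominator within each aggregate group
    df = df.groupby(group_cols,
                    as_index=False)[draw_cols + [denominator]].sum()
    # numbers -> rates
    for col in draw_cols:
        df[col] = df[col] / df[denominator]
    return df.drop(denominator, axis=1)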
Example 8
def process_como_prev_draws(past_future, version=dw.COMO_VERS):

    if past_future == 'past':
        data_dir = dw.INPUT_DATA_DIR + 'como_prev' + '/' + str(
            version)  # just como_prev for now
    elif past_future == 'future':
        data_dir = dw.FORECAST_DATA_DIR + 'como_prev' + '/' + str(version)
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    db_pops = qry.get_pops()

    # nonfatal and nema + fatal
    component_ids = [
        125, 128, 131, 1433, 149, 152, 140, 143, 146, 104, 107, 110, 113, 116,
        119, 122, 134, 137
    ]
    for component_id in component_ids:
        print("pulling " + str(component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df['metric_id'] = 3
        df = df.merge(db_pops, how='left')

        # Keep sex-split
        df = df[df.sex_id != 3]  # temporary for goalkeepers diagnostics
        df_sex_split = df.copy(deep=True)

        # aggregate sexes
        df = agg.aggregate_sexes(df, dw.COMO_GROUP_COLS)

        # append sex-split data, then age standardize
        print('appending sex split data')
        df = df.append(df_sex_split, ignore_index=True)
        df = agg.age_standardize(df, dw.COMO_GROUP_COLS, 0, 125)

        df = df[dw.COMO_GROUP_COLS + dw.DRAW_COLS]

        print("outputting " + str(component_id))
        df.to_feather(data_dir + '/' + str(component_id) + '_as.feather')
Example 9
def collapse_sex(df):
    """Convert prevalence to cases"""
    pops = qry.get_pops(both_sexes=False)
    df = df.merge(pops, how = 'left', on = ['location_id','age_group_id','sex_id','year_id'])

    draws = [col for col in df.columns if 'draw_' in col]
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    # make sex 3 to collapse to both
    df['sex_id'] = 3
    # make metric id 1 to represent cases (will change when converted back to
    # rates)
    df['metric_id'] = 1
    # convert to cases by multiplying each draw by the population value
    df = pd.concat([df[id_cols],
                    df[draws].apply(lambda x: x * df['mean_pop'])
                    ], axis=1
                   )
    # sum sexes together
    df = df.groupby(id_cols, as_index=False)[draws].sum()
    return df
def sex_aggregate_components(group_cols, version):
    """multiply by population column and aggregate sexes"""
    path = '{dd}dismod/{v}'.format(dd=dw.INPUT_DATA_DIR, v=version)

    dfs = []
    # assumes a module-level component_ids list (not passed as an argument)
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather('{p}/{id}.feather'.format(p=path,
                                                       id=component_id))
        dfs.append(df)

    # concat dfs and merge population
    df = pd.concat(dfs, ignore_index=True)

    pops = qry.get_pops()
    df = df.merge(pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'

    print("aggregatings sexes")
    df = df[group_cols + dw.DRAW_COLS + ['population']]
    df['sex_id'] = 3

    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)

    # sums the rows by sex
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()

    # return to appropriate metric
    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)

    print("outputting feather")
    df.to_feather('{p}/1064_age_disagg.feather'.format(p=path))
def collapse_sex(df):
    """Convert prevalence to cases"""
    pops = qry.get_pops(both_sexes=False)
    df = df.merge(pops,
                  how='left',
                  on=['location_id', 'age_group_id', 'sex_id', 'year_id'])

    draws = [col for col in df.columns if 'draw_' in col]
    id_cols = dw.CHILD_OVERWEIGHT_GROUP_COLS
    # make sex 3 to collapse to both
    df['sex_id'] = 3
    # convert to cases by multiplying each draw by the population value
    df = pd.concat([
        df[id_cols], df[draws].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)
    # sum sexes together
    df = df.groupby(id_cols, as_index=False)[draws + ['population']].sum()
    # Turn back to proportion
    df = pd.concat(
        [df[id_cols], df[draws].apply(lambda x: x / df['population'])], axis=1)
    return df
Example 12
def process_burdenator_draws(past_future, version=dw.BURDENATOR_VERS):
    if past_future == 'past':
        index_cols = dw.RISK_BURDEN_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'risk_burden' + '/' + str(version)
        db_pops = qry.get_pops()
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'risk_burden' + '/' + str(version)
        db_pops = load_population()

    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    dfs = []

    component_ids = (dw.RISK_BURDEN_COMPONENT_IDS +
                     dw.RISK_BURDEN_DALY_COMPONENT_IDS)
    for component_id in component_ids:
        print("pulling " + str(component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)

    # aggregate to both sexes but keep sex-split data as well
    df = df.merge(db_pops, how='left')
    df_sex_split = df.copy(deep=True)
    df['sex_id'] = 3
    df = agg.age_sex_aggregate(df, group_cols=index_cols)
    df = pd.concat([df, df_sex_split], axis=0, ignore_index=True)

    # global
    df_global = agg.aggregate_locations_to_global(df,
                                                  index_cols,
                                                  age_standardized=True,
                                                  age_group_years_start=0,
                                                  age_group_years_end=125,
                                                  age_group_id=27)

    # age-standardize
    df = agg.age_standardize(df, index_cols, 0, 125)

    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]

    file_dict = dw.RB_INPUT_FILE_DICT
    for component_id in file_dict.keys():

        path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
        global_path = data_dir + '/' + str(
            file_dict[component_id]) + '_global' + '.feather'

        # sdg
        print('outputting ' + str(file_dict[component_id]))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)

        # global
        print('outputting ' + str(file_dict[component_id]) + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)

    return df
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull indidence rates, merging with population to make cases

    Gets all years, ages, and sexes for the location id as incidence rates
    from get_draws, and combines into all ages, both
    sexes cases.
    """
    db_pops = qry.get_pops()
    if measure_id == 6:
        causes = dw.COMO_INC_CAUSE_IDS
    elif measure_id == 5:
        causes = dw.COMO_PREV_CAUSE_IDS
    else:
        raise ValueError("bad measure_id: {}".format(measure_id))

    dfs = []
    if test:
        years = [2016]
    else:
        years = []
    for cause_id in causes:
        print("pulling {c}".format(c=cause_id))
        if test:
            df = get_draws(gbd_id_field='cause_id',
                           gbd_id=cause_id,
                           source='como',
                           version=dw.COMO_VERS,
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[1, 2],
                           measure_ids=[measure_id])
        else:
            df = interpolate(gbd_id_field='cause_id',
                             gbd_id=cause_id,
                             source='como',
                             version=dw.COMO_VERS,
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[1, 2],
                             measure_ids=[measure_id])

        # these pull in as rates
        df['metric_id'] = 3

        # make sure it looks like we expect
        assert set(df.age_group_id) == set(range(2, 21) + range(30, 33) + [235]), \
            'unexpected age group ids found'
        assert set(df.sex_id) == set([1, 2]), \
            'unexpected sex ids found'
        if not test:
            assert set(df.year_id) == set(range(1990, 2017)), \
                'unexpected year ids found'
        assert set(df.location_id) == set([location_id]), \
            'unexpected location ids found'

        # compile
        dfs.append(df[dw.COMO_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # merge with pops to transform to cases
    df = df.merge(db_pops, how='left')
    assert df.population.notnull().values.all(
    ), 'merge with populations failed'

    # concatenate the metadata with the draw cols times the pop
    # this multiplies each draw column by the population column
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)

    # now it's in numbers (this line is for readability)
    df['metric_id'] = 1

    # aggregate sexes
    df['sex_id'] = 3

    # collapse sexes together
    df = df.groupby(dw.COMO_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(0, 125)
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()

    df = df[dw.COMO_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'como', location_id)
    return df
def process_location_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized estimates of each rei_id.

    Arguments:
        location_id: the location_id to process

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []

    risks = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    if test:
        years = [2016]
    else:
        years = []
    for rei_id in risks:
        print("pulling {r}".format(r=rei_id))
        if test or rei_id == 166:
            df = get_draws(gbd_id_field='rei_id',
                           gbd_id=rei_id,
                           source='risk',
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[],
                           draw_type='exposure')
        elif not test and rei_id == 86:
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             measure_ids=19,
                             draw_type='exposure')
        else:
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             draw_type='exposure')

        # remove any other ages besides main gbd ages; parenthesized because
        # 'and' binds tighter than 'or' in the original query string
        df = df.query(
            '((age_group_id >= 2 & age_group_id <= 20) | '
            'age_group_id in [30, 31, 32, 235]) & sex_id in [1, 2]'
        )
        df = df.query('year_id >= 1990')

        if rei_id == 166:
            # only keep 10+ for smoking
            df = df.query('age_group_id >= 7')
            df = df.query('parameter=="cat1"')

        # set the rei_id because it isn't in the get_draws pull
        df['rei_id'] = rei_id

        # these are prevalence rates
        df['metric_id'] = 3
        if rei_id == 86:
            df['measure_id'] = 19
        else:
            df['measure_id'] = 5

        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)

    # for stunting and wasting (where we only have under-5),
    # keep only under-5 and aggregate ages
    df.loc[df['rei_id'].isin(dw.RISK_EXPOSURE_REI_IDS_MALN), 'age_group_id'] = 1

    # make all ages for PM 2.5
    df.loc[df['rei_id'] == 86, 'age_group_id'] = 22

    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population'])
    ],
                   axis=1)
    # programmatically redundant, but documents that these are now
    # prevalence cases
    df['metric_id'] = 1
    # now that it's in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()

    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(10, 125)  # FOR SMOKING ONLY
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    df = df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'risk_exposure', location_id)
    return df
def process_location_risk_burden_draws(location_id, test=False):
    '''Given a list of rei_ids, use gopher to get attributable burden
    draws and save to the out directory.
    '''

    dfs = []
    for rei_id in dw.RISK_BURDEN_REI_IDS + dw.RISK_BURDEN_DALY_REI_IDS:
        print(rei_id)
        if rei_id in dw.RISK_BURDEN_REI_IDS:
            measure_id = 1
        elif rei_id in dw.RISK_BURDEN_DALY_REI_IDS:
            measure_id = 2
        else:
            raise ValueError("no measure found")
        print('Getting draws')
        df = get_draws(gbd_id_field=['cause_id', 'rei_id'],
                       gbd_id=[294, rei_id],
                       source='burdenator',
                       version=dw.BURDENATOR_VERS,
                       location_ids=location_id,
                       year_ids=[],
                       age_group_ids=[],
                       sex_ids=[],
                       num_workers=3,
                       n_draws=1000,
                       resample=True)

        # keep years we want
        df = df.query('measure_id == {}'.format(measure_id))
        df = df.query('metric_id == 1')
        df = df.query('age_group_id in {} and sex_id in [1, 2]'.format(
            range(2, 21) + range(30, 33) + [235]))
        df = df.query('year_id in {}'.format(range(1990, 2011, 5) + [2016]))

        # aggregate to both sexes
        df['sex_id'] = 3
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        pops = qry.get_pops(both_sexes=True)
        df = df.merge(pops,
                      how='left',
                      on=['location_id', 'age_group_id', 'sex_id', 'year_id'])
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
        ],
                       axis=1)
        df['metric_id'] = 3

        # keep the right columns
        df = df[dw.RISK_BURDEN_GROUP_COLS + dw.DRAW_COLS]

        # interpolate years
        print('Interpolating')
        df = custom_interpolate(df)

        # age-standardize
        age_weights = qry.get_age_weights(4)
        df = df.merge(age_weights)
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
        ],
                       axis=1)
        df['age_group_id'] = 27
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        dfs.append(df)

    df = pd.concat(dfs)
    write_output(df, 'risk_burden', location_id)
    return df
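
custom_interpolate is likewise assumed rather than shown. Given the quinquennial years kept above (1990, 1995, ..., 2010, 2016), a plausible sketch that linearly interpolates each draw column to annual values within each id group (the id_cols/draw_cols arguments are hypothetical; the real helper takes only df):

import pandas as pd

def custom_interpolate(df, id_cols, draw_cols):
    """Hypothetical sketch: fill in missing year_ids by linear
    interpolation of each draw column (not the real helper)."""
    def fill_years(g):
        g = g.set_index('year_id').sort_index()
        g = g.reindex(range(int(g.index.min()), int(g.index.max()) + 1))
        return g[draw_cols].interpolate(method='index')
    return df.groupby(id_cols).apply(fill_years).reset_index()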
Example 16
def process_risk_exposure_draws(past_future, version=dw.RISK_EXPOSURE_VERS):

    if past_future == 'past':
        index_cols = dw.RISK_EXPOSURE_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'risk_exposure' + '/' + str(version)
        db_pops = qry.get_pops()
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'risk_exposure' + '/' + str(version)
        db_pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    component_ids = dw.RISK_EXPOSURE_COMPONENT_IDS

    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)

    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)

    # collapse sex/ages
    df = df.merge(db_pops, how='left')

    # set age-groups for aggregation now so it doesn't have to be done twice (for sex_split)
    df.loc[df.indicator_component_id.isin([35, 41]),
           'age_group_id'] = 1  # Malnutrition
    df.loc[df.indicator_component_id == 44,
           'age_group_id'] = 5  # Child Overweight

    # keep these for later
    df_sex_split = df[~df.indicator_component_id.isin([5, 227])]
    df_smoking = df[df.indicator_component_id == 227]
    df_smoking_sex_split = df_smoking.copy(deep=True)

    df = df[df.indicator_component_id != 227]  # remove smoking from main df

    # age/sex aggregate
    df['sex_id'] = 3  # changes everything but Mean PM2.5 which is already aggregated
    print("concatenating")  # concat sex-split data
    df = pd.concat([df, df_sex_split], ignore_index=True)
    df = agg.age_sex_aggregate(df,
                               group_cols=index_cols,
                               denominator='population')

    # sex aggregate smoking data before age-standardizing
    df_smoking['sex_id'] = 3
    df_smoking = agg.age_sex_aggregate(df_smoking,
                                       group_cols=index_cols,
                                       denominator='population')
    print("concatenating")
    df_smoking = pd.concat([df_smoking, df_smoking_sex_split], axis=0)

    # aggregate all but smoking to global
    df_global = agg.aggregate_locations_to_global(df, index_cols)

    # aggregate smoking to global and age-standardize global and non-global
    df_smoking_global = agg.aggregate_locations_to_global(
        df_smoking,
        index_cols,
        age_standardized=True,
        age_group_years_start=10,
        age_group_years_end=125,
        age_group_id=194)
    df_smoking = agg.age_standardize(df_smoking, index_cols, 10, 125, 194)

    # concat smoking
    df = pd.concat([df, df_smoking], axis=0)
    df_global = pd.concat([df_global, df_smoking_global], axis=0)

    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]

    file_dict = dw.RE_FILE_DICT
    for component_id in file_dict.keys():

        path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
        global_path = data_dir + '/' + str(
            file_dict[component_id]) + '_global' + '.feather'

        # sdg
        print('outputting ' + str(file_dict[component_id]))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)

        # global
        print('outputting ' + str(file_dict[component_id]) + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)

    return df_global
Example 17
    df['run_num'] = 'draw_' + (df['run_num'] - 1).astype(str)
    df = pd.pivot_table(df,
                        values='value',
                        index=['location_id', 'year_id', 'age_group_id', 'sex_id'],
                        columns='run_num')
    df = df.reset_index()
    return df
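
The fragment above reshapes long draw data ('run_num', 'value') into the wide draw_0..draw_999 layout used everywhere else. A self-contained toy run of the same pivot:

import pandas as pd

toy = pd.DataFrame({
    'location_id':  [1, 1],
    'year_id':      [2000, 2000],
    'age_group_id': [22, 22],
    'sex_id':       [3, 3],
    'run_num':      [1, 2],      # 1-indexed draws
    'value':        [0.1, 0.2],
})
toy['run_num'] = 'draw_' + (toy['run_num'] - 1).astype(str)
wide = pd.pivot_table(toy,
                      values='value',
                      index=['location_id', 'year_id', 'age_group_id',
                             'sex_id'],
                      columns='run_num').reset_index()
print(wide)  # id columns plus draw_0 and draw_1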

if __name__ == '__main__':
    # Supplementary datasets
    print('Collecting supplementary datasets')
    age_weights = qry.get_age_weights(4)
    age_weights.loc[age_weights.age_group_id.isin([30, 31, 32, 235]), 'age_group_id'] = 21
    age_weights = age_weights.groupby(['age_group_id'], as_index=False)['age_group_weight_value'].sum()

    gbd_popdf = qry.get_pops()
    gbd_popdf.loc[gbd_popdf.age_group_id.isin([30, 31, 32, 235]), 'age_group_id'] = 21
    gbd_popdf = gbd_popdf.groupby(['location_id', 'year_id', 'age_group_id', 'sex_id'], as_index=False)['population'].sum()

    wpp_popdf = pd.read_csv('FILEPATH/wpp2015_to2063.csv')
    wpp_popdf = wpp_popdf.loc[wpp_popdf.year_id >= 2016]
    wpp_popdf = wpp_popdf.rename(index=str, columns={'pop':'population'})

    locsdf = qry.get_sdg_reporting_locations()
    locsdf['L3_loc'] = [loc[3] for loc in locsdf.path_to_top_parent.str.split(',').tolist()]

    # Compile all countries
    print('Fetching location-specific datasets')
    pool = Pool(15)
    dfs = pool.map(load_location_file, locsdf['ihme_loc_id'].values)
    pool.close()
Example 18
def process_dismod_draws(past_future, version=dw.DISMOD_VERS):

    if past_future == 'past':
        index_cols = dw.DISMOD_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'dismod' + '/' + str(version)
        db_pops = qry.get_pops()
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'dismod' + '/' + str(version)
        db_pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    component_ids = [14, 17, 242, 245]  # no child sex abuse (pulled later)
    dfs = []

    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)

    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)

    df = df[index_cols + dw.DRAW_COLS]

    # COLLAPSE SEX/AGES
    df = df.merge(db_pops, how='left')
    df_sex_split = df[df.indicator_component_id.isin([14, 17])]
    df.loc[df['indicator_component_id'].isin([14, 17]),
           'sex_id'] = 3  # physical and sexual violence sex aggregation
    df = agg.age_sex_aggregate(df, group_cols=index_cols)
    df = pd.concat([df, df_sex_split], ignore_index=True)

    # AGE STANDARDIZE
    df_age_stand_all_age = df.loc[df['indicator_component_id'].isin([14, 17])]
    df_age_stand_15_plus = df.loc[df['indicator_component_id'].isin(
        [242, 245])]  # int partner and non-int partner violence

    # global
    df_aa_global = agg.aggregate_locations_to_global(df_age_stand_all_age,
                                                     index_cols,
                                                     age_standardized=True,
                                                     age_group_years_start=0,
                                                     age_group_years_end=125,
                                                     age_group_id=27)

    df_15_plus_global = agg.aggregate_locations_to_global(
        df_age_stand_15_plus,
        index_cols,
        age_standardized=True,
        age_group_years_start=15,
        age_group_years_end=125,
        age_group_id=29)

    # national/subnational
    df_age_stand_all_age = agg.age_standardize(df_age_stand_all_age,
                                               index_cols, 0, 125, 27)
    df_age_stand_15_plus = agg.age_standardize(df_age_stand_15_plus,
                                               index_cols, 15, 125, 29)

    # concat
    print("concatenating")
    df = pd.concat([df_age_stand_all_age, df_age_stand_15_plus],
                   ignore_index=True)
    df_global = pd.concat([df_aa_global, df_15_plus_global], ignore_index=True)

    # output
    df = df[index_cols + dw.DRAW_COLS]

    file_dict = dict(zip(component_ids, ['1094', '1095', '1047', '1098']))

    for component_id in file_dict.keys():

        path = data_dir + '/' + file_dict[component_id] + '.feather'
        global_path = data_dir + '/' + file_dict[
            component_id] + '_global' + '.feather'

        # sdg
        print('outputting ' + file_dict[component_id])
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)

        # global
        print('outputting ' + file_dict[component_id] + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)

    #############################################
    # child sex abuse
    # list.remove mutates in place and returns None, so do not reassign
    index_cols.remove('indicator_component_id')

    df_csa = pd.read_feather(data_dir + '/' + '1064_age_disagg.feather')
    df_csa = df_csa.merge(db_pops, how='left')

    # aggregation and output
    df_csa.loc[:, 'age_group_id'] = 202
    df_csa = agg.age_sex_aggregate(df_csa, group_cols=index_cols)

    df_csa_global = df_csa.copy(deep=True)

    df_csa = df_csa[index_cols + dw.DRAW_COLS]
    df_csa.reset_index(drop=True, inplace=True)
    print('outputting 1064')
    df_csa.to_feather(data_dir + '/' + '1064.feather')

    df_csa_global = agg.aggregate_locations_to_global(df_csa_global,
                                                      index_cols)
    df_csa_global = df_csa_global[index_cols + dw.DRAW_COLS]
    df_csa_global.reset_index(drop=True, inplace=True)
    print('outputting 1064 global')
    df_csa_global.to_feather(data_dir + '/' + '1064_global.feather')

    return df
Example 19
def process_codcorrect_draws(version=dw.CC_VERS):

    index_cols = dw.CC_GROUP_COLS
    component_ids = (dw.CC_ALL_AGE_COMPONENT_IDS +
                     dw.CC_THIRTY_SEVENTY_COMPONENT_IDS +
                     dw.CONF_DIS_COMPONENT_IDS)
    data_dir = dw.INPUT_DATA_DIR + 'codcorrect' + '/' + str(version)

    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)

    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)

    # convert to numbers
    db_pops = qry.get_pops()
    df = df.merge(db_pops, how='left')

    # keep sex split for certain indicators (ncds, road mort, poisoning, homicide)
    df_keep_sex_split = df.loc[df['indicator_component_id'].isin(
        dw.CC_THIRTY_SEVENTY_COMPONENT_IDS + dw.CC_ALL_AGE_COMPONENT_IDS
    ), :]  # these age_groups get standardized later

    # collapse sex (and age for conflict and disaster mort)
    df['sex_id'] = 3
    df.loc[df['indicator_component_id'].isin(dw.CONF_DIS_COMPONENT_IDS),
           'age_group_id'] = 22
    df = agg.age_sex_aggregate(df, group_cols=index_cols)

    # make sure it looks like we expect
    assert set(df.loc[df['cause_id'].isin(dw.CONF_DIS_CAUSES)].age_group_id) == set([22]), \
        'unexpected age group ids found'
    assert set(df.loc[~df['cause_id'].isin(dw.CONF_DIS_CAUSES)].age_group_id) == \
        set(range(2, 21) + range(30, 33) + [235]), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([3]), 'unexpected sex ids found'

    # concat sex-split data before age-standardizing
    df = pd.concat([df, df_keep_sex_split], axis=0)

    # prepare for age-standardization
    # all age-standardized except for conflict and disaster mort
    df_conf_dis = df.loc[df['indicator_component_id'].isin(
        dw.CONF_DIS_COMPONENT_IDS)]
    df_ncds = df.loc[df['indicator_component_id'].isin(
        dw.CC_THIRTY_SEVENTY_COMPONENT_IDS)]
    df_all_ages = df.loc[df['indicator_component_id'].isin(
        dw.CC_ALL_AGE_COMPONENT_IDS)]

    # global aggregation
    df_cd_global = agg.aggregate_locations_to_global(df_conf_dis, index_cols)

    df_ncds_global = agg.aggregate_locations_to_global(
        df_ncds,
        index_cols,
        age_standardized=True,
        age_group_years_start=30,
        age_group_years_end=70,
        age_group_id=214)

    df_aa_global = agg.aggregate_locations_to_global(df_all_ages,
                                                     index_cols,
                                                     age_standardized=True,
                                                     age_group_years_start=0,
                                                     age_group_years_end=125,
                                                     age_group_id=27)

    # age standardize
    df_ncds = agg.age_standardize(df_ncds, index_cols, 30, 70, 214)
    df_all_ages = agg.age_standardize(df_all_ages, index_cols, 0, 125, 27)

    # concat all
    print('concatenating')
    df = pd.concat([df_ncds, df_all_ages, df_conf_dis], axis=0)
    df_global = pd.concat([df_ncds_global, df_aa_global, df_cd_global], axis=0)

    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]

    file_dict = dw.CC_FILE_DICT
    for component_id in file_dict.keys():

        path = data_dir + '/' + file_dict[component_id] + '.feather'
        global_path = data_dir + '/' + file_dict[
            component_id] + '_global' + '.feather'

        # sdg
        print('outputting ' + file_dict[component_id])
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)

        # global
        print('outputting ' + file_dict[component_id] + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)
Example 20
    df = df.reset_index()
    return df[[
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ] + dw.DRAW_COLS]


# get locations
locsdf = qry.get_sdg_reporting_locations()

# read past
print('prepping past file...')
past_df = pd.read_csv(dw.ADOL_FERT_PAST_FILE)
past_df = past_df.loc[past_df.location_id.isin(locsdf.location_id.values)]

pop_df = qry.get_pops()
pop_df = pop_df.loc[(pop_df.age_group_id.isin([7, 8])) & \
                    (pop_df.sex_id == 2)]
past_df = agg_fertility(past_df, pop_df)

print('writing...')
try:
    if not os.path.exists(dw.ADOL_FERT_DIR):
        os.makedirs(dw.ADOL_FERT_DIR)
except OSError:
    pass
past_df.to_hdf("{d}/asfr_clean.h5".format(d=dw.ADOL_FERT_DIR),
               key="data",
               format="table",
               data_columns=['location_id', 'year_id'])
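
Because the file is written in table format with data_columns, it can later be filtered on disk instead of loaded whole, e.g. (a usage sketch):

subset = pd.read_hdf("{d}/asfr_clean.h5".format(d=dw.ADOL_FERT_DIR),
                     key="data",
                     where="location_id == 6 & year_id >= 2000")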
Example 21
import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# get locations
locsdf = qry.get_sdg_reporting_locations()

# get main dataset
df = pd.read_csv(dw.MET_NEED_FILE)
df = df.query('year_id >= 1990')
df = df.query('location_id in {}'.format(list(locsdf['location_id'])))
df['metric_id'] = 2
df['measure_id'] = 18

# get weights
if 'mod_contra' in dw.MET_NEED_VERS:
    agesdf = qry.get_pops()
    agesdf = agesdf.loc[agesdf.age_group_id.isin(df.age_group_id.unique())]
    agesdf['totpop'] = agesdf.groupby(
        ['location_id', 'year_id', 'sex_id'],
        as_index=False)['population'].transform('sum')
    agesdf['weights'] = agesdf['population'] / agesdf['totpop']
else:
    agesdf = pd.read_csv(dw.MET_NEED_WEIGHTS_FILE)
    agesdf = agesdf.query('location_id in {}'.format(
        list(locsdf['location_id'])))
    agesdf = agesdf.query('year_id >= 1990')
    agesdf['weights'] = agesdf[['weight_' + str(i)
                                for i in range(0, 1000)]].mean(axis=1)
agesdf = agesdf[[
    'location_id', 'year_id', 'age_group_id', 'sex_id', 'weights'
]]
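
The transform('sum') call above returns group totals aligned to the original rows, so the division yields each age group's share of its location-year-sex population. A toy run with made-up numbers:

import pandas as pd

toy = pd.DataFrame({
    'location_id':  [1, 1, 1, 1],
    'year_id':      [2000, 2000, 2001, 2001],
    'sex_id':       [2, 2, 2, 2],
    'age_group_id': [8, 9, 8, 9],
    'population':   [100., 300., 150., 350.],
})
toy['totpop'] = toy.groupby(
    ['location_id', 'year_id', 'sex_id'])['population'].transform('sum')
toy['weights'] = toy['population'] / toy['totpop']
print(toy[['year_id', 'age_group_id', 'weights']])
# 2000: 0.25 and 0.75; 2001: 0.30 and 0.70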
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull indidence rates, merging with population to make cases

    Using COMO because there are plans to make this store each year.

    Gets all years, ages, and sexes for the location id as incidence rates
    from transmogrifier's gopher library, and combines into all ages, both
    sexes cases.
    """
    db_pops = qry.get_pops()
    if measure_id == 6:
        gbd_ids = {'cause_ids': dw.COMO_INC_CAUSE_IDS}
    elif measure_id == 5:
        gbd_ids = {'cause_ids': dw.COMO_PREV_CAUSE_IDS}
    else:
        raise ValueError("bad measure_id: {}".format(measure_id))
    if test:
        years = [2015]
    else:
        years = []
    df = gopher.draws(gbd_ids,
                      'como',
                      measure_ids=[measure_id],
                      location_ids=[location_id],
                      year_ids=years,
                      age_group_ids=[],
                      sex_ids=[],
                      verbose=True,
                      num_workers=5,
                      version=dw.COMO_VERS)

    # make sure it looks like we expect
    assert set(df.age_group_id) == set(range(2, 22)), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([1, 2]), \
        'unexpected sex ids found'
    if not test:
        assert set(df.year_id) == set(range(1990, 2016, 5)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # these pull in as rates
    df['metric_id'] = 3

    # merge with pops to transform to cases
    df = df.merge(db_pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with populations failed'

    # concatenate the metadata with the draw cols times the pop
    # this multiplies each draw column by the mean_pop column
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['mean_pop'])
    ],
                   axis=1)

    # now it's in numbers (this line is for readability)
    df['metric_id'] = 1

    # aggregate sexes
    df['sex_id'] = 3

    # collapse sexes together
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()

    # age standardize
    df = age_standardize(df, 'como')

    write_output(df, 'como', location_id)
    return df
def process_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized estimates of each rei_id.

    1. Use gopher to pull data for each rei_id, for the location_id,
       and for all years.
    2. Keep appropriate categories for given rei_id
    3. Draws only come with male/female in rates -
        change to cases and make both sexes aggregate.
    4. Revert back to rates and age standardize using custom weights.

    Arguments:
        location_id: the location_id to process

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []

    version_df = pd.DataFrame()
    all_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    if test:
        years = [2015]
    else:
        years = []
    for rei_id in all_ids:
        print("pulling {r}".format(r=rei_id))
        df = gopher.draws({"rei_ids": [rei_id]},
                          source='risk',
                          draw_type='exposure',
                          location_ids=[location_id],
                          year_ids=years,
                          age_group_ids=[],
                          sex_ids=[1, 2],
                          num_workers=5)
        # remove any other ages besides gbd ages
        df = df.query('age_group_id >= 2 & age_group_id <= 21')
        # only reporting since 1990
        df = df.query('year_id>=1990')

        if rei_id == 167:
            # change IPV to just women
            df = df.query('sex_id == 2')

        if rei_id in dw.RISK_EXPOSURE_REI_IDS_MALN:
            # these are childhood stunting - cat1 + cat2 equals <-2 std dev
            df = df.query('parameter=="cat1" | parameter=="cat2"')
        else:
            # cat1 represents the prevalence in these cases (can't test this?)
            df = df.query('parameter=="cat1"')

        # set the rei_id because it isn't in the gopher pull
        df['rei_id'] = rei_id

        # keep track of what model versions where used
        version_df = version_df.append(
            df[['rei_id', 'modelable_entity_id',
                'model_version_id']].drop_duplicates(),
            ignore_index=True)

        # these are prevalence rates
        df['metric_id'] = 3
        df['measure_id'] = 5

        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # note the versions used by risk exposure vers (manufactured by me)
    version_df.to_csv(
        "/home/j/WORK/10_gbd/04_journals/"
        "gbd2015_capstone_lancet_SDG/02_inputs/"
        "risk_exposure_versions_{v}.csv".format(v=dw.RISK_EXPOSURE_VERS),
        index=False)

    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(qry.get_pops(), how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)

    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['mean_pop'])
    ],
                   axis=1)
    # programmatically redundant, but documents that these are now
    # prevalence cases
    df['metric_id'] = 1
    # now that it's in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()

    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(qry.get_pops(), how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['mean_pop'])
    ],
                   axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    df['is_0_5'] = df.rei_id.apply(
        lambda x: 1 if x in dw.RISK_EXPOSURE_REI_IDS_MALN else 0)
    wgts = custom_age_weights(2, 21)
    wgts['is_0_5'] = 0
    wgts_2 = custom_age_weights(2, 5)
    wgts_2['is_0_5'] = 1
    wgts = wgts.append(wgts_2, ignore_index=True)
    df = df.merge(wgts, on=['is_0_5', 'age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    write_output(df, 'risk_exposure', location_id)
    return df
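
custom_age_weights is never defined in these excerpts. From its call sites it plausibly subsets the standard GBD age weights to an age interval and renormalizes them to sum to 1. A sketch under that assumption (the age_group_years_start/end columns are likewise assumed):

def custom_age_weights(age_start, age_end):
    """Hypothetical sketch, not the real helper: restrict the standard
    age weights to the [age_start, age_end] year range and renormalize."""
    wgts = qry.get_age_weights(4)
    wgts = wgts.query(
        'age_group_years_start >= {s} & age_group_years_end <= {e}'.format(
            s=age_start, e=age_end)).copy()
    wgts['age_group_weight_value'] /= wgts['age_group_weight_value'].sum()
    return wgts[['age_group_id', 'age_group_weight_value']]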
Example 24
def process_covariate_draws(version=dw.COV_VERS):
    data_dir = dw.INPUT_DATA_DIR + 'covariate' + '/' + str(version)
    component_ids = dw.NON_UHC_COV_COMPONENT_IDS

    dfs = []
    for component_id in component_ids:  # read in all components
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        dfs.append(df)

    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)

    # merge populations
    db_pops = qry.get_pops()

    db_pops_adol_birth = db_pops[db_pops.age_group_id.isin(
        [7, 8])].copy()  # create adol birth age group
    db_pops_adol_birth['age_group_id'] = 162
    pop_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    db_pops_adol_birth = db_pops_adol_birth.groupby(
        pop_cols, as_index=False)['population'].sum()
    db_pops = db_pops.append(db_pops_adol_birth, ignore_index=True)
    df = df.merge(db_pops, how='left')

    # aggregate met need
    df_met_need_15_plus = df[df.indicator_component_id == 179].copy()
    df_met_need_15_24 = df_met_need_15_plus[
        df_met_need_15_plus.age_group_id.isin([8, 9])].copy()
    df_met_need_15_plus['age_group_id'] = 24
    df_met_need_15_24['age_group_id'] = 149
    df_met_need = df_met_need_15_plus.append(df_met_need_15_24,
                                             ignore_index=True)
    df_met_need = agg.age_sex_aggregate(df_met_need,
                                        group_cols=dw.COV_GROUP_COLS,
                                        denominator='population')

    df = df[df.indicator_component_id != 179]
    df = df.append(df_met_need, ignore_index=True)

    # global aggregation (hrh components are aggregated later)
    df_global = agg.aggregate_locations_to_global(
        df[~df.indicator_component_id.isin([1457, 1460, 1463, 1556])],
        dw.COV_GROUP_COLS,
        denominator='population')

    # output
    df = df[dw.COV_GROUP_COLS + dw.DRAW_COLS]
    df_global = df_global[dw.COV_GROUP_COLS + dw.DRAW_COLS]

    file_dict = dw.COV_FILE_DICT
    for component_id in file_dict.keys():
        if file_dict[component_id] == component_id:
            path = data_dir + '/' + str(component_id) + '_prepped' + '.feather'
            global_path = data_dir + '/' + str(
                component_id) + '_global' + '.feather'
        else:
            path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
            global_path = data_dir + '/' + str(
                file_dict[component_id]) + '_global' + '.feather'

        print('outputting ' + str(file_dict[component_id]))
        df_id = df[(df.indicator_component_id == component_id)
                   & (df.age_group_id != 149)]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)

        if component_id not in [1457, 1460, 1463, 1556]:  # save global dfs
            print('outputting ' + str(file_dict[component_id]) + ' global')
            df_id_global = df_global[df_global.indicator_component_id ==
                                     component_id]
            df_id_global.reset_index(drop=True, inplace=True)
            df_id_global.to_feather(global_path)