def add_china_aggregate(df):
    """Append a China national aggregate to the dataframe.

    Takes the population-weighted mean, by draw, of each China
    subnational row and appends it as a single national "CHN" row.

    Args:
        df: dataframe containing 'ihme_loc_id', 'year_id' and the draw
            columns in dw.DRAW_COLS.

    Returns:
        df with the national China observations appended.
    """
    # subset to just china subnationals
    # FIX: DataFrame.ix is deprecated (removed in modern pandas); .loc
    # with a boolean mask is the equivalent supported form.
    chn = df.loc[df['ihme_loc_id'].str.startswith("CHN")]
    # merge with populations (both sexes, all ages)
    pops = qry.get_pops()
    pops = pops.query('sex_id==3 & age_group_id == 22')
    pops = pops[['location_id', 'year_id', 'mean_pop']]
    chn = chn.merge(pops, how='left')
    assert chn.mean_pop.notnull().values.all(), 'merge with pops failed'
    # calculate the pop-weighted average of each draw column.
    # lambda within a lambda: x is the per-year dataframe, y is each of
    # its draw columns, weighted by that year's subnational populations.
    # 'chn' only has year & draws afterwards so add ihme_loc_id, location_id
    g = chn.groupby(['year_id'])
    chn = g.apply(lambda x: x[dw.DRAW_COLS].apply(lambda y: np.average(
        y, weights=x['mean_pop']))).reset_index()
    chn['ihme_loc_id'] = "CHN"
    chn['location_id'] = 6
    # add the national observation to df
    df = df.append(chn, ignore_index=True)
    return df
def age_standardize(df, indicator_type):
    """Make each draw in the dataframe a rate, then age standardize.

    Args:
        df: both-sex (sex_id 3) draws in number space (metric_id 1);
            must have a 'cause_id' column.
        indicator_type: 'como' or 'codcorrect'; selects which group
            columns to collapse on.

    Returns:
        age-standardized rates (metric_id 3); standardized rows get
        age_group_id 27, all-age rows (22) pass through unchanged.

    Raises:
        ValueError: if indicator_type is not recognized.
    """
    if indicator_type == 'como':
        group_cols = dw.COMO_GROUP_COLS
    elif indicator_type == 'codcorrect':
        group_cols = dw.CC_GROUP_COLS
    else:
        raise ValueError("bad type: {}".format(indicator_type))
    assert set(df.sex_id.unique()) == {3}, \
        'falsely assuming only both sexes included'
    assert set(df.metric_id.unique()) == {1}, \
        'falsely assuming df is all numbers'
    db_pops = qry.get_pops(both_sexes=True)
    db_pops = db_pops[[
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'population'
    ]]
    # do special things for the 30-70 causes
    # merge special age weights on these cause ids using is_30_70 indicator
    df['is_30_70'] = df.cause_id.apply(
        lambda x: 1 if x in dw.CC_THIRTY_SEVENTY_CAUSE_IDS else 0)
    # get age weights with is_30_70 special weights
    age_weights = custom_age_weights(0, 125)
    age_weights['is_30_70'] = 0
    age_weights_30_70 = custom_age_weights(30, 70)
    age_weights_30_70['is_30_70'] = 1
    age_weights = age_weights.append(age_weights_30_70, ignore_index=True)
    # all-age rows (age_group_id 22) get weight 1 so they pass through
    # the standardization unchanged
    all_age = pd.DataFrame(
        {
            'age_group_id': 22,
            'age_group_weight_value': 1,
            'is_30_70': 0
        }, index=[0])
    age_weights = age_weights.append(all_age, ignore_index=True)
    df = df.merge(db_pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops failed'
    df = df[df.population.notnull()]
    df = df.merge(age_weights, on=['age_group_id', 'is_30_70'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), 'age weights merg'
    # concatenate the metadata with the transformed draw cols:
    # divide each draw by population (number -> rate) and apply the
    # age-group weight in one pass
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(
            lambda x: (x / df['population']) * df['age_group_weight_value'])
    ],
                   axis=1)
    # now a rate, age standardized
    df['metric_id'] = 3
    df.loc[df.age_group_id != 22, 'age_group_id'] = 27
    # summing the weighted rates over ages completes the standardization
    df = df.groupby(group_cols, as_index=False)[dw.DRAW_COLS].sum()
    return df
def add_england_aggregate(df, locsdf): """Append england aggregate to the dataframe. Take the pop-weighted mean, by draw, of each china subnational. """ # subset to just UTLAs utlas = locsdf[locsdf['path_to_top_parent'].str.contains(',4749,')].query('most_detailed == 1')['location_id'].values eng = df[df['location_id'].isin(utlas)] # merge with populations pops = qry.get_pops(both_sexes=True) pops = pops.query('sex_id==3 & age_group_id == 22') eng = eng.merge(pops[['location_id', 'year_id', 'population']]) assert eng.population.notnull().values.all(), 'merge with pops failed' # calculate the pop-weighted average of each draw column g = eng.groupby(['year_id']) eng = g.apply(lambda x: x[dw.DRAW_COLS].apply( lambda y: np.average(y, weights=x['population']) ) ).reset_index() eng['location_id'] = 4749 # add the national observation to df df = df.append(eng, ignore_index=True) return df
def load_age_disagg_births():
    """Return age-disaggregated birth counts.

    Pulls age-specific fertility rates, multiplies them by population to
    get births, and renames the 'asfr' column to 'births'.
    """
    births = qry.get_asfr().merge(qry.get_pops(), how='left')
    assert births['population'].notnull().values.all(), \
        'merge with population failed'
    # rate * population -> count
    births['asfr'] = births['asfr'] * births['population']
    births.drop('population', axis=1, inplace=True)
    births.rename(columns={'asfr': 'births'}, inplace=True)
    return births
def age_location_aggregate(past_future, group_cols, version):
    """multiply by population column and aggregate ages

    Reads the age-disaggregated 1064 component, collapses all ages into
    the custom aggregate age group 202, converts back to rate space, and
    writes both the location-level and global feathers.

    Args:
        past_future: 'past' or 'future'; selects input dir and pops.
        group_cols: id columns to collapse on.
        version: version subdirectory under the dismod input dir.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    """
    if past_future == 'past':
        path = '{dd}dismod/{v}'.format(dd=dw.INPUT_DATA_DIR, v=version)
        pops = qry.get_pops()
    elif past_future == 'future':
        path = '{dd}dismod/{v}'.format(dd=dw.FORECAST_DATA_DIR, v=version)
        pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    print("reading file")
    df = pd.read_feather(path + '/' + '1064_age_disagg.feather')
    df = df.merge(pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    print("aggregatings ages")
    # collapse every age into the custom aggregate age group 202
    df.loc[:, 'age_group_id'] = 202
    # rate -> count: multiply each draw by population
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)
    # sums the rows by sex
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()
    # return to appropriate metric (count -> rate)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population']),
        df['population']
    ],
                   axis=1)
    # keep a copy (with population) for the global aggregation
    df_global = df.copy(deep=True)
    print("outputting feather")
    df.drop('population', axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_feather('{p}/1064.feather'.format(p=path))
    # global
    df_global = aggregate_locations_to_global(df_global, group_cols=group_cols)
    df_global.to_feather('{p}/1064_global.feather'.format(p=path))
    return df
def convert_to_rates(df):
    """Convert case counts back to rates by merging on population.

    Inner-joins population onto df, divides every draw column by
    'mean_pop', and marks the result as rate space (metric_id 3).
    """
    df = df.merge(qry.get_pops(both_sexes=True), how='inner')
    assert df.mean_pop.notnull().values.all(), 'pop merge failed'
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    draw_cols = [c for c in df.columns if 'draw_' in c]
    # divide each draw by population, keeping only the id columns
    rates = df[draw_cols].apply(lambda col: col / df['mean_pop'])
    df = pd.concat([df[id_cols], rates], axis=1)
    df['metric_id'] = 3
    return df
def process_uhc_intervention_draws(version=dw.COV_VERS):
    """Prep UHC intervention covariate draws.

    Components other than 206 and 209 are already final and are simply
    renamed to '<id>_prepped.feather'. Components 206 and 209 are
    age/sex aggregated first, then written out with the same suffix.

    Args:
        version: covariate version subdirectory to read from/write to.
    """
    data_dir = dw.INPUT_DATA_DIR + 'covariate' + '/' + str(version)
    component_ids = dw.UHC_INTERVENTION_COMP_IDS
    # first rename components that don't need further prep
    rename_ids = [str(rid) for rid in component_ids if rid not in [206, 209]]
    for rid in rename_ids:
        if path.isfile(data_dir + '/' + rid + '.feather'):
            print('renaming' + rid)
            rename(data_dir + '/' + rid + '.feather',
                   data_dir + '/' + rid + '_prepped' + '.feather')
    dfs = []
    for component_id in [
            206, 209
    ]:  # these are the interventions that require aggregation
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        dfs.append(df)
    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)
    # merge populations
    db_pops = qry.get_pops()
    df = df.merge(db_pops, how='left')
    # age/sex aggregate: set the target aggregate ids before collapsing
    df.loc[df.indicator_component_id == 209, 'sex_id'] = 3
    df.loc[df.indicator_component_id == 209, 'age_group_id'] = 29  # 15+
    df.loc[df.indicator_component_id == 206, 'age_group_id'] = 24  # 15-49
    df = agg.age_sex_aggregate(df,
                               group_cols=dw.COV_GROUP_COLS,
                               denominator='population')
    # output
    df = df[dw.COV_GROUP_COLS + dw.DRAW_COLS]
    for component_id in [206, 209]:
        print("outputting " + str(component_id))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(data_dir + '/' + str(component_id) + '_prepped' +
                         '.feather')
def process_como_prev_draws(past_future, version=dw.COMO_VERS):
    """Age-standardize COMO prevalence draws, one component at a time.

    For each nonfatal component: aggregates the sex-split draws to both
    sexes (keeping the sex-split rows alongside for goalkeepers
    diagnostics), age-standardizes, and writes '<id>_as.feather'.

    Args:
        past_future: 'past' or 'future'; selects the data directory.
        version: como version subdirectory.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    """
    if past_future == 'past':
        data_dir = dw.INPUT_DATA_DIR + 'como_prev' + '/' + str(
            version)  # just como_prev for now
    elif past_future == 'future':
        data_dir = dw.FORECAST_DATA_DIR + 'como_prev' + '/' + str(version)
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    db_pops = qry.get_pops()
    dfs = []
    # nonfatal and nema + fatal
    component_ids = [
        125, 128, 131, 1433, 149, 152, 140, 143, 146, 104, 107, 110, 113, 116,
        119, 122, 134, 137
    ]
    for component_id in component_ids:
        print("pulling " + str(component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df['metric_id'] = 3
        df = df.merge(db_pops, how='left')
        # Keep sex-split
        df = df[df.sex_id != 3]
        # temporary for goalkeepers diagnostics
        df_sex_split = df.copy(deep=True)
        # aggregate sexes
        df = agg.aggregate_sexes(df, dw.COMO_GROUP_COLS)
        # age standardize
        print('appending sex split data')
        df = df.append(df_sex_split, ignore_index=True)
        df = agg.age_standardize(df, dw.COMO_GROUP_COLS, 0, 125)
        df = df[dw.COMO_GROUP_COLS + dw.DRAW_COLS]
        print("outputting " + str(component_id))
        df.to_feather(data_dir + '/' + str(component_id) + '_as.feather')
def collapse_sex(df):
    """Collapse sexes to both-sex totals by converting rates to cases.

    Multiplies each draw by 'mean_pop' (rate -> cases, metric_id 1),
    sets sex_id to 3, and sums over the id columns. A later step
    converts back to rates.
    """
    pops = qry.get_pops(both_sexes=False)
    df = df.merge(pops, how = 'left',
                  on = ['location_id','age_group_id','sex_id','year_id'])
    draws = [col for col in df.columns if 'draw_' in col]
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    # make sex 3 to collapse to both
    df['sex_id'] = 3
    # make metric id 1 to represent cases (will change when converted back to
    # rates)
    df['metric_id'] = 1
    # convert to cases by multiplying each draw by the population value
    df = pd.concat([df[id_cols],
                    df[draws].apply(lambda x: x * df['mean_pop'])
                    ],
                   axis=1
                   )
    # sum sexes together
    df = df.groupby(id_cols, as_index=False)[draws].sum()
    return df
def sex_aggregate_components(group_cols, version):
    """multiply by population column and aggregate sexes

    Pulls each component feather, collapses to both sexes via
    count-space aggregation, converts back to rates, and writes the
    age-disaggregated 1064 feather.

    NOTE(review): 'component_ids' is not defined in this function or its
    parameters -- presumably a module-level global; confirm before
    calling this in isolation.
    """
    path = '{dd}dismod/{v}'.format(dd=dw.INPUT_DATA_DIR, v=version)
    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather('{p}/{id}.feather'.format(p=path,
                                                       id=component_id))
        dfs.append(df)
    # concat dfs and merge population
    df = pd.concat(dfs, ignore_index=True)
    pops = qry.get_pops()
    df = df.merge(pops, how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    print("aggregatings sexes")
    df = df[group_cols + dw.DRAW_COLS + ['population']]
    df['sex_id'] = 3
    # rate -> count: multiply each draw by population
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)
    # sums the rows by sex
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()
    # return to appropriate metric (count -> rate)
    df = pd.concat([
        df[group_cols],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    print("outputting feather")
    df.to_feather('{p}/1064_age_disagg.feather'.format(p=path))
def collapse_sex(df):
    """Collapse sexes to both-sex proportions.

    Converts proportions to cases (draw * population), sums over the id
    columns with sex_id forced to 3, then divides by the summed
    population to return to proportion space.

    NOTE(review): this redefines collapse_sex (an earlier definition
    exists in this module using 'mean_pop' and different group cols);
    only this later definition is live at import time -- confirm which
    is intended.
    """
    pops = qry.get_pops(both_sexes=False)
    df = df.merge(pops,
                  how='left',
                  on=['location_id', 'age_group_id', 'sex_id', 'year_id'])
    draws = [col for col in df.columns if 'draw_' in col]
    id_cols = dw.CHILD_OVERWEIGHT_GROUP_COLS
    # make sex 3 to collapse to both
    df['sex_id'] = 3
    # convert to cases by multiplying each draw by the population value
    df = pd.concat([
        df[id_cols],
        df[draws].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)
    # sum sexes together
    df = df.groupby(id_cols, as_index=False)[draws + ['population']].sum()
    # Turn back to proportion
    df = pd.concat(
        [df[id_cols],
         df[draws].apply(lambda x: x / df['population'])],
        axis=1)
    return df
def process_burdenator_draws(past_future, version=dw.BURDENATOR_VERS):
    """Prep burdenator risk-burden draws.

    Concatenates all risk-burden components, aggregates to both sexes
    (keeping sex-split rows alongside), builds an age-standardized
    global aggregate, age-standardizes the location-level data, and
    writes per-component sdg and global feathers.

    Args:
        past_future: 'past' or 'future'; selects index cols, input dir
            and population source.
        version: burdenator version subdirectory.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    """
    if past_future == 'past':
        index_cols = dw.RISK_BURDEN_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'risk_burden' + '/' + str(version)
        db_pops = qry.get_pops()
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'risk_burden' + '/' + str(version)
        db_pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    dfs = []
    component_ids = dw.RISK_BURDEN_COMPONENT_IDS + \
        dw.RISK_BURDEN_DALY_COMPONENT_IDS
    for component_id in component_ids:
        print("pulling " + str(component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)
    # aggregate to both sexes but keep sex-split data as well
    df = df.merge(db_pops, how='left')
    df_sex_split = df.copy(deep=True)
    df['sex_id'] = 3
    df = agg.age_sex_aggregate(df, group_cols=index_cols)
    df = pd.concat([df, df_sex_split], axis=0, ignore_index=True)
    # global
    df_global = agg.aggregate_locations_to_global(df,
                                                  index_cols,
                                                  age_standardized=True,
                                                  age_group_years_start=0,
                                                  age_group_years_end=125,
                                                  age_group_id=27)
    # age-standardize
    df = agg.age_standardize(df, index_cols, 0, 125)
    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]
    file_dict = dw.RB_INPUT_FILE_DICT
    for component_id in file_dict.keys():
        path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
        global_path = data_dir + '/' + str(
            file_dict[component_id]) + '_global' + '.feather'
        # sdg
        print('outputting ' + str(file_dict[component_id]))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)
        # global
        print('outputting ' + str(file_dict[component_id]) + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)
    return df
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull incidence rates, merging with population to make cases

    Gets all years, ages, and sexes for the location id as incidence rates
    from get_draws, and combines into all ages, both sexes cases.

    Args:
        location_id: the location to process.
        measure_id: 6 (incidence) or 5 (prevalence).
        test: if True, pull only 2016 via get_draws instead of the full
            interpolated 1990-2016 series.

    Raises:
        ValueError: if measure_id is not 5 or 6.
    """
    db_pops = qry.get_pops()
    if measure_id == 6:
        causes = dw.COMO_INC_CAUSE_IDS
    elif measure_id == 5:
        causes = dw.COMO_PREV_CAUSE_IDS
    else:
        raise ValueError("bad measure_id: {}".format(measure_id))
    dfs = []
    if test:
        years = [2016]
    else:
        years = []
    for cause_id in causes:
        print("pulling {c}".format(c=cause_id))
        if test:
            df = get_draws(gbd_id_field='cause_id',
                           gbd_id=cause_id,
                           source='como',
                           version=dw.COMO_VERS,
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[1, 2],
                           measure_ids=[measure_id])
        else:
            df = interpolate(gbd_id_field='cause_id',
                             gbd_id=cause_id,
                             source='como',
                             version=dw.COMO_VERS,
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[1, 2],
                             measure_ids=[measure_id])
        # these pull in as rates
        df['metric_id'] = 3
        # make sure it looks like we expect
        # NOTE: range(...) + range(...) is Python 2 list concatenation;
        # this module is Python 2 only as written
        assert set(df.age_group_id) == set(range(2, 21) + range(30, 33) + [235]), \
            'unexpected age group ids found'
        assert set(df.sex_id) == set([1, 2]), \
            'unexpected sex ids found'
        if not test:
            assert set(df.year_id) == set(range(1990, 2017)), \
                'unexpected year ids found'
        assert set(df.location_id) == set([location_id]), \
            'unexpected location ids found'
        # compile
        dfs.append(df[dw.COMO_GROUP_COLS + dw.DRAW_COLS])
    df = pd.concat(dfs, ignore_index=True)
    # merge with pops to transform to cases
    df = df.merge(db_pops, how='left')
    assert df.population.notnull().values.all(
    ), 'merge with populations failed'
    # concatenate the metadata with the draw cols times the pop
    # this multiplies each draw column by the population column
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population']),
        df['population']
    ],
                   axis=1)
    # now its numbers (this line is for readability)
    df['metric_id'] = 1
    # aggregate sexes
    df['sex_id'] = 3
    # collapse sexes together
    df = df.groupby(dw.COMO_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS + ['population']].sum()
    # back to rate space: divide by the summed population
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3
    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(0, 125)
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.COMO_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()
    df = df[dw.COMO_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'como', location_id)
    return df
def process_location_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized estimates of each rei_id.

    Arguments:
        location_id: the location_id to process
        test: if True, pull only 2016 via get_draws instead of the
            interpolated 1990-2016 series.

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []
    # version_df = pd.DataFrame()
    risks = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    if test:
        years = [2016]
    else:
        years = []
    for rei_id in risks:
        print("pulling {r}".format(r=rei_id))
        # smoking (166) is never interpolated; PM2.5 (86) needs an
        # explicit measure_id when interpolating
        if test or rei_id == 166:
            df = get_draws(gbd_id_field='rei_id',
                           gbd_id=rei_id,
                           source='risk',
                           location_ids=[location_id],
                           year_ids=years,
                           age_group_ids=[],
                           sex_ids=[],
                           draw_type='exposure')
        elif not test and rei_id == 86:
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             measure_ids=19,
                             draw_type='exposure')
        else:
            df = interpolate(gbd_id_field='rei_id',
                             gbd_id=rei_id,
                             source='risk',
                             reporting_year_start=1990,
                             reporting_year_end=2016,
                             location_ids=[location_id],
                             age_group_ids=[],
                             sex_ids=[],
                             draw_type='exposure')
        # remove any other ages besides main gbd ages
        df = df.query(
            '(age_group_id >= 2 & age_group_id <= 20) or age_group_id in [30, 31, 32, 235] and sex_id in [1, 2]'
        )
        df = df.query('year_id >= 1990')
        if rei_id == 166:
            # only keep 10+ for smoking
            df = df.query('age_group_id >= 7')
        df = df.query('parameter=="cat1"')
        # set the rei_id because it isnt in the get_draws pull
        df['rei_id'] = rei_id
        # these are prevalence rates
        df['metric_id'] = 3
        if rei_id == 86:
            df['measure_id'] = 19
        else:
            df['measure_id'] = 5
        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])
    df = pd.concat(dfs, ignore_index=True)
    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing
    # (167 stays female-only: sex_id 2)
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)
    # for stunting and wasting (where we only have under-5), keep only
    # under-5 and aggregate ages
    # NOTE(review): .ix is deprecated (removed in modern pandas); should
    # migrate to .loc when this module moves off Python 2
    df.ix[df['rei_id'].isin(dw.RISK_EXPOSURE_REI_IDS_MALN),
          'age_group_id'] = 1
    # make all ages for PM 2.5
    df.ix[df['rei_id'] == 86, 'age_group_id'] = 22
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['population'])
    ],
                   axis=1)
    # so unnecessary programmatically but good for documentation -
    # these are now prev cases
    df['metric_id'] = 1
    # now that its in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()
    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(qry.get_pops(), how='left')
    assert df.population.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
    ],
                   axis=1)
    df['metric_id'] = 3
    # AGE STANDARDIZE
    print("age standardizing")
    wgts = custom_age_weights(10, 125)  # FOR SMOKING ONLY
    df = df.merge(wgts, on=['age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ],
                   axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()
    df = df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS]
    write_output(df, 'risk_exposure', location_id)
    return df
def process_location_risk_burden_draws(location_id, test=False):
    '''
    Given a list of rei_ids, use gopher to get attributable burden draws and
    save to out directory.

    Each risk is pulled as deaths (measure 1) or DALYs (measure 2),
    sex-aggregated, converted to rates, interpolated across years, then
    age-standardized.

    Args:
        location_id: the location to process.
        test: accepted for interface parity; not used in the body.

    Raises:
        ValueError: if a rei_id is in neither measure list.
    '''
    dfs = []
    for rei_id in dw.RISK_BURDEN_REI_IDS + dw.RISK_BURDEN_DALY_REI_IDS:
        print(rei_id)
        if rei_id in dw.RISK_BURDEN_REI_IDS:
            measure_id = 1
        elif rei_id in dw.RISK_BURDEN_DALY_REI_IDS:
            measure_id = 2
        else:
            raise ValueError("no measure found")
        print('Getting draws')
        df = get_draws(gbd_id_field=['cause_id', 'rei_id'],
                       gbd_id=[294, rei_id],
                       source='burdenator',
                       version=dw.BURDENATOR_VERS,
                       location_ids=location_id,
                       year_ids=[],
                       age_group_ids=[],
                       sex_ids=[],
                       num_workers=3,
                       n_draws=1000,
                       resample=True)
        # keep years we want
        df = df.query('measure_id == {}'.format(measure_id))
        df = df.query('metric_id == 1')
        # NOTE: Python 2 list concatenation of ranges
        df = df.query('age_group_id in {} and sex_id in [1, 2]'.format(
            range(2, 21) + range(30, 33) + [235]))
        df = df.query('year_id in {}'.format(range(1990, 2011, 5) + [2016]))
        # aggregate to both sexes
        df['sex_id'] = 3
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        # counts -> rates
        pops = qry.get_pops(both_sexes=True)
        df = df.merge(pops,
                      how='left',
                      on=['location_id', 'age_group_id', 'sex_id', 'year_id'])
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x / df['population'])
        ],
                       axis=1)
        df['metric_id'] = 3
        # keep the right columns
        df = df[dw.RISK_BURDEN_GROUP_COLS + dw.DRAW_COLS]
        # interpolate years
        print('Interpolating')
        df = custom_interpolate(df)
        # age-standardize
        age_weights = qry.get_age_weights(4)
        df = df.merge(age_weights)
        df = pd.concat([
            df[dw.RISK_BURDEN_GROUP_COLS],
            df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
        ],
                       axis=1)
        df['age_group_id'] = 27
        df = df.groupby(dw.RISK_BURDEN_GROUP_COLS,
                        as_index=False)[dw.DRAW_COLS].sum()
        dfs.append(df)
    df = pd.concat(dfs)
    write_output(df, 'risk_burden', location_id)
    return df
def process_risk_exposure_draws(past_future, version=dw.RISK_EXPOSURE_VERS):
    """Prep risk-exposure draws: age/sex aggregation, standardization, output.

    Smoking (component 227) is handled separately: sex-aggregated and
    age-standardized over ages 10+. Malnutrition (35, 41) and child
    overweight (44) get custom age groups before aggregation. Writes
    per-component sdg and global feathers.

    Args:
        past_future: 'past' or 'future'; selects index cols, input dir
            and population source.
        version: risk-exposure version subdirectory.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    """
    if past_future == 'past':
        index_cols = dw.RISK_EXPOSURE_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'risk_exposure' + '/' + str(version)
        db_pops = qry.get_pops()
    # CONSISTENCY FIX: was a second independent 'if' with no else, so a
    # bad argument fell through to a NameError on data_dir; now raises
    # like the other process_* functions.
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'risk_exposure' + '/' + str(version)
        db_pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    component_ids = dw.RISK_EXPOSURE_COMPONENT_IDS
    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)
    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)
    # collapse sex/ages
    df = df.merge(db_pops, how='left')
    # set age-groups for aggregation now so it doesn't have to be done
    # twice (for sex_split)
    df.loc[df.indicator_component_id.isin([35, 41]),
           'age_group_id'] = 1  # Malnutrition
    df.loc[df.indicator_component_id == 44,
           'age_group_id'] = 5  # Child Overweight
    # keep these for later
    df_sex_split = df[~df.indicator_component_id.isin([5, 227])]
    df_smoking = df[df.indicator_component_id == 227]
    df_smoking_sex_split = df_smoking.copy(deep=True)
    df = df[df.indicator_component_id != 227]  # remove smoking from main df
    # age/sex aggregate
    df['sex_id'] = 3  # changes everything but Mean PM2.5 (already aggregated)
    print("concatenating")
    # concat sex-split data
    df = pd.concat([df, df_sex_split], ignore_index=True)
    df = agg.age_sex_aggregate(df,
                               group_cols=index_cols,
                               denominator='population')
    # sex aggregate smoking data before age-standardizing
    df_smoking['sex_id'] = 3
    # NOTE(review): this second aggregate re-runs over the already
    # aggregated df; kept to preserve existing behavior, but it looks
    # like it was meant to operate on df_smoking -- confirm.
    df = agg.age_sex_aggregate(df,
                               group_cols=index_cols,
                               denominator='population')
    print("concatenating")
    df_smoking = pd.concat([df_smoking, df_smoking_sex_split], axis=0)
    # aggregate all but smoking to global
    df_global = agg.aggregate_locations_to_global(df, index_cols)
    # aggregate smoking to global and age-standardize global and non-global
    df_smoking_global = agg.aggregate_locations_to_global(
        df_smoking,
        index_cols,
        age_standardized=True,
        age_group_years_start=10,
        age_group_years_end=125,
        age_group_id=194)
    # df_smoking_global['units'] = 'sdg'
    df_smoking = agg.age_standardize(df_smoking, index_cols, 10, 125, 194)
    # concat smoking
    df = pd.concat([df, df_smoking], axis=0)
    df_global = pd.concat([df_global, df_smoking_global], axis=0)
    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]
    file_dict = dw.RE_FILE_DICT
    for component_id in file_dict.keys():
        path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
        global_path = data_dir + '/' + str(
            file_dict[component_id]) + '_global' + '.feather'
        # sdg
        print('outputting ' + str(file_dict[component_id]))
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)
        # global
        print('outputting ' + str(file_dict[component_id]) + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)
    return df_global
df['run_num'] = 'draw_' + (df['run_num'] - 1).astype(str) df = pd.pivot_table(df, values='value', index=['location_id', 'year_id', 'age_group_id', 'sex_id'], columns='run_num') df = df.reset_index() return df if __name__ == '__main__': # Supplementary datasets print('Collecting supplementary datasets') age_weights = qry.get_age_weights(4) age_weights.loc[age_weights.age_group_id.isin([30, 31, 32, 235]), 'age_group_id'] = 21 age_weights = age_weights.groupby(['age_group_id'], as_index=False)['age_group_weight_value'].sum() gbd_popdf = qry.get_pops() gbd_popdf.loc[gbd_popdf.age_group_id.isin([30, 31, 32, 235]), 'age_group_id'] = 21 gbd_popdf = gbd_popdf.groupby(['location_id', 'year_id', 'age_group_id', 'sex_id'], as_index=False)['population'].sum() wpp_popdf = pd.read_csv('FILEPATH/wpp2015_to2063.csv') wpp_popdf = wpp_popdf.loc[wpp_popdf.year_id >= 2016] wpp_popdf = wpp_popdf.rename(index=str, columns={'pop':'population'}) locsdf = qry.get_sdg_reporting_locations() locsdf['L3_loc'] = [loc[3] for loc in locsdf.path_to_top_parent.str.split(',').tolist()] # Compile all countries print('Fetching location-specific datasets') pool = Pool(15) dfs = pool.map(load_location_file, locsdf['ihme_loc_id'].values) pool.close()
def process_dismod_draws(past_future, version=dw.DISMOD_VERS):
    """Prep dismod draws: sex/age aggregation, standardization, output.

    Components 14/17 (physical & sexual violence) are sex-aggregated and
    age-standardized over all ages; 242/245 (intimate & non-intimate
    partner violence) over ages 15+. Child sex abuse (1064) is handled
    separately at the end. Writes per-component sdg and global feathers.

    Args:
        past_future: 'past' or 'future'; selects index cols, input dir
            and population source.
        version: dismod version subdirectory.

    Returns:
        the child-sex-abuse (1064) dataframe.

    Raises:
        ValueError: if past_future is not 'past' or 'future'.
    """
    if past_future == 'past':
        index_cols = dw.DISMOD_GROUP_COLS
        data_dir = dw.INPUT_DATA_DIR + 'dismod' + '/' + str(version)
        db_pops = qry.get_pops()
    elif past_future == 'future':
        index_cols = ['indicator_component_id'] + INDEX_COLS_FUTURE
        data_dir = dw.FORECAST_DATA_DIR + 'dismod' + '/' + str(version)
        db_pops = load_population()
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')
    component_ids = [14, 17, 242, 245]  # no child sex abuse (pulled later)
    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)
    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)
    df = df[index_cols + dw.DRAW_COLS]
    # COLLAPSE SEX/AGES
    df = df.merge(db_pops, how='left')
    df_sex_split = df[df.indicator_component_id.isin([14, 17])]
    df.loc[df['indicator_component_id'].isin([14, 17]),
           'sex_id'] = 3  # physical and sexual violence sex aggregation
    df = agg.age_sex_aggregate(df, group_cols=index_cols)
    df = pd.concat([df, df_sex_split], ignore_index=True)
    # AGE STANDARDIZE
    df_age_stand_all_age = df.loc[df['indicator_component_id'].isin([14, 17])]
    df_age_stand_15_plus = df.loc[df['indicator_component_id'].isin(
        [242, 245])]  # int partner and non-int partner violence
    # global
    df_aa_global = agg.aggregate_locations_to_global(df_age_stand_all_age,
                                                     index_cols,
                                                     age_standardized=True,
                                                     age_group_years_start=0,
                                                     age_group_years_end=125,
                                                     age_group_id=27)
    df_15_plus_global = agg.aggregate_locations_to_global(
        df_age_stand_15_plus,
        index_cols,
        age_standardized=True,
        age_group_years_start=15,
        age_group_years_end=125,
        age_group_id=29)
    # national/subnational
    df_age_stand_all_age = agg.age_standardize(df_age_stand_all_age,
                                               index_cols, 0, 125, 27)
    df_age_stand_15_plus = agg.age_standardize(df_age_stand_15_plus,
                                               index_cols, 15, 125, 29)
    # concat
    print("concatenating")
    df = pd.concat([df_age_stand_all_age, df_age_stand_15_plus],
                   ignore_index=True)
    df_global = pd.concat([df_aa_global, df_15_plus_global],
                          ignore_index=True)
    # output
    df = df[index_cols + dw.DRAW_COLS]
    # NOTE(review): '1047' breaks the 109x pattern -- possibly a typo
    # for '1097'; confirm against the output file naming convention.
    file_dict = dict(zip(component_ids, ['1094', '1095', '1047', '1098']))
    for component_id in file_dict.keys():
        path = data_dir + '/' + file_dict[component_id] + '.feather'
        global_path = data_dir + '/' + file_dict[
            component_id] + '_global' + '.feather'
        # sdg
        print('outputting ' + file_dict[component_id])
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)
        # global
        print('outputting ' + file_dict[component_id] + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)
    #############################################
    # child sex abuse
    # BUG FIX: the original did `index_cols = index_cols.remove(...)`;
    # list.remove() mutates in place and returns None, so index_cols
    # became None and the first `index_cols + dw.DRAW_COLS` below raised
    # a TypeError. Filter instead (also safe when the column is absent).
    index_cols = [c for c in index_cols if c != 'indicator_component_id']
    df_csa = pd.read_feather(data_dir + '/' + '1064_age_disagg.feather')
    df_csa = df_csa.merge(db_pops, how='left')
    # aggregation and output
    df_csa.loc[:, 'age_group_id'] = 202
    df_csa = agg.age_sex_aggregate(df_csa, group_cols=index_cols)
    df_csa_global = df_csa.copy(deep=True)
    df_csa = df_csa[index_cols + dw.DRAW_COLS]
    df_csa.reset_index(drop=True, inplace=True)
    print('outputting 1064')
    df_csa.to_feather(data_dir + '/' + '1064.feather')
    df_csa_global = agg.aggregate_locations_to_global(df_csa_global,
                                                      index_cols)
    df_csa_global = df_csa_global[index_cols + dw.DRAW_COLS]
    df_csa_global.reset_index(drop=True, inplace=True)
    print('outputting 1064 global')
    df_csa_global.to_feather(data_dir + '/' + '1064_global.feather')
    return df
def process_codcorrect_draws(version=dw.CC_VERS):
    """Prep codcorrect mortality draws per component.

    Sex-aggregates all components (age-aggregating conflict/disaster
    mortality to all ages), keeps sex-split rows for the NCD and
    all-age components, builds global aggregates, age-standardizes the
    NCD (30-70) and all-age components, and writes per-component sdg
    and global feathers.

    Args:
        version: codcorrect version subdirectory.
    """
    index_cols = dw.CC_GROUP_COLS
    component_ids = dw.CC_ALL_AGE_COMPONENT_IDS + \
        dw.CC_THIRTY_SEVENTY_COMPONENT_IDS + dw.CONF_DIS_COMPONENT_IDS
    data_dir = dw.INPUT_DATA_DIR + 'codcorrect' + '/' + str(version)
    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        df.loc[:, 'indicator_component_id'] = component_id
        dfs.append(df)
    print("concatenating")
    df = pd.concat(dfs, ignore_index=True)
    # convert to numbers
    db_pops = qry.get_pops()
    df = df.merge(db_pops, how='left')
    # keep sex split for certain indicators (ncds, road mort, poisoning,
    # homicide)
    df_keep_sex_split = df.loc[df['indicator_component_id'].isin(
        dw.CC_THIRTY_SEVENTY_COMPONENT_IDS + dw.CC_ALL_AGE_COMPONENT_IDS
    ), :]  # these age_groups get standardized later
    # collapse sex (and age for conlict and distaster mort)
    df['sex_id'] = 3
    df.loc[df['indicator_component_id'].isin(dw.CONF_DIS_COMPONENT_IDS),
           'age_group_id'] = 22
    df = agg.age_sex_aggregate(df, group_cols=index_cols)
    # make sure it looks like we expect
    assert set(df.loc[df['cause_id'].isin(dw.CONF_DIS_CAUSES)].age_group_id) == set([22]), \
        'unexpected age group ids found'
    # NOTE: range(...) + range(...) is Python 2 list concatenation
    assert set(df.loc[~df['cause_id'].isin(dw.CONF_DIS_CAUSES)].age_group_id) == \
        set(range(2, 21) + range(30, 33) + [235]), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([3]), 'unexpected sex ids found'
    # concat sex-split data before age-standardizing
    df = pd.concat([df, df_keep_sex_split], axis=0)
    # prepare for age-standardization
    # all age-standardized except for conflict and disaster mort
    df_conf_dis = df.loc[df['indicator_component_id'].isin(
        dw.CONF_DIS_COMPONENT_IDS)]
    df_ncds = df.loc[df['indicator_component_id'].isin(
        dw.CC_THIRTY_SEVENTY_COMPONENT_IDS)]
    df_all_ages = df.loc[df['indicator_component_id'].isin(
        dw.CC_ALL_AGE_COMPONENT_IDS)]
    # global aggregation
    df_cd_global = agg.aggregate_locations_to_global(df_conf_dis, index_cols)
    df_ncds_global = agg.aggregate_locations_to_global(
        df_ncds,
        index_cols,
        age_standardized=True,
        age_group_years_start=30,
        age_group_years_end=70,
        age_group_id=214)
    df_aa_global = agg.aggregate_locations_to_global(df_all_ages,
                                                     index_cols,
                                                     age_standardized=True,
                                                     age_group_years_start=0,
                                                     age_group_years_end=125,
                                                     age_group_id=27)
    # age standardize
    df_ncds = agg.age_standardize(df_ncds, index_cols, 30, 70, 214)
    df_all_ages = agg.age_standardize(df_all_ages, index_cols, 0, 125, 27)
    # concat all
    print('concatenating')
    df = pd.concat([df_ncds, df_all_ages, df_conf_dis], axis=0)
    df_global = pd.concat([df_ncds_global, df_aa_global, df_cd_global],
                          axis=0)
    # output
    df = df[index_cols + dw.DRAW_COLS]
    df_global = df_global[index_cols + dw.DRAW_COLS]
    file_dict = dw.CC_FILE_DICT
    for component_id in file_dict.keys():
        path = data_dir + '/' + file_dict[component_id] + '.feather'
        global_path = data_dir + '/' + file_dict[
            component_id] + '_global' + '.feather'
        # sdg
        print('outputting ' + file_dict[component_id])
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(path)
        # global
        print('outputting ' + file_dict[component_id] + ' global')
        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(global_path)
df = df.reset_index() return df[[ 'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id', 'metric_id' ] + dw.DRAW_COLS] # get locations locsdf = qry.get_sdg_reporting_locations() # read past print 'prepping past file...' past_df = pd.read_csv(dw.ADOL_FERT_PAST_FILE) past_df = past_df.loc[past_df.location_id.isin(locsdf.location_id.values)] pop_df = qry.get_pops() pop_df = pop_df.loc[(pop_df.age_group_id.isin([7, 8])) & \ (pop_df.sex_id == 2)] past_df = agg_fertility(past_df, pop_df) print 'writing...' try: if not os.path.exists(dw.ADOL_FERT_DIR): os.makedirs(dw.ADOL_FERT_DIR) except OSError: pass past_df.to_hdf("{d}/asfr_clean.h5".format(d=dw.ADOL_FERT_DIR), key="data", format="table", data_columns=['location_id', 'year_id'])
import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# ---- script: prep met-need-for-contraception draws ----

# SDG reporting locations, used to filter both datasets below
locsdf = qry.get_sdg_reporting_locations()

# main dataset: met-need draws, restricted to 1990+ and reporting locations
df = pd.read_csv(dw.MET_NEED_FILE)
df = df.query('year_id >= 1990')
df = df.query('location_id in {}'.format(list(locsdf['location_id'])))
# tag metric/measure ids for downstream processing
df['metric_id'] = 2
df['measure_id'] = 18

# age weights: population-share weights when running the 'mod_contra'
# version, otherwise file-based draw weights averaged across the 1000 draws
if 'mod_contra' in dw.MET_NEED_VERS:
    agesdf = qry.get_pops()
    agesdf = agesdf.loc[agesdf.age_group_id.isin(df.age_group_id.unique())]
    # total population within each location/year/sex, broadcast per row
    agesdf['totpop'] = agesdf.groupby(
        ['location_id', 'year_id', 'sex_id'],
        as_index=False)['population'].transform('sum')
    # each age group's share of its location/year/sex total
    agesdf['weights'] = agesdf['population'] / agesdf['totpop']
else:
    agesdf = pd.read_csv(dw.MET_NEED_WEIGHTS_FILE)
    agesdf = agesdf.query('location_id in {}'.format(
        list(locsdf['location_id'])))
    agesdf = agesdf.query('year_id >= 1990')
    # collapse the weight_0..weight_999 draw columns to their mean
    agesdf['weights'] = agesdf[['weight_' + str(i)
                                for i in range(0, 1000)]].mean(axis=1)
# keep only the identifier columns plus the final weight
agesdf = agesdf[[
    'location_id', 'year_id', 'age_group_id', 'sex_id', 'weights'
]]
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull COMO rates for one location and build both-sex case counts.

    Draws for every year, age, and sex are pulled from transmogrifier's
    gopher library as rates, converted to case counts with populations,
    collapsed to both sexes, then age standardized and written to disk.

    Arguments:
        location_id: the location to process
        measure_id: 6 (incidence causes) or 5 (prevalence causes)
        test: if True, restrict the pull to 2015 only

    Returns:
        age standardized dataframe of draws for the location
    """
    pops = qry.get_pops()
    # dispatch the measure to its cause list; anything else is an error
    causes_by_measure = {
        6: dw.COMO_INC_CAUSE_IDS,
        5: dw.COMO_PREV_CAUSE_IDS,
    }
    if measure_id not in causes_by_measure:
        raise ValueError("bad measure_id: {}".format(measure_id))
    gbd_ids = {'cause_ids': causes_by_measure[measure_id]}
    # empty year list means "all years" to gopher
    years = [2015] if test else []
    df = gopher.draws(gbd_ids,
                      'como',
                      measure_ids=[measure_id],
                      location_ids=[location_id],
                      year_ids=years,
                      age_group_ids=[],
                      sex_ids=[],
                      verbose=True,
                      num_workers=5,
                      version=dw.COMO_VERS)
    # sanity-check the pull before transforming anything
    assert set(df.age_group_id) == set(range(2, 22)), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([1, 2]), \
        'unexpected sex ids found'
    if not test:
        assert set(df.year_id) == set(range(1990, 2016, 5)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'
    # draws arrive as rates
    df['metric_id'] = 3
    # attach populations so rates can become case counts
    df = df.merge(pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with populations failed'
    # multiply every draw column by population: rates -> cases
    cases = df[dw.DRAW_COLS].apply(lambda col: col * df['mean_pop'])
    df = pd.concat([df[dw.COMO_GROUP_COLS], cases], axis=1)
    # now numbers, not rates
    df['metric_id'] = 1
    # collapse sexes by relabeling both to 3 and summing
    df['sex_id'] = 3
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()
    # age standardize, persist, and hand back
    df = age_standardize(df, 'como')
    write_output(df, 'como', location_id)
    return df
def process_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized exposure estimates for each rei_id.

    1. Use gopher to pull data for each rei_id for the location_id
       and all years (2015 only when ``test`` is True).
    2. Keep appropriate exposure categories for the given rei_id.
    3. Draws only come as male/female rates - change to cases and make a
       both-sexes aggregate (IPV, rei_id 167, stays female-only).
    4. Revert back to rates and age standardize using custom weights
       (ages 2-21 generally; ages 2-5 for the malnutrition rei_ids).

    Side effects: writes the model versions used to a CSV on /home/j and
    writes the final draws via write_output.

    Arguments:
        location_id: the location_id to process
        test: if True, pull only year 2015

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []
    version_df = pd.DataFrame()
    # union of the standard and malnutrition risk-exposure rei_ids
    all_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    # empty year list means "all years" to gopher
    if test:
        years = [2015]
    else:
        years = []
    for rei_id in all_ids:
        print("pulling {r}".format(r=rei_id))
        df = gopher.draws({"rei_ids": [rei_id]},
                          source='risk',
                          draw_type='exposure',
                          location_ids=[location_id],
                          year_ids=years,
                          age_group_ids=[],
                          sex_ids=[1, 2],
                          num_workers=5)
        # remove any other ages besides gbd ages
        df = df.query('age_group_id >= 2 & age_group_id <= 21')
        # only reporting since 1990
        df = df.query('year_id>=1990')
        if rei_id == 167:
            # change IPV to just women
            df = df.query('sex_id == 2')
        if rei_id in dw.RISK_EXPOSURE_REI_IDS_MALN:
            # these are childhood stunting - cat1 + cat2 equals <-2 std dev
            df = df.query('parameter=="cat1" | parameter=="cat2"')
        else:
            # cat1 represents the prevalence in these cases (can't test this?)
            df = df.query('parameter=="cat1"')
        # set the rei_id because it isnt in the gopher pull
        df['rei_id'] = rei_id
        # keep track of what model versions were used for the audit CSV
        version_df = version_df.append(
            df[['rei_id', 'modelable_entity_id',
                'model_version_id']].drop_duplicates(),
            ignore_index=True)
        # these are prevalence rates
        df['metric_id'] = 3
        df['measure_id'] = 5
        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])
    df = pd.concat(dfs, ignore_index=True)
    # note the versions used by risk exposure vers (manufactured by me)
    version_df.to_csv(
        "/home/j/WORK/10_gbd/04_journals/"
        "gbd2015_capstone_lancet_SDG/02_inputs/"
        "risk_exposure_versions_{v}.csv".format(v=dw.RISK_EXPOSURE_VERS),
        index=False)
    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(qry.get_pops(), how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing (IPV stays female-only)
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)
    # multiply each draw column by population: rates -> cases
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['mean_pop'])
    ], axis=1)
    # so unnecessary programmatically but good for documentation -
    # these are now prev cases
    df['metric_id'] = 1
    # now that its in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()
    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(qry.get_pops(), how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    # divide each draw column by population: cases -> rates
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['mean_pop'])
    ], axis=1)
    df['metric_id'] = 3
    # AGE STANDARDIZE
    print("age standardizing")
    # malnutrition rei_ids get the under-5 custom weights (ages 2-5);
    # everything else gets the full 2-21 age-group weights
    df['is_0_5'] = df.rei_id.apply(
        lambda x: 1 if x in dw.RISK_EXPOSURE_REI_IDS_MALN else 0)
    wgts = custom_age_weights(2, 21)
    wgts['is_0_5'] = 0
    wgts_2 = custom_age_weights(2, 5)
    wgts_2['is_0_5'] = 1
    wgts = wgts.append(wgts_2, ignore_index=True)
    df = df.merge(wgts, on=['is_0_5', 'age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    # weight each draw, relabel to the age-standardized group, and sum
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ], axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()
    write_output(df, 'risk_exposure', location_id)
    return df
def process_covariate_draws(version=dw.COV_VERS):
    """Prep covariate-based indicator draws and write them as feathers.

    Reads every non-UHC covariate component's draw file for ``version``,
    merges populations (adding a combined adolescent-birth age group 162
    built from age groups 7 and 8), aggregates met need (component 179)
    into 15-plus (age group 24) and 15-24 (age group 149) groups,
    produces a global location aggregate, and writes one feather per
    component (plus a ``_global`` feather for non-HRH components).

    Arguments:
        version: covariate input version; defaults to dw.COV_VERS

    Side effects: writes feather files under the version's data dir.
    """
    data_dir = dw.INPUT_DATA_DIR + 'covariate' + '/' + str(version)
    component_ids = dw.NON_UHC_COV_COMPONENT_IDS
    # read in all components
    dfs = []
    for component_id in component_ids:
        print("pulling {c}".format(c=component_id))
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')
        dfs.append(df)
    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)

    # merge populations; .copy() so the age-group relabel below does not
    # write through to db_pops via a chained-assignment view
    db_pops = qry.get_pops()
    db_pops_adol_birth = db_pops[db_pops.age_group_id.isin([7, 8])].copy()
    # create adol birth age group
    db_pops_adol_birth['age_group_id'] = 162
    pop_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    db_pops_adol_birth = db_pops_adol_birth.groupby(
        pop_cols, as_index=False)['population'].sum()
    db_pops = db_pops.append(db_pops_adol_birth, ignore_index=True)
    df = df.merge(db_pops, how='left')

    # aggregate met need: component 179 becomes 15-plus (24) and,
    # for age groups 8-9, also 15-24 (149); copies taken before the
    # relabels so the two frames stay independent
    df_met_need_15_plus = df[df.indicator_component_id == 179].copy()
    df_met_need_15_24 = df_met_need_15_plus[
        df_met_need_15_plus.age_group_id.isin([8, 9])].copy()
    df_met_need_15_plus['age_group_id'] = 24
    df_met_need_15_24['age_group_id'] = 149
    df_met_need = df_met_need_15_plus.append(df_met_need_15_24,
                                             ignore_index=True)
    df_met_need = agg.age_sex_aggregate(df_met_need,
                                        group_cols=dw.COV_GROUP_COLS,
                                        denominator='population')
    df = df[df.indicator_component_id != 179]
    df = df.append(df_met_need, ignore_index=True)

    # global aggregation
    # BUG FIX: the original filtered df_global *before* creating it
    # (NameError); aggregate first, then drop the HRH components
    df_global = agg.aggregate_locations_to_global(df, dw.COV_GROUP_COLS,
                                                  denominator='population')
    df_global = df_global[~df_global.indicator_component_id.isin(
        [1457, 1460, 1463, 1556])]  # hrh aggregated later

    # output: keep only id columns plus draws
    df = df[dw.COV_GROUP_COLS + dw.DRAW_COLS]
    df_global = df_global[dw.COV_GROUP_COLS + dw.DRAW_COLS]
    file_dict = dw.COV_FILE_DICT
    for component_id in file_dict.keys():
        # components mapped to themselves get a '_prepped' suffix;
        # named components use their mapped file name
        if file_dict[component_id] == component_id:
            path = data_dir + '/' + str(component_id) + '_prepped' + '.feather'
            global_path = data_dir + '/' + str(
                component_id) + '_global' + '.feather'
        else:
            path = data_dir + '/' + str(file_dict[component_id]) + '.feather'
            global_path = data_dir + '/' + str(
                file_dict[component_id]) + '_global' + '.feather'
        print('outputting ' + str(file_dict[component_id]))
        # drop the 15-24 met-need rows (age group 149) from the main file
        df_id = df[(df.indicator_component_id == component_id)
                   & (df.age_group_id != 149)].reset_index(drop=True)
        df_id.to_feather(path)
        if component_id not in [1457, 1460, 1463, 1556]:
            # save global dfs (HRH components have no global file here)
            print('outputting ' + str(file_dict[component_id]) + ' global')
            df_id_global = df_global[
                df_global.indicator_component_id ==
                component_id].reset_index(drop=True)
            df_id_global.to_feather(global_path)