def collect_all_processed_draws(indicator_type):
    """Append together all the processed draws and write output per cause id.

    Reads every per-draw HDF file under the indicator's temp directory,
    concatenates them per gbd_id, and writes one HDF per gbd_id to the
    versioned input-data directory.

    Args:
        indicator_type: one of 'dalynator', 'como_prev', 'como_inc',
            'risk_exposure'; selects the id set, group columns, temp
            directory and version to collect.

    Raises:
        ValueError: if indicator_type is not one of the recognized values.
    """
    if indicator_type == 'dalynator':
        gbd_ids = set(dw.DALY_ALL_AGE_CAUSE_IDS).union(
            set(dw.DALY_THIRTY_SEVENTY_CAUSE_IDS))
        group_cols = dw.DALY_GROUP_COLS
        temp_dir = dw.DALY_TEMP_OUT_DIR
        version_id = dw.DALY_VERS
    elif indicator_type in ['como_prev', 'como_inc']:
        if indicator_type == 'como_inc':
            gbd_ids = set(dw.COMO_INC_CAUSE_IDS)
        else:
            gbd_ids = set(dw.COMO_PREV_CAUSE_IDS)
        group_cols = dw.COMO_GROUP_COLS
        temp_dir = dw.COMO_TEMP_OUT_DIR
        version_id = dw.COMO_VERS
    elif indicator_type == 'risk_exposure':
        gbd_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
            set(dw.RISK_EXPOSURE_REI_IDS_MALN))
        group_cols = dw.RISK_EXPOSURE_GROUP_COLS
        temp_dir = dw.RISK_EXPOSURE_TEMP_OUT_DIR
        version_id = dw.RISK_EXPOSURE_VERS
    else:
        raise ValueError("bad indicator type: {}".format(indicator_type))
    out_dir = '{d}/{it}/{v}'.format(
        d=dw.INPUT_DATA_DIR, it=indicator_type, v=version_id)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    err_list = []
    for gbd_id in gbd_ids:
        gbd_id_dir = os.path.join(temp_dir, str(gbd_id))
        processed_draws = os.listdir(gbd_id_dir)
        gbd_id_dfs = []
        for f in processed_draws:
            path = os.path.join(gbd_id_dir, f)
            gbd_id_df = pd.read_hdf(path)
            gbd_id_dfs.append(gbd_id_df)
        gbd_id_df = pd.concat(gbd_id_dfs, ignore_index=True)
        assert not gbd_id_df[group_cols].duplicated().any(), 'duplicates'
        # some locations are strings, make all ints
        gbd_id_df['location_id'] = gbd_id_df.location_id.astype(int)
        try:
            # test that all level three locations are present, but don't break
            # all the writing if just one is wrong
            sdg_test.all_sdg_locations(gbd_id_df)
            gbd_id_df.to_hdf(
                '{d}/{gbd_id}.h5'.format(d=out_dir, gbd_id=gbd_id),
                key="data", format="table",
                data_columns=['location_id', 'year_id'])
            print("{g} finished".format(g=gbd_id))
        # BUG FIX: was 'except ValueError, e:' — Python 2-only syntax that is
        # a SyntaxError in Python 3, which the print(..., file=...) call
        # below requires; 'as e' is correct for Python 3.
        except ValueError as e:
            err_list.append(e)
            print("Failed: {g}".format(g=gbd_id), file=sys.stderr)
            continue
def main():
    """read, standardize columns, add location id, add china aggregate"""
    data = pd.read_csv(dw.MEAN_PM25_INFILE)
    # the raw file must be uniquely keyed by iso3/year
    has_dupes = data[['iso3', 'year']].duplicated().any()
    assert not has_dupes, 'unexpected id columns, should be iso3 and year'
    # standardize identifiers and remap the 1-indexed final draw in one pass
    data = data.rename(columns={
        'iso3': 'ihme_loc_id',
        'year': 'year_id',
        'draw_1000': 'draw_0',
    })
    data = data[['ihme_loc_id', 'year_id'] + dw.DRAW_COLS]
    data = add_china_aggregate(add_location_id(data))
    # standardize column structure again
    # (thought age and sex would be confusing, that doesnt make sense here)
    data['metric_id'] = 3
    data['measure_id'] = 19
    data = data[dw.MEAN_PM25_GROUP_COLS + dw.DRAW_COLS]
    sdg_test.all_sdg_locations(data)
    # save
    data.to_hdf(dw.MEAN_PM25_OUTFILE, format="table",
                key="data", data_columns=['location_id', 'year_id'])
import pandas as pd
import sys
from getpass import getuser

# NOTE(review): SDG_REPO is referenced but not defined in this script —
# presumably set earlier in the full file or by the environment; confirm
# before running standalone.
sys.path.append(SDG_REPO)
import sdg_utils.draw_files as dw
import sdg_utils.tests as sdg_test

# read
df = pd.read_csv(dw.SBA_PATH)

# set metric to proportion
df['metric_id'] = 2

# save id columns
id_cols = [
    'location_id', 'year_id', 'age_group_id',
    'sex_id', 'metric_id', 'measure_id'
]

# keep necessary variables
df = df[id_cols + dw.DRAW_COLS]

# test
# BUG FIX: original called sdg_test.all_sdg_locations(gbd_id_df), but
# 'gbd_id_df' is never defined in this script (NameError at runtime);
# the frame being validated here is 'df'.
sdg_test.all_sdg_locations(df)

# convert to hdf
df.to_hdf(dw.SBA_OUT_PATH, format="table", key="data",
          data_columns=['location_id', 'year_id'])
def convert_to_rates(df):
    """Convert back to rates by merging on pop.

    Args:
        df: DataFrame of counts keyed by EPI_CHILD_OVRWGT_GROUP_COLS with
            draw_* value columns.

    Returns:
        DataFrame with each draw column divided by mean_pop and metric_id
        set to 3 (rate space).
    """
    pops = qry.get_pops(both_sexes=True)
    # inner join: rows without a population match are dropped; the assert
    # below then guarantees no null populations slipped through the merge
    df = df.merge(pops, how='inner')
    assert df.mean_pop.notnull().values.all(), 'pop merge failed'
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    draws = [col for col in df.columns if 'draw_' in col]
    # row-wise division via .div(..., axis=0) is equivalent to the former
    # per-column apply(lambda x: x / df['mean_pop']) but vectorized
    df = pd.concat(
        [df[id_cols], df[draws].div(df['mean_pop'], axis=0)],
        axis=1
    )
    df['metric_id'] = 3
    return df


if __name__ == "__main__":
    df0 = collect_childhood_overweight(force_repull=True)
    df1 = collapse_sex(df0)
    df2 = add_sdi_aggregates(df1)
    df3 = convert_to_rates(df2)
    # test that locations are present
    sdg_test.all_sdg_locations(df3)
    # todo generalize this filepath
    out_path = "/ihme/scratch/projects/sdg/input_data/" \
               "epi/{v}/9363.h5".format(v=dw.EPI_CHILD_OVRWGT_VERS)
    df3.to_hdf(
        out_path,
        key="data",
        format="table",
        data_columns=['location_id', 'year_id'])