def collect_all_processed_draws(indicator_type):
    """Concatenate all processed draw files and write one HDF per gbd id.

    Parameters
    ----------
    indicator_type : str
        One of 'dalynator', 'como_prev', 'como_inc', 'risk_exposure';
        selects the id set, grouping columns, temp directory, and version
        from the ``dw`` constants module.

    Returns
    -------
    list
        The ValueError exceptions raised while validating/writing
        individual gbd ids (empty when everything succeeded).

    Raises
    ------
    ValueError
        If ``indicator_type`` is not one of the recognized values.
    """
    if indicator_type == 'dalynator':
        gbd_ids = set(dw.DALY_ALL_AGE_CAUSE_IDS).union(
            set(dw.DALY_THIRTY_SEVENTY_CAUSE_IDS))
        group_cols = dw.DALY_GROUP_COLS
        temp_dir = dw.DALY_TEMP_OUT_DIR
        version_id = dw.DALY_VERS
    elif indicator_type in ['como_prev', 'como_inc']:
        if indicator_type == 'como_inc':
            gbd_ids = set(dw.COMO_INC_CAUSE_IDS)
        else:
            gbd_ids = set(dw.COMO_PREV_CAUSE_IDS)
        group_cols = dw.COMO_GROUP_COLS
        temp_dir = dw.COMO_TEMP_OUT_DIR
        version_id = dw.COMO_VERS
    elif indicator_type == 'risk_exposure':
        gbd_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
            set(dw.RISK_EXPOSURE_REI_IDS_MALN))
        group_cols = dw.RISK_EXPOSURE_GROUP_COLS
        temp_dir = dw.RISK_EXPOSURE_TEMP_OUT_DIR
        version_id = dw.RISK_EXPOSURE_VERS
    else:
        raise ValueError("bad indicator type: {}".format(indicator_type))

    out_dir = '{d}/{it}/{v}'.format(d=dw.INPUT_DATA_DIR,
                                    it=indicator_type,
                                    v=version_id)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    err_list = []
    for gbd_id in gbd_ids:
        gbd_id_dir = os.path.join(temp_dir, str(gbd_id))
        # read every processed draw file for this gbd id and stack them
        gbd_id_dfs = [
            pd.read_hdf(os.path.join(gbd_id_dir, f))
            for f in os.listdir(gbd_id_dir)
        ]
        gbd_id_df = pd.concat(gbd_id_dfs, ignore_index=True)
        assert not gbd_id_df[group_cols].duplicated().any(), 'duplicates'
        # some locations are strings, make all ints
        gbd_id_df['location_id'] = gbd_id_df.location_id.astype(int)

        try:
            # test that all level three locations are present, but don't break
            #   all the writing if just one is wrong
            sdg_test.all_sdg_locations(gbd_id_df)
            gbd_id_df.to_hdf('{d}/{gbd_id}.h5'.format(d=out_dir,
                                                      gbd_id=gbd_id),
                             key="data",
                             format="table",
                             data_columns=['location_id', 'year_id'])
            print("{g} finished".format(g=gbd_id))
        # BUGFIX: 'except ValueError, e' is Python 2-only syntax and a
        # SyntaxError on Python 3, which the print(..., file=sys.stderr)
        # call below requires; the 'as' form works on both.
        except ValueError as e:
            err_list.append(e)
            print("Failed: {g}".format(g=gbd_id), file=sys.stderr)
            continue
    # surface the collected failures instead of silently dropping them
    # (backward compatible: previously the function returned None)
    return err_list
# Example #2 (score: 0)
def main():
    """Read the mean PM2.5 file, standardize its columns, attach location
    ids, add the China aggregate, and write the result to HDF."""
    df = pd.read_csv(dw.MEAN_PM25_INFILE)

    # the raw file must be uniquely keyed by iso3/year
    assert not df[['iso3', 'year']].duplicated().any(), \
        'unexpected id columns, should be iso3 and year'

    # harmonize naming with the rest of the pipeline in one pass
    df = df.rename(columns={
        'iso3': 'ihme_loc_id',
        'year': 'year_id',
        'draw_1000': 'draw_0',
    })
    keep_cols = ['ihme_loc_id', 'year_id'] + dw.DRAW_COLS
    df = add_china_aggregate(add_location_id(df[keep_cols]))

    # standardize column structure again
    # (thought age and sex would be confusing, that doesnt make sense here)
    df['metric_id'] = 3
    df['measure_id'] = 19
    df = df[dw.MEAN_PM25_GROUP_COLS + dw.DRAW_COLS]

    sdg_test.all_sdg_locations(df)
    # save
    df.to_hdf(dw.MEAN_PM25_OUTFILE,
              format="table",
              key="data",
              data_columns=['location_id', 'year_id'])
# Example #3 (score: 0)
import pandas as pd
import sys

from getpass import getuser
sys.path.append(SDG_REPO)
import sdg_utils.draw_files as dw
import sdg_utils.tests as sdg_test

# read the skilled-birth-attendance draws
df = pd.read_csv(dw.SBA_PATH)
# set metric to proportion
df['metric_id'] = 2
# save id columns
id_cols = [
    'location_id', 'year_id', 'age_group_id', 'sex_id', 'metric_id',
    'measure_id'
]
# keep necessary variables
df = df[id_cols + dw.DRAW_COLS]
# test that all level three locations are present
# BUGFIX: was sdg_test.all_sdg_locations(gbd_id_df) — `gbd_id_df` is not
# defined in this script (copy-paste from another example) and raised
# NameError; the frame being validated here is `df`.
sdg_test.all_sdg_locations(df)
# convert to hdf
df.to_hdf(dw.SBA_OUT_PATH,
          format="table",
          key="data",
          data_columns=['location_id', 'year_id'])
# Example #4 (score: 0)
def convert_to_rates(df):
    """Convert count-space draws back to rates by dividing by population.

    Parameters
    ----------
    df : pandas.DataFrame
        Draws in count space with the EPI_CHILD_OVRWGT group columns and
        any number of ``draw_*`` columns.

    Returns
    -------
    pandas.DataFrame
        Id columns plus draw columns divided by ``mean_pop``, with
        ``metric_id`` set to 3 (rate).

    Raises
    ------
    AssertionError
        If any row fails to match a population (inner merge left nulls).
    """
    pops = qry.get_pops(both_sexes=True)
    # inner merge on the shared id columns; assert guards against a
    # silent partial match
    df = df.merge(pops, how='inner')
    assert df.mean_pop.notnull().values.all(), 'pop merge failed'
    id_cols = dw.EPI_CHILD_OVRWGT_GROUP_COLS
    draws = [col for col in df.columns if 'draw_' in col]
    # divide every draw column by population row-wise; .div(axis=0) is the
    # vectorized equivalent of the previous apply(lambda ...) per column
    df = pd.concat([
        df[id_cols],
        df[draws].div(df['mean_pop'], axis=0)
    ], axis=1)
    df['metric_id'] = 3  # 3 == rate
    return df


if __name__ == "__main__":
    df0 = collect_childhood_overweight(force_repull=True)
    df1 = collapse_sex(df0)
    df2 = add_sdi_aggregates(df1)
    df3 = convert_to_rates(df2)
    # test that locations are present
    sdg_test.all_sdg_locations(df3)
    # todo generalize this filepath
    out_path = "/ihme/scratch/projects/sdg/input_data/" \
               "epi/{v}/9363.h5".format(v=dw.EPI_CHILD_OVRWGT_VERS)
    df3.to_hdf(
        out_path,
        key="data",
        format="table", data_columns=['location_id', 'year_id'])