def aggregate_locations_to_global(df, group_cols, denominator='population'):
    """multiply by population column and aggregate sexes and ages"""
    assert denominator in df.columns, '{d} column not in dataframe'.format(
        d=denominator)
    assert df[denominator].notnull().values.all(
    ), 'merge with {d} failed'.format(d=denominator)
    print("aggregatings locations")

    # select only level 3 locations
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]

    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x * df[denominator]),
        df[denominator]
    ],
                   axis=1)

    # set location_id, groupby group_cols, and sum
    df['location_id'] = 1
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + [denominator]].sum()

    # return to appropriate metric
    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x / df[denominator]),
        df[denominator]
    ],
                   axis=1)

    df.drop(denominator, axis=1, inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df
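A minimal usage sketch under assumed column names (group_cols must contain location_id, which the function overwrites with the global id 1 before the groupby):

group_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id',
              'measure_id', 'metric_id']
global_df = aggregate_locations_to_global(df, group_cols,
                                          denominator='population')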
Example #2
def process_all_qx_files(test=True):
    """Sequentially write results from all qx files"""
    qx_files = os.listdir(dw.QX_DIR)
    qx_files = [q for q in qx_files if 'qx_' in q]
    if test:
        qx_files = qx_files[0:40]
    dfs = []
    print('assembling qx files')
    n = len(qx_files)
    i = 0
    for qx_file in qx_files:
        df = process_qx_file(qx_file)
        dfs.append(df)
        i = i + 1
        if i % max(n // 10, 1) == 0:
            print('{i}/{n} qx_files complete'.format(i=i, n=n))
    df = pd.concat(dfs, ignore_index=True)
    locsdf = qry.get_sdg_reporting_locations()
    df = df[df['location_id'].isin(locsdf['location_id'].values)]
    id_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ]
    df = df[id_cols + dw.DRAW_COLS]
    assert set(df.age_group_id) == {1, 42}, 'unexpected age group ids'
    assert set(df.year_id) == set(range(1990, 2017)), 'unexpected year ids'
    return df
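A hedged usage sketch; a full run assumes every qx_* file in dw.QX_DIR parses through process_qx_file:

qx_df = process_all_qx_files(test=False)
# the asserts above guarantee age_group_ids {1, 42} and years 1990-2016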
Example #3
def clean_compiled_indicators(df):
    # replace disaster with multi year moving average
    if 1019 in df['indicator_id'].unique():
        df = multi_year_avg(df, 1019)

    indic_table = qry.get_indicator_table()
    df = df.merge(
        indic_table[['indicator_id', 'invert', 'scale', 'indicator_stamp']],
        how='left')
    assert df.invert.notnull().values.all(), 'merge with indicator metadata failed'

    # get sdg locations and filter to these
    sdg_locs = set(qry.get_sdg_reporting_locations().location_id)
    df = df.loc[df['location_id'].isin(sdg_locs)]

    # filter to sdg reporting years (move this to global in config file)
    sdg_years = range(1990, 2031)
    df = df.loc[df['year_id'].isin(sdg_years)]

    # make sure each id column is an integer
    for id_col in dw.INDICATOR_ID_COLS:
        df[id_col] = df[id_col].astype(int)

    # return
    return df
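A short usage sketch; after cleaning, the merged metadata columns (invert, scale, indicator_stamp) are available on every row:

df = clean_compiled_indicators(df)
to_invert = df.loc[df['invert'] == 1]  # e.g., rows flipped before scaling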
Example #4
def compile_output(df, add_rank=False, collapse_means=True):
    # collapse draws to means
    if collapse_means:
        df = collapse_to_means(df)

    # test that the data is square
    #sdg_test.df_is_square_on_indicator_location_year(df) # Data was square and still failing test

    # add indicator metadata
    indic_table = qry.get_indicator_table()
    indic_table = indic_table[[
        'indicator_id', 'indicator_short', 'indicator_stamp',
        'indicator_paperorder'
    ]]
    df = df.merge(indic_table, how='left')
    assert df.indicator_stamp.notnull().values.all(), \
        'merge with indic table failed'

    # add location metadata
    locs = qry.get_sdg_reporting_locations()
    locs = locs[['location_id', 'location_name', 'ihme_loc_id']]
    df = df.merge(locs, how='left')
    assert df.location_name.notnull().values.all(), \
        'merge with locations failed'
    print('Number of locations: {}'.format(len(df.location_id.unique())))

    # make sure it's just reporting years
    df = df.loc[df.year_id.isin(range(1990, 2031))]

    # set column order
    col_order = [
        'indicator_id', 'location_id', 'year_id', 'indicator_short',
        'indicator_stamp', 'indicator_paperorder', 'ihme_loc_id',
        'location_name', 'rank', 'mean_val', 'upper', 'lower'
    ]

    # optionally add rank by sdg index
    if add_rank:
        # keep sdg index
        sdg_index = df.query('indicator_id==1054').copy()
        # calculate rank
        sdg_index['rank'] = sdg_index.groupby('year_id').mean_val.transform(
            lambda x: pd.Series.rank(x, method='first', ascending=False))
        # add it to the data
        df = df.merge(sdg_index[['location_id', 'year_id',
                                 'rank']].drop_duplicates(),
                      how='left')
        assert df['rank'].notnull().values.all(), 'merge failed'
        return df[col_order]
    else:
        col_order.remove('rank')
        return df[col_order]
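A hedged usage sketch; with add_rank=True the rank column is computed on the SDG index (indicator_id 1054) and merged onto every indicator row:

out = compile_output(df, add_rank=True, collapse_means=True)
out.to_csv('FILEPATH', index=False)  # hypothetical destination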
Example #5
def all_sdg_locations(df):
    """Test that all level three locations are present in df"""
    sdg_locs = set(qry.get_sdg_reporting_locations().location_id)
    missing_locs = sdg_locs - set(df.location_id)
    if len(missing_locs) > 0:
        raise ValueError(
            "Found {n} missing locations: {l}".format(
                n=len(missing_locs),
                l=missing_locs
            )
        )
    else:
        return True
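A usage sketch; since the function raises on any missing location, the boolean return only matters as a guard:

assert all_sdg_locations(df)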
Example #6
def run_all(indicator_type, test=False):
    """Run each location draw job for the given indicator type"""

    # set some locals based on indicator type
    if indicator_type == 'codcorrect':
        temp_dir = dw.CC_TEMP_OUT_DIR
        delete_dir = dw.CC_TEMP_OUT_DIR_DELETE
    elif indicator_type in ['como_prev', 'como_inc']:
        temp_dir = dw.COMO_TEMP_OUT_DIR
        delete_dir = dw.COMO_TEMP_OUT_DIR_DELETE
    elif indicator_type == 'risk_exposure':
        temp_dir = dw.RISK_EXPOSURE_TEMP_OUT_DIR
        delete_dir = dw.RISK_EXPOSURE_TEMP_OUT_DIR_DELETE
    elif indicator_type == 'risk_burden':
        temp_dir = dw.RISK_BURDEN_TEMP_OUT_DIR
        delete_dir = dw.RISK_BURDEN_TEMP_OUT_DIR_DELETE
    else:
        raise ValueError("{it} not supported".format(it=indicator_type))
    # set location ids to run
    locations = qry.get_sdg_reporting_locations()
    location_ids = list(locations.location_id.unique())

    # change those if it is a test
    if test:
        location_ids = [68, 165]
    # location_ids = [133, 6, 71, 39, 76, 173, 142, 208, 81, 116, 117, 217, 25, 93, 95]

    # make a new directory to use and delete the other one in a new thread
    print("making a new temp directory and starting a thread to delete old...")
    assert os.path.exists(temp_dir), "{} doesn't exist".format(temp_dir)
    shutil.move(temp_dir, delete_dir)
    # start a thread that deletes the delete dir
    thread_deleting_old = Thread(target=shutil.rmtree, args=(delete_dir, ))
    thread_deleting_old.start()
    os.mkdir(temp_dir)

    print("processing draws...")
    # initialize list of job ids to hold on
    job_ids = []
    for location_id in location_ids:

        job_ids.append(submit_job_location_draws(indicator_type, location_id))

    print('collecting all output')
    if not test:
        submit_job_collect(job_ids, indicator_type)

    print('waiting for temp dir deletion to finish...')
    thread_deleting_old.join()
    print('... done deleting and all jobs submitted.')
Example #7
def aggregate_locations_to_global(df,
                                  group_cols,
                                  denominator='population',
                                  age_standardized=False,
                                  age_group_years_start=None,
                                  age_group_years_end=None,
                                  age_group_id=None):
    """Multiply rate draws by the denominator column, aggregate locations
    to global, convert back to rates, and optionally age-standardize."""
    assert denominator in df.columns, '{d} column not in dataframe'.format(
        d=denominator)
    assert df[denominator].notnull().values.all(
    ), 'merge with {d} failed'.format(d=denominator)
    print("aggregatings locations")

    # select only level 3 locations
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]

    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x * df[denominator]),
        df[denominator]
    ],
                   axis=1)

    # set location_id, groupby group_cols, and sum
    df['location_id'] = 1
    df = df.groupby(group_cols,
                    as_index=False)[dw.DRAW_COLS + [denominator]].sum()

    # return to appropriate metric
    df = pd.concat([
        df[group_cols], df[dw.DRAW_COLS].apply(lambda x: x / df[denominator]),
        df[denominator]
    ],
                   axis=1)

    if age_standardized:
        standardize_args = [
            age_group_years_start, age_group_years_end, age_group_id
        ]
        assert all(isinstance(arg, int) for arg in standardize_args
                   ), 'age-standardization parameters must be integers'
        df = age_standardize(df, group_cols, age_group_years_start,
                             age_group_years_end, age_group_id)

    return df
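A hedged sketch of the age-standardized path, reusing group_cols from the earlier sketch; the age arguments here are placeholder integers, which the assert above requires:

global_asr = aggregate_locations_to_global(
    df, group_cols, denominator='population', age_standardized=True,
    age_group_years_start=0,   # placeholder bounds
    age_group_years_end=125,
    age_group_id=27)           # hypothetical target age group id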
Example #8
def main():
    """read, standardize columns, add location id, add china aggregate"""
    # get all the things
    locsdf = qry.get_sdg_reporting_locations()
    dfs = []
    print('assembling pm25 files')
    n = len(locsdf['ihme_loc_id'])
    i = 0
    for ihme_loc_id in locsdf['ihme_loc_id']:
        if os.path.isfile(os.path.join(dw.MEAN_PM25_INDIR, ihme_loc_id + '.csv')):
            df = pd.read_csv(os.path.join(dw.MEAN_PM25_INDIR, ihme_loc_id + '.csv'))
            df['ihme_loc_id'] = ihme_loc_id
            dfs.append(df)
        else:
            print("Does not exist:" + os.path.join(dw.MEAN_PM25_INDIR, ihme_loc_id + '.csv'))
        i = i + 1
        if i % max(n // 10, 1) == 0:
            print('{i}/{n} files complete'.format(i=i, n=n))
    df = pd.concat(dfs, ignore_index=True)
    df = df.merge(locsdf[['ihme_loc_id', 'location_id']])
    df = df.rename(columns={'year': 'year_id'})
    df = df.rename(columns={'draw_1000': 'draw_0'})
    df = df[['location_id', 'year_id'] + dw.DRAW_COLS]
    df = add_england_aggregate(df, locsdf)

    # standardize column structure again
    df['metric_id'] = 3
    df['measure_id'] = 19
    df['age_group_id'] = 22
    df['sex_id'] = 3
    df = df[dw.MEAN_PM25_GROUP_COLS + dw.DRAW_COLS]

    # interpolate pre-2010
    post2010df = df.query('year_id >= 2010')
    df = custom_interpolate(df)
    df = df.append(post2010df)

    # save
    try:
        if not os.path.exists(dw.MEAN_PM25_OUTDIR):
            os.makedirs(dw.MEAN_PM25_OUTDIR)
    except OSError:
        pass
    df.to_hdf(dw.MEAN_PM25_OUTDIR + "/air_pm_draws_clean.h5",
              format="table", key="data",
              data_columns=['location_id', 'year_id'])
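Because the draws are saved as a queryable HDF table with location_id and year_id as data columns, a read-back can filter on those columns; a sketch:

pm25 = pd.read_hdf(dw.MEAN_PM25_OUTDIR + "/air_pm_draws_clean.h5",
                   key="data", where="year_id >= 2010")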
Example #9
def df_is_square_on_indicator_location_year(df, return_fail_df=False):
    """Test that the dataframe has all combos of indicator-location-year"""
    # construct square dataframe that contains level 3 locations, all years in
    #   five year increments of 1990-2015, and all status 1 indicators
    indic_table = _get_indicator_table()
    locs_table = qry.get_sdg_reporting_locations()
    sdg_locs = list(set(locs_table.location_id))
    locs = pd.DataFrame({'location_id': sdg_locs})
    stat1indic = indic_table.query(
        'indicator_status_id==1').indicator_id.unique()
    indicators = pd.DataFrame({'indicator_id': stat1indic})
    years5inc = range(1990, 2016, 5)
    years = pd.DataFrame({'year_id': years5inc})
    for sdf in [locs, indicators, years]:
        sdf['key'] = 0
    square_df = locs.merge(indicators, on='key')
    square_df = square_df.merge(years, on='key')
    square_df = square_df.drop('key', axis=1)

    # store dataframe of missing values from the square dataframe
    df['in_data'] = 1
    square_df['in_square'] = 1
    mdf = square_df.merge(df, how='outer')
    mdf = mdf.loc[(mdf['in_data'].isnull()) | (mdf['in_square'].isnull())]
    mdf = mdf.loc[~((mdf['in_data'] == 1) &
                    (mdf['indicator_id'].isin([1054, 1055, 1060])))]

    if len(mdf) > 0:
        # pretty print the missing stuff
        mdf = mdf[['location_id', 'year_id', 'indicator_id',
                   'in_data', 'in_square']]
        mdf = mdf.merge(locs_table[['location_id', 'location_name']])
        mdf = mdf.merge(indic_table[['indicator_id', 'indicator_short']])
        mdf = mdf[['location_name', 'indicator_short', 'year_id',
                   'in_data', 'in_square']]
        if return_fail_df:
            return mdf
        else:
            raise ValueError(
                    "Mismatch in the following: \n{df}".format(df=mdf)
            )
    else:
        return True
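A usage sketch; with return_fail_df=True the function hands back the mismatch frame instead of raising:

result = df_is_square_on_indicator_location_year(df, return_fail_df=True)
if result is not True:
    print(result)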
Example #10
def output_hrh_components_full_time_series(cov_version=dw.COV_VERS,
                                           sdg_version=dw.SDG_VERS):

    past_path = dw.INPUT_DATA_DIR + 'covariate' + '/' + cov_version
    future_path = dw.FORECAST_DATA_DIR + 'covariate' + '/' + cov_version

    dfs = []
    for component_id in [1457, 1460, 1463]:
        print("pulling {c}".format(c=component_id))

        df_past = pd.read_feather(past_path + '/' + str(component_id) +
                                  '.feather')
        df_past = df_past[['indicator_component_id', 'location_id', 'year_id']
                          + dw.DRAW_COLS]

        df_future = pd.read_feather(future_path + '/' + str(component_id) +
                                    '.feather')
        df_future = df_future.loc[df_future.scenario == 0, :]
        df_future['indicator_component_id'] = component_id
        df_future = df_future[
            ['indicator_component_id', 'location_id', 'year_id'] +
            dw.DRAW_COLS]

        dfs.extend([df_past, df_future])

    print "concatenating"
    df = pd.concat(dfs, ignore_index=True)

    # adding level column
    locsdf = qry.get_sdg_reporting_locations().loc[:, ['location_id', 'level']]
    df = df.merge(locsdf, on='location_id')

    print "outputting feather"
    df['indicator_id'] = 1096
    df = df[[
        'indicator_id', 'indicator_component_id', 'location_id', 'year_id',
        'level'
    ] + dw.DRAW_COLS]
    df.columns = df.columns.astype(str)
    df.to_feather(dw.INDICATOR_DATA_DIR +
                  'gbd2017/hrh_unscaled_components_v{v}.feather'.format(
                      v=sdg_version))
    print("done")
Example #11
import sys

from getpass import getuser

if getuser() == 'USER':
    SDG_REPO = 'FILEPATH'
sys.path.append(SDG_REPO)
import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

POP_FILE = 'FILEPATH'
component_ids = [20, 23]
group_cols_past = [
    'location_id', 'year_id', 'measure_id', 'metric_id', 'age_group_id',
    'sex_id'
]
group_cols_future = [
    'location_id', 'year_id', 'age_group_id', 'sex_id', 'scenario'
]
NATS = qry.get_sdg_reporting_locations(level_3=True)


def add_denom_rows(df, agg_var, agg_dim, agg_parent, agg_children=None):
    '''
    Create expanded demographics dataframe for denominators.
    '''
    index_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'scenario'
    ]

    agg_df = df.copy()
    if agg_children is not None:
        agg_df = agg_df.loc[agg_df[agg_dim].isin(agg_children)]
    agg_df[agg_dim] = agg_parent
    agg_df = agg_df.groupby(index_cols, as_index=False)[agg_var].sum()

    # append the aggregate rows to the originals and return the combined
    # frame (ending restored; assumed from the function's name and docstring)
    df = df.append(agg_df, ignore_index=True)
    return df
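A hedged usage sketch: building a both-sexes denominator (parent sex_id 3 from children 1 and 2), assuming pops carries all five index columns including scenario:

pops = add_denom_rows(pops, 'population', 'sex_id', 3, agg_children=[1, 2])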
Example #12
import pandas as pd
import sys
import os

from getpass import getuser

import sdg_utils.draw_files as dw
import sdg_utils.queries as qry

# get locations
locsdf = qry.get_sdg_reporting_locations()

# get main dataset
df = pd.read_csv(dw.MET_NEED_FILE)
df = df.query('year_id >= 1990')
df = df.query('location_id in {}'.format(list(locsdf['location_id'])))
df['metric_id'] = 2
df['measure_id'] = 18

# get weights
if 'mod_contra' in dw.MET_NEED_VERS:
    agesdf = qry.get_pops()
    agesdf = agesdf.loc[agesdf.age_group_id.isin(df.age_group_id.unique())]
    agesdf['totpop'] = agesdf.groupby(
        ['location_id', 'year_id', 'sex_id'],
        as_index=False)['population'].transform('sum')
    agesdf['weights'] = agesdf['population'] / agesdf['totpop']
else:
    agesdf = pd.read_csv(dw.MET_NEED_WEIGHTS_FILE)
    agesdf = agesdf.query('location_id in {}'.format(
        list(locsdf['location_id'])))
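A hedged sketch of how such weights could be applied (the age-standardization step itself is not part of this excerpt); it assumes agesdf carries the four merge keys alongside weights:

df = df.merge(agesdf[['location_id', 'year_id', 'sex_id',
                      'age_group_id', 'weights']])
df[dw.DRAW_COLS] = df[dw.DRAW_COLS].multiply(df['weights'], axis=0)
df = df.groupby(['location_id', 'year_id', 'sex_id',
                 'measure_id', 'metric_id'],
                as_index=False)[dw.DRAW_COLS].sum()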
Example #13
def fetch_indicators(sdg_version=dw.SDG_VERS, force_recompile=False):
    """Fetch all indicator data and save in shared scratch space."""
    # write/read output here
    path = "{idd}/gbd2017/".format(idd=dw.INDICATOR_DATA_DIR)
    out_file = path + "all_indicators_unscaled_v{}_conflict_disaster_truncated.feather".format(
        sdg_version)
    if os.path.exists(out_file) and not force_recompile:
        print "reading from existing file"
        df = pd.read_feather(out_file)
    else:
        print "compiling {}".format(sdg_version)
        print out_file
        # get an input_file dict to determine where to read data for each
        input_file_dict = fetch_input_file_dict()
        forecast_file_dict = fetch_forecast_file_dict()

        # list of dataframes for fast concatenation
        dfs = []

        for indicator_id in input_file_dict.keys():

            # read indicator data file if both past and future exist
            if os.path.exists(
                    input_file_dict[indicator_id]) and os.path.exists(
                        forecast_file_dict[indicator_id]):
                print "\t{}".format(indicator_id)

                df = pd.read_feather(input_file_dict[indicator_id])
                if len(df.sex_id.unique()) > 1:
                    df = df.loc[df.sex_id == 3, :]
                df = df[['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                        dw.DRAW_COLS]

                futuredf = pd.read_feather(forecast_file_dict[indicator_id])

                if indicator_id == 1037:
                    futuredf['sex_id'] = 3
                    futuredf['age_group_id'] = 22
                    futuredf.rename(index=str,
                                    columns={
                                        'draw_1000': 'draw_0',
                                        'year': 'year_id',
                                        'iso3': 'ihme_loc_id'
                                    },
                                    inplace=True)
                    futuredf = futuredf[(futuredf.year_id >= 2018)
                                        & (futuredf.year_id <= 2030)]

                    locs = qry.get_sdg_reporting_locations(level_3=True)
                    locs = locs[['location_id', 'ihme_loc_id']]
                    futuredf = futuredf.merge(locs, how='left')

                if len(futuredf.sex_id.unique()) > 1:
                    futuredf = futuredf.loc[futuredf.sex_id == 3, :]
                if 'scenario' in futuredf.columns:
                    futuredf = futuredf.loc[futuredf.scenario == 0, :]
                futuredf = futuredf[futuredf.year_id != 2017]
                futuredf = futuredf[
                    ['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                    dw.DRAW_COLS]

                if indicator_id in [1033, 1037]:
                    futuredf = futuredf.loc[futuredf.year_id.isin(
                        range(2018, 2031))]

                df = df.append(futuredf)

                # set indicator id to the key in the dict
                df['indicator_id'] = indicator_id

                if len(df.age_group_id.unique()) > 1:
                    df = df[df.age_group_id != 159]

                # should be one age and one sex
                assert len(df.sex_id.unique()) == 1, 'multiple sexes present'
                assert len(
                    df.age_group_id.unique()) == 1, 'multiple ages present'

                if indicator_id in [1019, 1031]:
                    print "truncating"
                    df['upper'] = df[dw.DRAW_COLS].quantile(q=0.975, axis=1)
                    df['lower'] = df[dw.DRAW_COLS].quantile(q=0.025, axis=1)
                    df[dw.DRAW_COLS] = df[dw.DRAW_COLS].clip(lower=df.lower,
                                                             upper=df.upper,
                                                             axis=0)
                df = df[dw.INDICATOR_ID_COLS + dw.DRAW_COLS]

                dfs.append(df)

        print "concatenating"
        df = pd.concat(dfs, ignore_index=True)

        # set floor of < 1e-12
        for i in range(1000):
            df.loc[df['draw_' + str(i)] < 1e-12, 'draw_' + str(i)] = 1e-12
            df.loc[df['draw_' + str(i)].isnull(), 'draw_' + str(i)] = 1e-12

        # remove level 6 (UTLAs) and level 5 india urban/rural (but keep English regions)
        locsdf = qry.get_sdg_reporting_locations(
        ).loc[:, ['location_id', 'level']]
        locsdf = locsdf.loc[locsdf['level'] != 6]
        locsdf = locsdf.loc[~(
            (locsdf['level'] == 5) &
            (~locsdf['location_id'].isin(range(4618, 4627))))]
        df = df.merge(locsdf, on='location_id')

        # write all the indicator data to the directory
        df = df[dw.INDICATOR_ID_COLS + ['level'] + dw.DRAW_COLS]
        df.columns = df.columns.astype(str)
        print "outputting feather"
        df.to_feather(out_file)

    return df
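The per-draw floor loop above can be collapsed into one vectorized statement; an equivalent sketch:

df[dw.DRAW_COLS] = df[dw.DRAW_COLS].fillna(1e-12).clip(lower=1e-12)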
Example #14
def fetch_indicators(sdg_version, force_recompile=False):
    # write/read output here
    version_dir = "{idd}/{v}".format(idd=dw.INDICATOR_DATA_DIR, v=sdg_version)
    out_file = version_dir + "/all_indicators.h5"
    if os.path.exists(out_file) and not force_recompile:
        print "reading from existing file"
        df = pd.read_hdf(out_file)
    else:
        print "recompiling"
        # get an input_file dict to determine where to read data for each
        # indicator
        input_file_dict = fetch_input_file_dict()
        # list of dataframes for fast concatenation
        dfs = []
        for indicator_id in input_file_dict.keys():
            print "\t{}".format(indicator_id)
            # read indicator data file
            if os.path.exists(input_file_dict[indicator_id]):
                df = pd.read_hdf(input_file_dict[indicator_id])
            else:
                df = pd.read_csv(input_file_dict[indicator_id].replace(
                    '.h5', '.csv'))
            df = df[['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                    dw.DRAW_COLS]
            forecastpath = input_file_dict[indicator_id].replace(
                '/input_data/', '/forecasts/')
            futuredf = pd.read_hdf(forecastpath)
            futuredf = futuredf[
                ['location_id', 'year_id', 'age_group_id', 'sex_id'] +
                dw.DRAW_COLS]
            df = df.append(futuredf)
            # for now, limit locations to admin0
            locsdf = qry.get_sdg_reporting_locations()
            df = df.loc[df.location_id.isin(locsdf.location_id.values)]
            assert set(df.location_id.values) == set(
                locsdf.location_id.values
            ), 'Missing location(s) for ' + str(indicator_id)
            # set indicator id to the key in the dict
            df['indicator_id'] = indicator_id
            # should be one age and one sex
            assert len(df.sex_id.unique()) == 1, 'multiple sexes present'
            assert len(df.age_group_id.unique()) == 1, 'multiple ages present'
            # if len(df.age_group_id.unique()) > 1:
            #     df = age_standardize(df, weights_dict[indicator_id])
            # keep & verify required columns
            df = df[dw.INDICATOR_ID_COLS + dw.DRAW_COLS]
            # append to dataframe list
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)

        # set floor of < 1e-12
        for i in range(1000):
            df.loc[df['draw_' + str(i)] < 1e-12, 'draw_' + str(i)] = 1e-12

        # make version directory if it doesn't exist yet (likely)
        if not os.path.exists(version_dir):
            os.mkdir(version_dir)
        # save the input dictionary as a pickle for convenience
        pd.to_pickle(input_file_dict, version_dir + "/input_file_dict.pickle")
        # write all the indicator data to the version directory
        df.to_hdf(version_dir + "/all_indicators.h5",
                  format="table",
                  key="data",
                  data_columns=dw.INDICATOR_ID_COLS)
    # return
    return df
Example #15
def process_demo_draws(past_future, version=dw.DEMO_VERS):
    '''process demographics draws'''

    if past_future == 'past':
        data_dir = dw.INPUT_DATA_DIR + 'demographics' + '/' + str(version)
    elif past_future == 'future':
        data_dir = dw.FORECAST_DATA_DIR + 'demographics' + '/' + str(version)
    else:
        raise ValueError(
            'The past_future arg must be set to "past" or "future".')

    component_ids = dw.DEMO_COMPONENT_IDS
    dfs = []

    for component_id in component_ids:
        df = pd.read_feather(data_dir + '/' + str(component_id) + '.feather')

        dfs.append(df)

    print('concatenating')
    df = pd.concat(dfs, ignore_index=True)

    #temporary for gk
    locs = qry.get_sdg_reporting_locations(level_3=True)
    df = df[df.location_id.isin(locs.location_id)]

    # get live births
    births = load_births()
    if past_future == 'past':
        births = births[births.scenario == 0]
        births.drop('scenario', inplace=True, axis=1)

    df = df.merge(births, how='left')

    # Keep sex-split
    df_sex_split = df.copy(deep=True)

    # aggregate sexes
    df['sex_id'] = 3
    df = agg.age_sex_aggregate(df, dw.DEMO_GROUP_COLS, denominator='births')
    df = df.append(df_sex_split, ignore_index=True)

    # global aggregation
    df_global = agg.aggregate_locations_to_global(df,
                                                  dw.DEMO_GROUP_COLS,
                                                  denominator='births')

    # output
    df = df[dw.DEMO_GROUP_COLS + dw.DRAW_COLS]
    df_global = df_global[dw.DEMO_GROUP_COLS + dw.DRAW_COLS]

    for component_id in component_ids:
        if component_id == 56:
            ind_id = '1040'
        else:
            ind_id = '1041'

        print("outputting " + ind_id)
        df_id = df[df.indicator_component_id == component_id]
        df_id.reset_index(drop=True, inplace=True)
        df_id.to_feather(data_dir + '/' + ind_id + '.feather')

        df_id_global = df_global[df_global.indicator_component_id ==
                                 component_id]
        df_id_global.reset_index(drop=True, inplace=True)
        df_id_global.to_feather(data_dir + '/' + ind_id + '_' + 'global' +
                                '.feather')
Example #16
df = df.loc[df['source_type'].str.startswith("VR")]
df = df.query("~(ihme_loc_id=='ZAF' & source_type=='VR-SSA')")

# add China national estimate
df = df.append(assemble_china_df(), ignore_index=True)
# add India national estimate after removing other india
df = df.query("ihme_loc_id != 'IND'")
df = df.append(assemble_ind_df(), ignore_index=True)
# add England as copy of UK
df = add_england(df)
# create square dataset of all sdg reporting locations and years, and fill in
# any missing completeness
years = range(1990, 2016, 1)
years = pd.DataFrame({'year_id': years})
years['key'] = 0
locs = qry.get_sdg_reporting_locations()[['location_id']]
locs['key'] = 0
sqr_df = years.merge(locs, on='key')
sqr_df = sqr_df.drop('key', axis=1)
sqr_df['sex_id'] = 3
df = sqr_df.merge(df, how='left')
df[dw.COMPLETENESS_DATA_COL] = df[dw.COMPLETENESS_DATA_COL].fillna(0)

# make a fake draws dataframe by copying trunc_pred (completeness)
#   into each draw, so there is no uncertainty
idx = df.index
draws = pd.DataFrame(index=df.index, columns=dw.DRAW_COLS)
draws = draws.apply(lambda x: x.fillna(df[dw.COMPLETENESS_DATA_COL]))

# add the draws to the dataframe, leveraging the shared index
df = pd.concat([df, draws], axis=1)
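An equivalent vectorized construction of the constant draws (assumes numpy is imported); it tiles the completeness column across every draw column in one step:

import numpy as np
draws = pd.DataFrame(
    np.tile(df[dw.COMPLETENESS_DATA_COL].values[:, None],
            (1, len(dw.DRAW_COLS))),
    index=df.index, columns=dw.DRAW_COLS)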