Example 1
import pandas as pd

# DRAW_COLUMNS is a module-level list of draw column names
# (e.g. ["draw_0", ..., "draw_999"]).


def interpolate_year(data):
    # Defer the central-comp import so the dependency is only pulled in
    # when this function is actually called.
    from core_maths.interpolate import pchip_interpolate

    id_cols = list(set(data.columns).difference(DRAW_COLUMNS))
    fillin_data = pchip_interpolate(data, id_cols, DRAW_COLUMNS)
    return pd.concat([data, fillin_data], sort=True)
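Every example below defers to core_maths.interpolate.pchip_interpolate, whose source is not shown here. Conceptually, it fits a PCHIP (shape-preserving Piecewise Cubic Hermite Interpolating Polynomial) curve through each group's value series over time and evaluates it at the missing time points. A minimal sketch of that idea using scipy.interpolate.PchipInterpolator; the name pchip_fill and the per-group mechanics are assumptions, not the library's actual implementation:

import pandas as pd
from scipy.interpolate import PchipInterpolator

def pchip_fill(df, id_cols, value_cols, time_col="year_id", time_vals=None):
    """Hypothetical stand-in for core_maths.interpolate.pchip_interpolate:
    returns rows for the requested time_vals missing from each group."""
    out = []
    for key, grp in df.groupby(id_cols):
        grp = grp.sort_values(time_col)
        missing = [t for t in time_vals if t not in set(grp[time_col])]
        if not missing:
            continue
        key = key if isinstance(key, tuple) else (key,)
        new = pd.DataFrame({time_col: missing, **dict(zip(id_cols, key))})
        for col in value_cols:
            # Fit a shape-preserving cubic through the observed years and
            # evaluate it at the absent ones.
            new[col] = PchipInterpolator(grp[time_col], grp[col])(missing)
        out.append(new)
    return pd.concat(out, ignore_index=True)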
Example 2
    def read_single_en_injury(self,
                              modelable_entity_id,
                              model_version_id,
                              measure_id=(
                                  measures.YLD, measures.INCIDENCE,
                                  measures.ST_PREVALENCE,
                                  measures.LT_PREVALENCE
                              )):
        injury_source = (
            self._ss_factory.get_en_injuries_modelable_entity_source(
                modelable_entity_id, model_version_id))
        dim = self.dimensions.get_simulation_dimensions(measure_id=measure_id,
                                                        at_birth=False)

        # get filters w/ added years if interpolation is needed
        filters = dim.index_dim.to_dict()["levels"]
        req_years = filters["year_id"]
        if not set(req_years).issubset(set(self._estim_years)):
            filters["year_id"] = list(set(req_years + self._estim_years))

        # read data
        df = injury_source.content(filters=filters)
        if df.empty:
            raise Exception(f"No data returned for ME {modelable_entity_id}, "
                            f"model version {model_version_id}.")
        draw_cols = [col for col in df.columns if "draw_" in col]

        # add indices to dimensions object from draw source transforms
        dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
        dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
        dim.index_dim.add_level("healthstate_id",
                                df.healthstate_id.unique().tolist())
        dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

        # interpolate missing years
        if not set(df.year_id.unique()).issuperset(set(req_years)):
            interp_df = pchip_interpolate(df=df,
                                          id_cols=dim.index_names,
                                          value_cols=draw_cols,
                                          time_col="year_id",
                                          time_vals=req_years)
            df = df[df.year_id.isin(req_years)]
            df = pd.concat([df, interp_df])
        else:
            df = df[df.year_id.isin(req_years)]

        # resample
        if len(dim.data_list()) != len(draw_cols):
            gbdizer = gbdize.GBDizeDataFrame(dim)
            df = gbdizer.correlated_percentile_resample(df)

        return df
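Examples 2 and 4 share a read-interpolate-trim pattern: widen the year filter to include the estimation years, PCHIP-fill any requested years still missing from the data, then trim back to exactly the requested years. Condensed as a standalone helper for clarity; fill_missing_years is an illustrative name, not part of the codebase:

import pandas as pd
from core_maths.interpolate import pchip_interpolate

def fill_missing_years(df, req_years, id_cols, draw_cols):
    """Illustrative helper: interpolate any requested years absent from df,
    then restrict the result to exactly the requested years."""
    if not set(df.year_id.unique()).issuperset(req_years):
        interp = pchip_interpolate(df=df,
                                   id_cols=id_cols,
                                   value_cols=draw_cols,
                                   time_col="year_id",
                                   time_vals=list(req_years))
        df = pd.concat([df[df.year_id.isin(req_years)], interp])
    else:
        df = df[df.year_id.isin(req_years)]
    return df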
Example 3
import pandas as pd

from core_maths.interpolate import pchip_interpolate
from hierarchies import dbtrees


def to_como(como_dir, location_set_id, gbd_round_id):
    df = pd.read_csv("FILEPATH/urolith_symp_dws.csv")

    # fill for new locs
    lt = dbtrees.loctree(location_set_id=location_set_id,
                         gbd_round_id=gbd_round_id)
    locmap = lt.flatten()
    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id',
                        right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    df = pd.concat([df, reg_avgs])

    filllen = 0
    for ln in list(locmap.leaf_node.unique()):
        if ln not in list(df.location_id):
            # walk up the location hierarchy until an ancestor with data is found
            for i in reversed(range(6)):
                fill_loc = locmap.loc[locmap.leaf_node == ln,
                                      'level_%s' % i].squeeze()
                filldf = df[df.location_id == fill_loc].copy()
                if len(filldf) > 0:
                    filldf['location_id'] = ln
                    df = pd.concat([df, filldf])
                    filllen += 1
                    break
    df = df[df.location_id.isin([l.id for l in lt.leaves()])]

    # fill in missing years
    extra = df.query("year_id == 2013").copy()
    extra['year_id'] = 2019
    df = pd.concat([df, extra])
    df = df.filter(regex='(.*_id|draw_)')
    interp = pchip_interpolate(df=df,
                               id_cols=['location_id', 'healthstate_id'],
                               value_cols=['draw_%s' % d for d in range(1000)],
                               time_col="year_id",
                               time_vals=list(range(1990, 2020)))
    df = pd.concat([df, interp])
    df = df[df.year_id.isin(list(range(1990, 2020)))]

    # save for como run
    df.to_hdf(f"{como_dir}/info/urolith_dws.h5",
              'draws',
              mode='w',
              format='table',
              data_columns=['location_id', 'year_id'])
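The fill loop above walks each empty leaf location up its hierarchy (level_5 down to level_0) and copies rows from the first ancestor that has data. The same ancestor-fallback idea isolated as a toy; fill_from_ancestor is an illustrative name and the column layout mirrors the flattened loctree above:

import pandas as pd

def fill_from_ancestor(df, locmap, leaf, max_level=5):
    """Illustrative: return rows for `leaf`, copied from its nearest
    ancestor (deepest level first) that already has rows in df."""
    row = locmap.loc[locmap.leaf_node == leaf].iloc[0]
    for lvl in reversed(range(max_level + 1)):
        hit = df[df.location_id == row[f"level_{lvl}"]]
        if not hit.empty:
            hit = hit.copy()
            hit["location_id"] = leaf
            return hit
    return pd.DataFrame(columns=df.columns)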
Example 4
    def read_single_en_injury(self,
                              modelable_entity_id,
                              model_version_id,
                              measure_id=(3, 6, 35, 36)):
        injury_source = (
            self._ss_factory.get_en_injuries_modelable_entity_source(
                modelable_entity_id, model_version_id))
        dim = self.dimensions.get_simulation_dimensions(measure_id)

        # get filters w/ added years if interpolation is needed
        filters = dim.index_dim.to_dict()["levels"]
        req_years = filters["year_id"]
        if not set(req_years).issubset(set(self._estim_years)):
            filters["year_id"] = list(set(req_years + self._estim_years))

        # read data
        df = injury_source.content(filters=filters)
        if df.empty:
            raise Exception("No data returned for meid:{} and mvid:{}".format(
                modelable_entity_id, model_version_id))

        # add indices to dimensions object from draw source transforms
        dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
        dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
        dim.index_dim.add_level("healthstate_id",
                                df.healthstate_id.unique().tolist())
        dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

        # interpolate missing years
        if not set(df.year_id.unique()).issuperset(set(req_years)):
            interp_df = pchip_interpolate(df=df,
                                          id_cols=dim.index_names,
                                          value_cols=self._draw_cols,
                                          time_col="year_id",
                                          time_vals=req_years)
            df = df[df.year_id.isin(req_years)]
            df = pd.concat([df, interp_df])
        else:
            df = df[df.year_id.isin(req_years)]

        # resample if the draw count differs from the expected 1000
        if len(dim.data_list()) != 1000:
            gbdizer = gbdize.GBDizeDataFrame(dim)
            df = gbdizer.correlated_percentile_resample(df)

        return df
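Both injury readers end with gbdize.GBDizeDataFrame.correlated_percentile_resample when the draw count differs from what the simulation expects. Its implementation is not shown in any of these examples; one plausible reading, offered strictly as an assumption, is that it upsamples by reusing the same random draw positions for every row so that draws stay correlated across demographics:

import numpy as np
import pandas as pd

def correlated_resample(df, draw_cols, n_target=1000, seed=0):
    """Hypothetical sketch, NOT gbdize's implementation: choose one set of
    draw positions at random and apply it to every row, preserving the
    correlation of draws across rows."""
    rng = np.random.default_rng(seed)
    picks = rng.integers(0, len(draw_cols), size=n_target)  # shared across rows
    resampled = df[draw_cols].to_numpy()[:, picks]
    out = df.drop(columns=draw_cols).reset_index(drop=True)
    new_cols = [f"draw_{i}" for i in range(n_target)]
    return pd.concat([out, pd.DataFrame(resampled, columns=new_cols)], axis=1)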
Example 5
    def _get_inj_dws(self):
        draw_cols = ["draw_{}".format(i) for i in range(1000)]
        loc_id = self.dims.index_dim.get_level("location_id")[0]
        year_id = self.dims.index_dim.get_level("year_id")
        fp = "FILEPATH/inputs/lt_dw.h5"
        # read_hdf here is the codebase's filtered HDF reader
        # (pandas.read_hdf has no hdf_filters argument)
        inj_dws = read_hdf(fp, "draws", hdf_filters={"location_id": loc_id})
        inj_dws = inj_dws.reset_index()

        # interpolate
        interp = pchip_interpolate(df=inj_dws,
                                   id_cols=["location_id", "ncode"],
                                   value_cols=draw_cols,
                                   time_col="year_id",
                                   time_vals=year_id)
        inj_dws = pd.concat([inj_dws, interp])
        inj_dws = inj_dws[inj_dws.year_id.isin(year_id)]

        seq_map = self.cv.injury_dws_by_sequela[[
            "sequela_id", "healthstate_id", "n_code"
        ]].drop_duplicates()
        inj_dws = inj_dws.merge(seq_map, left_on="ncode", right_on="n_code")
        inj_dws = inj_dws[['sequela_id', 'healthstate_id'] + draw_cols]
        return inj_dws
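The filtered read above relies on a codebase helper, but plain pandas can do the same thing for a table-format store: the to_hdf calls in these examples pass format='table' and data_columns=['location_id', 'year_id'] precisely so that rows can be selected at read time with a where clause. A small sketch; the path and location_id are illustrative:

import pandas as pd

# Filtered read from a table-format HDF5 store: only rows matching the
# where clause are deserialized.
inj_dws = pd.read_hdf("FILEPATH/inputs/lt_dw.h5", "draws",
                      where="location_id == 102")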
Example 6
import time
from multiprocessing import Pool

import pandas as pd

from core_maths.interpolate import pchip_interpolate

# DRAWCOLS, get_props, and estimation_years_from_gbd_round_id are assumed to
# be defined elsewhere in this module.


def epilepsy_any(cv):
    standard_dws = pd.read_csv(
        "FILEPATH/dw.csv")
    standard_dws.rename(
            columns={d: d.replace("draw", "dw") for d in DRAWCOLS},
            inplace=True)

    healthstates = cv.sequela_list[["modelable_entity_id", "healthstate_id"]]

    # Get country-year specific prevalences for back-calculation
    locations = cv.simulation_index['location_id']
    years = estimation_years_from_gbd_round_id(cv.gbd_round_id)
    ages = cv.simulation_index['age_group_id']
    args = [(l, y, ages, cv.gbd_round_id, cv.decomp_step_id)
            for l in locations for y in years]
    pool = Pool(20)
    prop_dfs = pool.map(get_props, args)
    pool.close()
    pool.join()
    prop_dfs = pd.concat(prop_dfs)
    prop_dfs = prop_dfs.merge(healthstates)
    prop_dfs.rename(
            columns={d: d.replace("draw", "prop") for d in DRAWCOLS},
            inplace=True)

    # Combine DWs: scale each health state's dw draws by its prevalence
    # proportion draws before collapsing
    dws_to_weight = prop_dfs.merge(
        standard_dws, on='healthstate_id', how='left')
    dws_to_weight = dws_to_weight.join(pd.DataFrame(
        data=(
            dws_to_weight.filter(like='dw_').values *
            dws_to_weight.filter(like='prop_').values),
        index=dws_to_weight.index,
        columns=DRAWCOLS))

    def combine_dws(df):
        # COMO-style multiplicative combination: assuming independence, the
        # combined weight is 1 minus the product of (1 - dw_i).
        draws_to_combine = df.filter(like='draw_')
        combined_draws = 1 - (1 - draws_to_combine).prod()
        return combined_draws

    # healthstate_id 772 - Epilepsy
    epilepsy_id = 772
    combined_dws = dws_to_weight.groupby(['location_id', 'year_id']).apply(
        combine_dws).reset_index()
    combined_dws['healthstate_id'] = epilepsy_id
    combined_dws = combined_dws[
        ['location_id', 'year_id', 'healthstate_id'] + DRAWCOLS]

    # fill in missing years
    year_id = list(range(1990, 2020))
    interp = pchip_interpolate(
        df=combined_dws,
        id_cols=['location_id', 'healthstate_id'],
        value_cols=DRAWCOLS,
        time_col="year_id",
        time_vals=year_id)
    combined_dws = pd.concat([combined_dws, interp])
    combined_dws = combined_dws[combined_dws.year_id.isin(year_id)]

    combined_dws.to_hdf(
        f"{cv.como_dir}/info/epilepsy_any_dws.h5",
        'draws',
        mode='w',
        format='table',
        data_columns=['location_id', 'year_id'])
    """
    df_19 = df[df.year_id == 2017].copy().reset_index(drop=True)
    df_19["year_id"] = 2019
    draw_cols = df.columns[df.columns.str.contains("draw")]
    df_19[draw_cols] = df_19[draw_cols] * 1.00001
    return df_19


if __name__ == "__main__":
    # Draws were pulled beforehand via the gbd17 archive env (see docs); the
    # draws directory is hardcoded for now.
    meid = ADDRESS0

    print(f"Reading csv {time.time()}")
    draws = pd.read_csv(f'{draws_dir}/{meid}_gbd_17/{location_id}.csv')
    print(f"read csv {time.time()}")
    draws_15 = pchip_interpolate(
        draws,
        ['measure_id', 'modelable_entity_id', 'location_id', 'sex_id',
         'age_group_id'],
        ['draw_{}'.format(x) for x in range(1000)],
        time_vals=[2015])
    print(f"Interp complete: {time.time()}")
    draws = pd.concat([draws, draws_15])
    # draws_19 = linear_extrap(draws)
    draws_19 = carry_19(draws)
    print(f"extrap complete: {time.time()}")
    draws = pd.concat([draws, draws_19])
    print(f"draws complete: {time.time()}")
    draws["metric_id"] = 3

    draws.to_csv(f"{draws_dir}/{meid}/{location_id}.csv", index=False)
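For intuition on combine_dws in Example 6: under the independence assumption, health states with weights 0.2 and 0.3 combine to 1 - (1 - 0.2)(1 - 0.3) = 0.44 rather than 0.5, so stacked conditions never push a disability weight past 1. A two-line check:

import pandas as pd

# Worked check of the combine_dws formula on two disability weights.
combined = 1 - (1 - pd.Series([0.2, 0.3])).prod()
print(round(combined, 2))  # 0.44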