def interpolate_year(data):
    """Fill in missing years of *data* via PCHIP interpolation.

    Returns the original rows concatenated with the interpolated rows.
    """
    # Hide the central comp dependency unless required.
    from core_maths.interpolate import pchip_interpolate

    # Every non-draw column identifies a series to interpolate along.
    id_cols = list(set(data.columns).difference(DRAW_COLUMNS))
    interpolated = pchip_interpolate(data, id_cols, DRAW_COLUMNS)
    return pd.concat([data, interpolated], sort=True)
def read_single_en_injury(self, modelable_entity_id, model_version_id,
                          measure_id=None):
    """Read E/N injury draws for one modelable entity / model version.

    Interpolates any requested years missing from the estimation years
    and resamples draws when the requested draw count differs from the
    count found in the source.

    Args:
        modelable_entity_id: ME to read.
        model_version_id: model version of the ME.
        measure_id: measures to read; defaults to YLD, incidence, and
            short/long-term prevalence.

    Returns:
        DataFrame of draws restricted to the requested years.

    Raises:
        Exception: if the source returns no data.
    """
    # BUG FIX: the default was a mutable list literal, shared across
    # calls; use a None sentinel and build a fresh list per call.
    if measure_id is None:
        measure_id = [measures.YLD, measures.INCIDENCE,
                      measures.ST_PREVALENCE, measures.LT_PREVALENCE]
    injury_source = (
        self._ss_factory.get_en_injuries_modelable_entity_source(
            modelable_entity_id, model_version_id))
    dim = self.dimensions.get_simulation_dimensions(
        measure_id=measure_id, at_birth=False)

    # get filters w/ added years if interpolation is needed
    filters = dim.index_dim.to_dict()["levels"]
    req_years = filters["year_id"]
    if not set(req_years).issubset(set(self._estim_years)):
        filters["year_id"] = list(set(req_years + self._estim_years))

    # read data
    df = injury_source.content(filters=filters)
    if df.empty:
        raise Exception(f"No data returned for ME {modelable_entity_id}, "
                        f"model version {model_version_id}.")
    draw_cols = [col for col in df.columns if "draw_" in col]

    # add indices to dimensions object from draw source transforms
    dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
    dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
    dim.index_dim.add_level("healthstate_id",
                            df.healthstate_id.unique().tolist())
    dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

    # interpolate missing years
    if not set(df.year_id.unique()).issuperset(set(req_years)):
        interp_df = pchip_interpolate(df=df,
                                      id_cols=dim.index_names,
                                      value_cols=draw_cols,
                                      time_col="year_id",
                                      time_vals=req_years)
        df = df[df.year_id.isin(req_years)]
        # DataFrame.append was removed in pandas 2.0; concat is the
        # drop-in equivalent.
        df = pd.concat([df, interp_df])
    else:
        df = df[df.year_id.isin(req_years)]

    # resample if the requested draw count differs from the source's
    if len(dim.data_list()) != len(draw_cols):
        gbdizer = gbdize.GBDizeDataFrame(dim)
        df = gbdizer.correlated_percentile_resample(df)

    return df
def to_como(como_dir, location_set_id, gbd_round_id):
    """Prepare urolithiasis symptomatic disability weights for a COMO run.

    Reads the raw DW csv, fills missing locations (region averages, then
    nearest available ancestor), extends years to 1990-2019 via PCHIP
    interpolation, and writes the result to the run's info directory.

    Args:
        como_dir: root directory of the COMO run.
        location_set_id: location set used to build the location tree.
        gbd_round_id: GBD round for the location tree.
    """
    df = pd.read_csv("FILEPATH/urolith_symp_dws.csv")

    # fill for new locs: start with region (level_2) averages
    lt = dbtrees.loctree(location_set_id=location_set_id,
                         gbd_round_id=gbd_round_id)
    locmap = lt.flatten()
    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id', right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id']).mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    # DataFrame.append was removed in pandas 2.0; use concat
    df = pd.concat([df, reg_avgs])

    # For any leaf still missing, walk up the hierarchy (deepest level
    # first) and copy the nearest ancestor's rows.
    # PERF FIX: membership was tested against list(df.location_id) on
    # every iteration (O(n) scan each time); a set is O(1) per test and
    # is safe to precompute because each leaf_node is unique, so rows
    # added inside the loop never affect later membership tests.
    filllen = 0
    present_locs = set(df.location_id)
    for ln in locmap.leaf_node.unique():
        if ln not in present_locs:
            for i in reversed(range(6)):
                fill_loc = locmap.loc[
                    locmap.leaf_node == ln, 'level_%s' % i].squeeze()
                # .copy() so the location_id assignment below cannot
                # trigger chained-assignment (SettingWithCopy) issues
                filldf = df[df.location_id == fill_loc].copy()
                if len(filldf) > 0:
                    filldf['location_id'] = ln
                    df = pd.concat([df, filldf])
                    filllen = filllen + 1
                    break
    df = df[df.location_id.isin([l.id for l in lt.leaves()])]

    # fill in missing years: carry 2013 forward as a 2019 anchor, then
    # interpolate the full 1990-2019 span
    extra = df.query("year_id == 2013").copy()
    extra['year_id'] = 2019
    df = pd.concat([df, extra])
    df = df.filter(regex='(.*_id|draw_)')
    interp = pchip_interpolate(
        df=df,
        id_cols=['location_id', 'healthstate_id'],
        value_cols=['draw_%s' % d for d in range(1000)],
        time_col="year_id",
        time_vals=list(range(1990, 2020)))
    df = pd.concat([df, interp])
    df = df[df.year_id.isin(list(range(1990, 2020)))]

    # save for como run
    df.to_hdf(f"{como_dir}/info/urolith_dws.h5", 'draws', mode='w',
              format='table', data_columns=['location_id', 'year_id'])
def read_single_en_injury(self, modelable_entity_id, model_version_id,
                          measure_id=None):
    """Read E/N injury draws for one modelable entity / model version.

    Interpolates any requested years missing from the estimation years
    and resamples when fewer than 1000 draws are requested.

    Args:
        modelable_entity_id: ME to read.
        model_version_id: model version of the ME.
        measure_id: measures to read; defaults to [3, 6, 35, 36].

    Returns:
        DataFrame of draws restricted to the requested years.

    Raises:
        Exception: if the source returns no data.
    """
    # BUG FIX: the default was a mutable list literal, shared across
    # calls; use a None sentinel and build a fresh list per call.
    if measure_id is None:
        measure_id = [3, 6, 35, 36]
    injury_source = (
        self._ss_factory.get_en_injuries_modelable_entity_source(
            modelable_entity_id, model_version_id))
    dim = self.dimensions.get_simulation_dimensions(measure_id)

    # get filters w/ added years if interpolation is needed
    filters = dim.index_dim.to_dict()["levels"]
    req_years = filters["year_id"]
    if not set(req_years).issubset(set(self._estim_years)):
        filters["year_id"] = list(set(req_years + self._estim_years))

    # read data
    df = injury_source.content(filters=filters)
    if df.empty:
        raise Exception("No data returned for meid:{} and mvid:{}".format(
            modelable_entity_id, model_version_id))

    # add indices to dimensions object from draw source transforms
    dim.index_dim.add_level("sequela_id", df.sequela_id.unique().tolist())
    dim.index_dim.add_level("cause_id", df.cause_id.unique().tolist())
    dim.index_dim.add_level("healthstate_id",
                            df.healthstate_id.unique().tolist())
    dim.index_dim.add_level("rei_id", df.rei_id.unique().tolist())

    # interpolate missing years
    if not set(df.year_id.unique()).issuperset(set(req_years)):
        interp_df = pchip_interpolate(df=df,
                                      id_cols=dim.index_names,
                                      value_cols=self._draw_cols,
                                      time_col="year_id",
                                      time_vals=req_years)
        df = df[df.year_id.isin(req_years)]
        # DataFrame.append was removed in pandas 2.0; concat is the
        # drop-in equivalent.
        df = pd.concat([df, interp_df])
    else:
        df = df[df.year_id.isin(req_years)]

    # resample if ndraws is less than 1000
    if len(dim.data_list()) != 1000:
        gbdizer = gbdize.GBDizeDataFrame(dim)
        df = gbdizer.correlated_percentile_resample(df)

    return df
def _get_inj_dws(self):
    """Load long-term injury disability weights for this location.

    Reads draws from the lt_dw h5 for the dimensions' location,
    interpolates to the requested years, and maps N-codes onto
    sequela/healthstate ids.

    Returns:
        DataFrame with sequela_id, healthstate_id, and 1000 draw columns.
    """
    draw_cols = ["draw_{}".format(i) for i in range(1000)]
    loc_id = self.dims.index_dim.get_level("location_id")[0]
    year_id = self.dims.index_dim.get_level("year_id")

    fp = ("FILEPATH/inputs/lt_dw.h5")
    inj_dws = read_hdf(fp, "draws", hdf_filters={"location_id": loc_id})
    inj_dws = inj_dws.reset_index()

    # interpolate to the requested years, then keep only those years.
    # DataFrame.append was removed in pandas 2.0; use concat.
    interp = pchip_interpolate(df=inj_dws,
                               id_cols=["location_id", "ncode"],
                               value_cols=draw_cols,
                               time_col="year_id",
                               time_vals=year_id)
    inj_dws = pd.concat([inj_dws, interp])
    inj_dws = inj_dws[inj_dws.year_id.isin(year_id)]

    # map N-codes onto sequela/healthstate ids
    seq_map = self.cv.injury_dws_by_sequela[[
        "sequela_id", "healthstate_id", "n_code"
    ]].drop_duplicates()
    inj_dws = inj_dws.merge(seq_map, left_on="ncode", right_on="n_code")
    inj_dws = inj_dws[['sequela_id', 'healthstate_id'] + draw_cols]
    return inj_dws
def epilepsy_any(cv):
    """Compute combined "any epilepsy" disability weights per location-year.

    Weights each epilepsy healthstate's standard DW draws by its
    country-year prevalence proportion, combines them multiplicatively
    (1 - prod(1 - dw_i), healthstate 772), interpolates to 1990-2019,
    and writes the result to the run's info directory.

    Args:
        cv: COMO version object providing sequela_list, simulation_index,
            gbd_round_id, decomp_step_id, and como_dir.
    """
    standard_dws = pd.read_csv(
        "FILEPATH/dw.csv")
    standard_dws.rename(
        columns={d: d.replace("draw", "dw") for d in DRAWCOLS},
        inplace=True)

    healthstates = cv.sequela_list[["modelable_entity_id", "healthstate_id"]]

    # Get country-year specific prevalences for back-calculation
    locations = cv.simulation_index['location_id']
    years = estimation_years_from_gbd_round_id(cv.gbd_round_id)
    ages = cv.simulation_index['age_group_id']
    args = [(l, y, ages, cv.gbd_round_id, cv.decomp_step_id)
            for l in locations for y in years]
    # ROBUSTNESS FIX: close/join the pool even if map() raises, so
    # worker processes are not leaked on error.
    pool = Pool(20)
    try:
        prop_dfs = pool.map(get_props, args)
    finally:
        pool.close()
        pool.join()
    prop_dfs = pd.concat(prop_dfs)
    prop_dfs = prop_dfs.merge(healthstates)
    prop_dfs.rename(
        columns={d: d.replace("draw", "prop") for d in DRAWCOLS},
        inplace=True)

    # Combine DWs: weight each healthstate's dw draws by its prevalence
    # proportion; products are stored back under the draw_* names.
    dws_to_weight = prop_dfs.merge(
        standard_dws, on='healthstate_id', how='left')
    dws_to_weight = dws_to_weight.join(pd.DataFrame(
        data=(
            dws_to_weight.filter(like='dw_').values *
            dws_to_weight.filter(like='prop_').values),
        index=dws_to_weight.index,
        columns=DRAWCOLS))

    def combine_dws(df):
        # multiplicative combination across healthstates: 1 - prod(1 - dw)
        draws_to_combine = df.filter(like='draw_')
        combined_draws = 1 - (1 - draws_to_combine).prod()
        return combined_draws

    # healthstate_id 772 - Epilepsy
    epilepsy_id = 772
    combined_dws = dws_to_weight.groupby(['location_id', 'year_id']).apply(
        combine_dws).reset_index()
    combined_dws['healthstate_id'] = epilepsy_id
    combined_dws = combined_dws[
        ['location_id', 'year_id', 'healthstate_id'] + DRAWCOLS]
    # NOTE(review): this write is fully overwritten by the final to_hdf
    # below (both use mode='w'); kept to preserve existing side effects.
    combined_dws.to_hdf(
        f"{cv.como_dir}/info/epilepsy_any_dws.h5", 'draws', mode='w',
        format='table', data_columns=['location_id', 'year_id'])

    # fill in missing years via PCHIP interpolation across 1990-2019.
    # DataFrame.append was removed in pandas 2.0; use concat.
    year_id = list(range(1990, 2020))
    interp = pchip_interpolate(
        df=combined_dws,
        id_cols=['location_id', 'healthstate_id'],
        value_cols=DRAWCOLS,
        time_col="year_id",
        time_vals=year_id)
    combined_dws = pd.concat([combined_dws, interp])
    combined_dws = combined_dws[combined_dws.year_id.isin(year_id)]
    combined_dws.to_hdf(
        f"{cv.como_dir}/info/epilepsy_any_dws.h5", 'draws',
        mode='w', format='table', data_columns=['location_id', 'year_id'])
"""
    # Tail of a function (def line not visible here): copy 2017 rows
    # forward as the 2019 estimate.
    df_19 = df[df.year_id == 2017].copy().reset_index(drop=True)
    df_19["year_id"] = 2019
    draw_cols = df.columns[df.columns.str.contains("draw")]
    # Nudge the carried-forward draws by a tiny factor — presumably so
    # 2019 is not byte-identical to 2017; TODO confirm intent.
    df_19[draw_cols] = df_19[draw_cols] * 1.00001
    return df_19


if __name__ == "__main__":
    # Draws pulled via gbd17 archive env prior (see docs), draws dir hardcode for now.
    # NOTE(review): meid/draws_dir/location_id are assumed to be defined
    # elsewhere in this file (ADDRESS0 etc.) — not visible in this chunk.
    meid = ADDRESS0
    print(f"Reading csv {time.time()}")
    draws = pd.read_csv(f'{draws_dir}/{meid}_gbd_17/{location_id}.csv')
    print(f"read csv {time.time()}")
    # Interpolate a 2015 estimate from the surrounding years.
    draws_15 = pchip_interpolate(draws, [
        'measure_id', 'modelable_entity_id', 'location_id', 'sex_id',
        'age_group_id'
    ], ['draw_{}'.format(x) for x in range(1000)], time_vals=[2015])
    print(f"Interp complete: {time.time()}")
    draws = draws.append(draws_15)
    # draws_19 = linear_extrap(draws)
    # Carry 2017 forward to 2019 (see function tail above).
    draws_19 = carry_19(draws)
    print(f"extrap complete: {time.time()}")
    draws = draws.append(draws_19)
    print(f"draws complete: {time.time()}")
    draws["metric_id"] = 3
    draws.to_csv(f"{draws_dir}/{meid}/{location_id}.csv", index=False)