def load_lbwsg_exposure(key: str, location: str):
    """Load LBWSG (low birth weight and short gestation) exposure for a location.

    Reads cached exposure draws from disk, drops the spurious residual
    category, applies risk-level age/sex restrictions, normalizes exposed and
    unexposed categories, rescales so all categories sum to 1, validates, and
    returns simulation-ready hierarchical data.
    """
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    # FIX: pass `columns=` by keyword — positional `axis` for DataFrame.drop is
    # deprecated and removed in pandas 2.0.
    data = data.drop(columns='modelable_entity_id')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(
        data, risk_factors.low_birth_weight_and_short_gestation, 'outer',
        utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_lbwsg_paf(key: str, location: str):
    """Load LBWSG population attributable fraction data for a location.

    Reads cached PAF draws from disk, keeps only percent-metric YLL rows,
    applies cause-level age restrictions per cause, normalizes per affected
    entity/measure, validates, and returns simulation-ready hierarchical data.
    """
    source = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(source)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    cause_by_id = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    restricted_pieces = []
    for (cause_id, measure_id), group in data.groupby(['cause_id', 'measure_id']):
        measure_name = 'yll' if measure_id == vi_globals.MEASURES['YLLs'] else 'yld'
        restricted_pieces.append(utilities.filter_data_by_restrictions(
            group, cause_by_id[cause_id], measure_name, utility_data.get_age_group_ids()))
    data = pd.concat(restricted_pieces, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    yll_rows = data['measure_id'] == vi_globals.MEASURES['YLLs']
    data.loc[yll_rows, 'affected_measure'] = 'excess_mortality_rate'
    data = (data
            .groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))

    keep = (vi_globals.DEMOGRAPHIC_COLUMNS
            + ['affected_entity', 'affected_measure']
            + vi_globals.DRAW_COLUMNS)
    data = data.filter(keep)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_deaths(entity: Cause, location_id: int) -> pd.DataFrame:
    """Pull death draws for ``entity``, apply YLL restrictions, and tidy columns."""
    deaths = extract.extract_data(entity, "deaths", location_id)
    deaths = utilities.filter_data_by_restrictions(
        deaths, entity, "yll", utility_data.get_age_group_ids()
    )
    deaths = utilities.normalize(deaths, fill_value=0)
    return deaths.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
def write_ckd_data(artifact, location):
    """Write all chronic kidney disease measures for ``location`` into ``artifact``.

    Writes restrictions, CSMR, and prevalence directly from the loader, derives
    disability weight (YLDs / prevalence) and excess mortality rate
    (CSMR / prevalence), and writes a capped incidence rate.
    """
    load = get_load(location)

    # Metadata
    # FIX: these keys had a needless `f''` prefix with no placeholders.
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld', utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds, interval_column='age', split_column_prefix='age')
    ylds = split_interval(ylds, interval_column='year', split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    # Disability weight = YLDs / prevalence; zero out 0/0 (NaN) and x/0 (inf).
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    # EMR = CSMR / prevalence with the same divide-by-zero guard.
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)
def _load_prevalence(entity, location_id: int, entity_type: str):
    """Load 2016 prevalence draws for ``entity`` and reshape for downstream use."""
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    draws = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    draws = draws[draws.measure_id == vi_globals.MEASURES['Prevalence']]
    # NOTE(review): restrictions always come from diarrheal_diseases regardless
    # of `entity` — presumably every caller passes a diarrhea-related entity;
    # confirm against call sites.
    draws = utilities.filter_data_by_restrictions(
        draws, causes.diarrheal_diseases, 'yld', utility_data.get_age_group_ids())
    # Use latest GBD results for all data.
    draws = draws[draws.year_id == 2016].drop(columns='year_id')
    draws = standardize.normalize(draws, fill_value=0)
    keep = vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS
    return utilities.reshape(draws.filter(keep))
def get_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    """Pull prevalence draws for a cause or sequela and tidy them.

    A sequela carries no restrictions of its own, so its parent cause's
    restrictions are applied instead.
    """
    raw = extract.extract_data(entity, "prevalence", location_id)
    if entity.kind == "cause":
        restrictions_entity = entity
    else:  # sequela: find the cause that lists this sequela
        restrictions_entity = [c for c in causes if c.sequelae and entity in c.sequelae][0]
    filtered = utilities.filter_data_by_restrictions(
        raw, restrictions_entity, "yld", utility_data.get_age_group_ids()
    )
    filtered = utilities.normalize(filtered, fill_value=0)
    return filtered.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    """Load impaired-kidney-function PAF data and prepare it for simulation.

    Drops yll-only causes, filters by relative risk per cause, applies
    cause-level age restrictions, normalizes per affected entity/measure,
    validates, and returns hierarchical data.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction',
                                location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    # Drop yll-only causes from both the PAF and RR data.
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data
            .groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    cause_by_id = {c.gbd_id: c for c in causes}
    # We filter paf age groups by cause level restrictions.
    pieces = []
    for (cause_id, measure_id), group in data.groupby(['cause_id', 'measure_id']):
        measure_name = 'yll' if measure_id == vi_globals.MEASURES['YLLs'] else 'yld'
        pieces.append(utilities.filter_data_by_restrictions(
            group, cause_by_id[cause_id], measure_name, utility_data.get_age_group_ids()))
    data = pd.concat(pieces, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'],
             'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data
            .groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))

    keep = (vi_globals.DEMOGRAPHIC_COLUMNS
            + ['affected_entity', 'affected_measure']
            + vi_globals.DRAW_COLUMNS)
    data = data.filter(keep)
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_exposure(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    """Pull exposure draws for a risk factor and normalize the categories.

    For categorical (dichotomous/polytomous) distributions, exposed and
    unexposed categories are normalized separately and then rescaled so all
    categories sum to 1 within each demographic group; continuous
    distributions are simply normalized.
    """
    data = extract.extract_data(entity, "exposure", location_id)
    # FIX: pass `columns=` by keyword — positional `axis` for DataFrame.drop is
    # deprecated and removed in pandas 2.0.
    data = data.drop(columns="modelable_entity_id")

    if entity.name in EXTRA_RESIDUAL_CATEGORY:
        # Drop the extra residual category and keep exposures strictly positive.
        cat = EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query("parameter == @cat").index)
        data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)

    if entity.kind in ["risk_factor", "alternative_risk_factor"]:
        data = utilities.filter_data_by_restrictions(
            data, entity, "outer", utility_data.get_age_group_ids()
        )

    if entity.distribution in ["dichotomous", "ordered_polytomous", "unordered_polytomous"]:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]
        # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
        data = pd.concat(
            [
                utilities.normalize(exposed, fill_value=0),
                utilities.normalize(unexposed, fill_value=1),
            ],
            ignore_index=True,
        )

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(DRAW_COLUMNS + ["parameter"]))
        sums = data.groupby(cols)[DRAW_COLUMNS].sum()
        data = (
            data.groupby("parameter")
            .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
            .reset_index()
        )
    else:
        data = utilities.normalize(data, fill_value=0)

    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS + ["parameter"])
    return data
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    """Load impaired kidney function exposure and normalize its categories.

    Pulls raw exposure from GBD, validates it, applies risk-level
    restrictions, normalizes exposed/unexposed categories, rescales so all
    categories sum to 1, validates for simulation, and returns hierarchical
    data.
    """
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)
    # FIX: pass `columns=` by keyword — positional `axis` for DataFrame.drop is
    # deprecated and removed in pandas 2.0.
    data = data.drop(columns='modelable_entity_id')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def load_shigella_remission_rate(key: EntityKey, location: str):
    """Load the diarrheal diseases remission rate (used for the shigella model)."""
    location_id = extract.get_location_id(location)
    draws = extract.get_modelable_entity_draws(causes.diarrheal_diseases.dismod_id, location_id)
    draws = draws[draws.measure_id == vi_globals.MEASURES['Remission rate']]
    draws = utilities.filter_data_by_restrictions(
        draws, causes.diarrheal_diseases, 'yld', utility_data.get_age_group_ids())
    # Use latest GBD results for all data.
    draws = draws[draws.year_id == 2016].drop(columns='year_id')
    draws = standardize.normalize(draws, fill_value=0)
    draws = draws.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    draws = utilities.reshape(draws)
    draws = utilities.scrub_gbd_conventions(draws, location)
    for column in ('age', 'year'):
        draws = utilities.split_interval(draws, interval_column=column,
                                         split_column_prefix=column)
    return utilities.sort_hierarchical_data(draws)
def get_population_attributable_fraction(
    entity: Union[RiskFactor, Etiology], location_id: int
) -> pd.DataFrame:
    """Pull population attributable fraction (PAF) draws for a risk or etiology.

    Risk factors: drops yll-only causes, filters by relative risk per cause,
    and applies cause-level age restrictions per (cause, measure) group.
    Etiologies: applies the parent cause's restrictions and zeroes out any
    negative draws (with a warning). Both paths then map measure ids to
    affected measures, normalize, and trim to the standard columns.
    """
    causes_map = {c.gbd_id: c for c in causes}
    if entity.kind == "risk_factor":
        data = extract.extract_data(entity, "population_attributable_fraction", location_id)
        relative_risk = extract.extract_data(entity, "relative_risk", location_id)

        # FIXME: we don't currently support yll-only causes so I'm dropping them because the data in some cases is
        # very messed up, with mort = morb = 1 (e.g., aortic aneurysm in the RR data for high systolic bp) -
        # 2/8/19 K.W.
        yll_only_causes = set([c.gbd_id for c in causes if c.restrictions.yll_only])
        data = data[~data.cause_id.isin(yll_only_causes)]
        relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

        data = (
            data.groupby("cause_id", as_index=False)
            .apply(filter_by_relative_risk, relative_risk)
            .reset_index(drop=True)
        )

        temp = []
        # We filter paf age groups by cause level restrictions.
        for (c_id, measure), df in data.groupby(["cause_id", "measure_id"]):
            cause = causes_map[c_id]
            measure = "yll" if measure == MEASURES["YLLs"] else "yld"
            df = utilities.filter_data_by_restrictions(
                df, cause, measure, utility_data.get_age_group_ids()
            )
            temp.append(df)
        data = pd.concat(temp, ignore_index=True)
    else:  # etiology
        data = extract.extract_data(
            entity, "etiology_population_attributable_fraction", location_id
        )
        # An etiology has no restrictions of its own; use the cause it belongs to.
        cause = [c for c in causes if entity in c.etiologies][0]
        data = utilities.filter_data_by_restrictions(
            data, cause, "inner", utility_data.get_age_group_ids()
        )
        if np.any(data[DRAW_COLUMNS] < 0):
            logger.warning(
                f"{entity.name.capitalize()} has negative values for paf. These will be replaced with 0."
            )
            # Clamp negative draws to zero, keeping the demographic columns intact.
            other_cols = [c for c in data.columns if c not in DRAW_COLUMNS]
            data.set_index(other_cols, inplace=True)
            data = data.where(data[DRAW_COLUMNS] > 0, 0).reset_index()

    data = utilities.convert_affected_entity(data, "cause_id")
    # Map GBD measure ids onto the simulation quantity each PAF affects.
    data.loc[
        data["measure_id"] == MEASURES["YLLs"], "affected_measure"
    ] = "excess_mortality_rate"
    data.loc[data["measure_id"] == MEASURES["YLDs"], "affected_measure"] = "incidence_rate"
    data = (
        data.groupby(["affected_entity", "affected_measure"])
        .apply(utilities.normalize, fill_value=0)
        .reset_index(drop=True)
    )
    data = data.filter(
        DEMOGRAPHIC_COLUMNS + ["affected_entity", "affected_measure"] + DRAW_COLUMNS
    )
    return data