def load_lbwsg_exposure(key: str, location: str):
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data.drop('modelable_entity_id', 'columns')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def write_utilization_rate(artifact, location):
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)
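# Minimal sketch (toy values, not part of the pipeline) of the log-normal draw
# construction used in write_utilization_rate above: the reported mean and 95%
# bounds are moved to log space, a normal is sampled there, and the draws are
# exponentiated back to the natural scale.
import numpy as np

mean, lower, upper = 1.2, 0.9, 1.6            # hypothetical visits per capita
log_mean = np.log(mean)
log_sd = (np.log(upper) - np.log(lower)) / 1.96
draws = np.exp(np.random.normal(loc=log_mean, scale=log_sd, size=1000))
# Draws are strictly positive and centred near the reported mean.
assert draws.min() > 0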
def get_disability_weight(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    if entity.kind == "cause":
        data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
        data = data.set_index(
            utilities.get_ordered_index_cols(data.columns.difference(DRAW_COLUMNS))
        )
        if entity.sequelae:
            for sequela in entity.sequelae:
                try:
                    prevalence = get_data(sequela, "prevalence", location_id)
                except DataDoesNotExistError:
                    # sequela prevalence does not exist so no point continuing with this sequela
                    continue
                disability = get_data(sequela, "disability_weight", location_id)
                disability.index = disability.index.set_levels(
                    [location_id], level="location_id"
                )
                data += prevalence * disability
        cause_prevalence = get_data(entity, "prevalence", location_id)
        data = (data / cause_prevalence).fillna(0).reset_index()
    else:  # entity.kind == 'sequela'
        try:
            data = extract.extract_data(entity, "disability_weight", location_id)
            data = utilities.normalize(data)
            cause = [c for c in causes if c.sequelae and entity in c.sequelae][0]
            data = utilities.clear_disability_weight_outside_restrictions(
                data, cause, 0.0, utility_data.get_age_group_ids()
            )
            data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
        except (IndexError, DataDoesNotExistError):
            logger.warning(
                f"{entity.name.capitalize()} has no disability weight data. All values will be 0."
            )
            data = utility_data.get_demographic_dimensions(location_id, draws=True, value=0.0)
    return data
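# Minimal numeric sketch of the cause-level aggregation in get_disability_weight
# above: the cause disability weight is the prevalence-weighted average of its
# sequela disability weights, i.e. sum(sequela_prevalence * sequela_dw) divided by
# cause prevalence. Toy scalars only; real data are per-draw, per-demographic frames.
seq_prevalence = [0.01, 0.03]
seq_dw = [0.6, 0.2]
cause_prevalence = 0.04
cause_dw = sum(p * w for p, w in zip(seq_prevalence, seq_dw)) / cause_prevalence
assert abs(cause_dw - 0.3) < 1e-9   # (0.006 + 0.006) / 0.04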
def get_deaths(entity: Cause, location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "deaths", location_id)
    data = utilities.filter_data_by_restrictions(
        data, entity, "yll", utility_data.get_age_group_ids()
    )
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld', utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds, interval_column='age', split_column_prefix='age')
    ylds = split_interval(ylds, interval_column='year', split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    data[data > 50] = 50  # Russia has absurdly high values in some of the data and it breaks validation.
    artifact.write(key, data)
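# Minimal numeric sketch of the two ratios computed in write_ckd_data above: the
# per-draw disability weight is YLDs / prevalence and the excess mortality rate is
# CSMR / prevalence, with division-by-zero strata filled with 0. Toy values only;
# the real frames are indexed by location/sex/age/year with draw columns.
import pandas as pd

prevalence = pd.Series([0.02, 0.0])   # second stratum has no prevalent cases
csmr = pd.Series([0.001, 0.0])
ylds = pd.Series([0.004, 0.0])

emr = (csmr / prevalence).fillna(0)   # 0.05 deaths per person-year among the prevalent
dw = (ylds / prevalence).fillna(0)    # 0.2 average disability weight among the prevalent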
def get_exposure(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    data = extract.extract_data(entity, "exposure", location_id)
    data = data.drop("modelable_entity_id", "columns")

    if entity.name in EXTRA_RESIDUAL_CATEGORY:
        cat = EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query("parameter == @cat").index)
        data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)

    if entity.kind in ["risk_factor", "alternative_risk_factor"]:
        data = utilities.filter_data_by_restrictions(
            data, entity, "outer", utility_data.get_age_group_ids()
        )

    if entity.distribution in ["dichotomous", "ordered_polytomous", "unordered_polytomous"]:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]
        # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
        data = pd.concat(
            [
                utilities.normalize(exposed, fill_value=0),
                utilities.normalize(unexposed, fill_value=1),
            ],
            ignore_index=True,
        )

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(DRAW_COLUMNS + ["parameter"]))
        sums = data.groupby(cols)[DRAW_COLUMNS].sum()
        data = (
            data.groupby("parameter")
            .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
            .reset_index()
        )
    else:
        data = utilities.normalize(data, fill_value=0)

    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS + ["parameter"])
    return data
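# Minimal sketch of the categorical renormalization step used in get_exposure (and
# the LBWSG/IKF loaders above): within each demographic stratum, the per-category
# draws are divided by the across-category sum so the exposure categories sum to 1.
# Toy frame with one draw column and one index column; real data use the full
# demographic columns and all draw columns.
import pandas as pd

toy = pd.DataFrame({
    'age_group_id': [2, 2, 3, 3],
    'parameter':    ['cat1', 'cat2', 'cat1', 'cat2'],
    'draw_0':       [0.3, 0.9, 0.2, 0.2],
})
cols = ['age_group_id']
sums = toy.groupby(cols)[['draw_0']].sum()
normalized = (toy.groupby('parameter')
              .apply(lambda df: df.set_index(cols).loc[:, ['draw_0']].divide(sums))
              .reset_index())
# Each age group's categories now sum to 1 (0.25 + 0.75 and 0.5 + 0.5).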
def _load_em_from_meid(location, meid, measure):
    location_id = utility_data.get_location_id(location)
    data = gbd.get_modelable_entity_draws(meid, location_id)
    data = data[data.measure_id == vi_globals.MEASURES[measure]]
    data = vi_utils.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = vi_utils.reshape(data)
    data = vi_utils.scrub_gbd_conventions(data, location)
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    return vi_utils.sort_hierarchical_data(data)
def get_exposure_standard_deviation(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    data = extract.extract_data(entity, "exposure_standard_deviation", location_id)
    data = data.drop("modelable_entity_id", "columns")

    exposure = extract.extract_data(entity, "exposure", location_id)
    valid_age_groups = utilities.get_exposure_and_restriction_ages(exposure, entity)
    data = data[data.age_group_id.isin(valid_age_groups)]

    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
def get_estimate(entity: Covariate, location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "estimate", location_id)

    key_columns = ["location_id", "year_id"]
    if entity.by_age:
        key_columns.append("age_group_id")
    if entity.by_sex:
        key_columns.append("sex_id")

    data = data.filter(key_columns + COVARIATE_VALUE_COLUMNS)
    data = utilities.normalize(data)
    data = utilities.wide_to_long(data, COVARIATE_VALUE_COLUMNS, var_name="parameter")
    return data
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location

    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)
    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)
    data = data.drop('modelable_entity_id', 'columns')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "prevalence", location_id)
    if entity.kind == "cause":
        restrictions_entity = entity
    else:  # sequela
        cause = [c for c in causes if c.sequelae and entity in c.sequelae][0]
        restrictions_entity = cause

    data = utilities.filter_data_by_restrictions(
        data, restrictions_entity, "yld", utility_data.get_age_group_ids()
    )
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
def load_lri_birth_prevalence_from_meid(_, location):
    """Ignore the first argument to fit into the get_data model."""
    location_id = utility_data.get_location_id(location)
    data = get_draws('modelable_entity_id', project_globals.LRI_BIRTH_PREVALENCE_MEID,
                     source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                     age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                     measure_id=vi_globals.MEASURES['Prevalence'],
                     gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                     location_id=location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.normalize(data, fill_value=0)
    idx_columns = list(vi_globals.DEMOGRAPHIC_COLUMNS)
    idx_columns.remove('age_group_id')
    data = data.filter(idx_columns + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_exposure_distribution_weights(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    data = extract.extract_data(entity, "exposure_distribution_weights", location_id)

    exposure = extract.extract_data(entity, "exposure", location_id)
    valid_ages = utilities.get_exposure_and_restriction_ages(exposure, entity)

    data.drop("age_group_id", axis=1, inplace=True)
    df = []
    for age_id in valid_ages:
        copied = data.copy()
        copied["age_group_id"] = age_id
        df.append(copied)
    data = pd.concat(df)
    data = utilities.normalize(data, fill_value=0, cols_to_fill=DISTRIBUTION_COLUMNS)
    data = data.filter(
        ["location_id", "sex_id", "age_group_id", "year_id"] + DISTRIBUTION_COLUMNS
    )
    data = utilities.wide_to_long(data, DISTRIBUTION_COLUMNS, var_name="parameter")
    return data
def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION,
                       dtype={'location_id': np.int64, 'sex_id': np.int64,
                              'age_group_id': np.int64, 'year_id': np.int64,
                              'outpatient_visits_per_cap_mean': np.float64,
                              'outpatient_visits_per_cap_95_upper': np.float64,
                              'outpatient_visits_per_cap_95_lower': np.float64})
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
def get_utilization_rate(entity: HealthcareEntity, location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "utilization_rate", location_id)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
def process_exposure(data: pd.DataFrame, key: str,
                     entity: Union[RiskFactor, AlternativeRiskFactor],
                     location: str, gbd_round_id: int,
                     age_group_ids: List[int] = None) -> pd.DataFrame:
    data['rei_id'] = entity.gbd_id

    # from vivarium_inputs.extract.extract_exposure
    allowable_measures = [
        vi_globals.MEASURES['Proportion'],
        vi_globals.MEASURES['Continuous'],
        vi_globals.MEASURES['Prevalence'],
    ]
    proper_measure_id = set(data.measure_id).intersection(allowable_measures)
    if len(proper_measure_id) != 1:
        raise vi_globals.DataAbnormalError(
            f'Exposure data have {len(proper_measure_id)} measure id(s). '
            f'Data should have exactly one id out of {allowable_measures} '
            f'but came back with {proper_measure_id}.')
    data = data[data.measure_id == proper_measure_id.pop()]

    # from vivarium_inputs.core.get_exposure
    data = data.drop('modelable_entity_id', 'columns')
    if entity.name in vi_globals.EXTRA_RESIDUAL_CATEGORY:
        # noinspection PyUnusedLocal
        cat = vi_globals.EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query('parameter == @cat').index)
        data[vi_globals.DRAW_COLUMNS] = data[vi_globals.DRAW_COLUMNS].clip(
            lower=vi_globals.MINIMUM_EXPOSURE_VALUE)

    if entity.distribution in ['dichotomous', 'ordered_polytomous', 'unordered_polytomous']:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]
        # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
        data = pd.concat([
            normalize_age_and_years(exposed, fill_value=0, gbd_round_id=gbd_round_id),
            normalize_age_and_years(unexposed, fill_value=1, gbd_round_id=gbd_round_id)
        ], ignore_index=True)

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
        data = data.set_index(cols + ['parameter'])
        sums = (data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
                .reindex(index=data.index))
        data = data.divide(sums).reset_index()
    else:
        data = vi_utils.normalize(data, fill_value=0)

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = validate_and_reshape_gbd_data(data, entity, key, location, gbd_round_id, age_group_ids)
    return data
def get_structure(entity: Population, location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "structure", location_id)
    data = data.drop("run_id", axis="columns").rename(columns={"population": "value"})
    data = utilities.normalize(data)
    return data
def get_demographic_dimensions(entity: Population, location_id: int) -> pd.DataFrame:
    demographic_dimensions = utility_data.get_demographic_dimensions(location_id)
    demographic_dimensions = utilities.normalize(demographic_dimensions)
    return demographic_dimensions
def get_birth_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "birth_prevalence", location_id)
    data = data.filter(["year_id", "sex_id", "location_id"] + DRAW_COLUMNS)
    data = utilities.normalize(data, fill_value=0)
    return data