def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(ylds, causes.chronic_kidney_disease,
                                                 'yld', utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds, interval_column='age', split_column_prefix='age')
    ylds = split_interval(ylds, interval_column='year', split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate', location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)

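# Illustrative sketch (not part of the original module): the disability-weight and
# excess-mortality computations above rely on the same "safe ratio" pattern, where
# cells with zero prevalence would otherwise produce NaN (0/0) or +/-inf (x/0).
# Toy example with a single hypothetical draw column:
def _demo_safe_ratio():
    import numpy as np
    import pandas as pd

    numerator = pd.DataFrame({'draw_0': [0.02, 0.0, 0.05]})
    denominator = pd.DataFrame({'draw_0': [0.10, 0.0, 0.0]})
    # 0/0 -> NaN and 0.05/0 -> inf; both are coerced to 0 so downstream
    # interpolation and validation never see non-finite values.
    ratio = (numerator / denominator).fillna(0).replace([np.inf, -np.inf], 0)
    assert np.allclose(ratio['draw_0'], [0.2, 0.0, 0.0])
    return ratio
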
def validate_and_reshape_gbd_data(
        data: pd.DataFrame,
        entity: ModelableEntity,
        key: EntityKey,
        location: str,
        gbd_round_id: int,
        age_group_ids: List[int] = None) -> pd.DataFrame:
    # from vivarium_inputs.core.get_data
    data = vi_utils.reshape(data, value_cols=vi_globals.DRAW_COLUMNS)

    # from interface.get_measure
    data = _scrub_gbd_conventions(data, location, age_group_ids)
    estimation_years = get_gbd_estimation_years(gbd_round_id)
    validation_years = pd.DataFrame(
        {'year_start': range(min(estimation_years), max(estimation_years) + 1)})
    validation_years['year_end'] = validation_years['year_start'] + 1
    # validate_for_simulation(data, entity, key.measure, location, years=validation_years,
    #                         age_bins=get_gbd_age_bins(age_group_ids))
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    data = vi_utils.sort_hierarchical_data(data).droplevel('location')
    return data

def write_utilization_rate(artifact, location):
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    # Log-scale spread derived from the 95% UI bounds.
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)

def load_lbwsg_exposure(key: str, location: str):
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data.drop(columns='modelable_entity_id')
    # LBWSG data has an extra residual category added by get_draws.
    data = data[data.parameter != 'cat124']
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

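# Illustrative sketch (not in the original source): the exposure renormalization
# above divides each category's draws by the per-demographic-group sum so that
# categorical exposures sum to 1. Minimal toy version with hypothetical columns,
# where 'age_group_id' stands in for the full demographic index:
def _demo_renormalize_categories():
    import numpy as np
    import pandas as pd

    data = pd.DataFrame({
        'age_group_id': [2, 2, 3, 3],
        'parameter': ['cat1', 'cat2', 'cat1', 'cat2'],
        'draw_0': [0.2, 0.6, 0.5, 0.5],
    })
    index_cols = ['age_group_id']
    sums = data.groupby(index_cols)[['draw_0']].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(index_cols).loc[:, ['draw_0']].divide(sums))
            .reset_index())
    # Each demographic group's categories now sum to 1.
    assert np.allclose(data.groupby('age_group_id')['draw_0'].sum(), 1.0)
    return data
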
def load_lbwsg_relative_risk(key: str, location: str):
    path = paths.lbwsg_data_path('relative_risk', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = utilities.convert_affected_entity(data, 'cause_id')
    # RRs for all causes are the same.
    data = data[data.affected_entity == 'diarrheal_diseases']
    data['affected_entity'] = 'all'
    # All lbwsg risk is about mortality.
    data.loc[:, 'affected_measure'] = 'excess_mortality_rate'
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)
    data = (data
            .groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    tmrel_mask = data.parameter == tmrel_cat
    # Snap TMREL draws that are numerically close to 1 to exactly 1.
    data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS] = (
        data
        .loc[tmrel_mask, vi_globals.DRAW_COLUMNS]
        .mask(np.isclose(data.loc[tmrel_mask, vi_globals.DRAW_COLUMNS], 1.0), 1.0)
    )
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'relative_risk', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def load_lbwsg_paf(key: str, location: str):
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All lbwsg risk is about mortality.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    # We filter paf age groups by cause level restrictions.
    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def get_demographic_dimensions(location: str) -> pd.DataFrame:
    """Pull the full demographic dimensions for GBD data, standardized to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with meaningful values or ranges.

    Parameters
    ----------
    location
        Location for which to pull demographic dimension data.

    Returns
    -------
    pandas.DataFrame
        Dataframe with age and year bins from GBD, sexes, and the given location.

    """
    pop = Population()
    data = core.get_data(pop, "demographic_dimensions", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "demographic_dimensions", location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)

def load_forecast_data(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    path = paths.forecast_data_path(key)
    data = extract.load_forecast_from_xarray(path, location_id)
    data = data[data.scenario == project_globals.FORECASTING_SCENARIO].drop(columns='scenario')
    if key == EntityKey('etiology.shigellosis.incidence'):
        # Only one draw for incidence.
        data = pd.concat(
            project_globals.NUM_DRAWS
            * [data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id']).value],
            axis=1)
    else:
        data = data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
        if len(data.columns) == 100:
            # Not 1000 draws for everything.
            data = pd.concat([data] * 10, axis=1)
    data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    data = data.reset_index()
    data = standardize.normalize(data)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

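# Illustrative sketch (not in the original source): when a forecast file carries
# only 100 draws, the loader above tiles the draw columns 10x and relabels them
# so the artifact keeps a consistent 1,000-draw schema (at the cost of repeating
# the available draws). Toy version scaled down to 2 -> 4 draws:
def _demo_tile_draws():
    import pandas as pd

    data = pd.DataFrame([[0.1, 0.2]], columns=['draw_0', 'draw_1'])
    target = 4
    data = pd.concat([data] * (target // len(data.columns)), axis=1)
    data.columns = pd.Index([f'draw_{i}' for i in range(target)])
    assert list(data.columns) == ['draw_0', 'draw_1', 'draw_2', 'draw_3']
    return data
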
def load_shigella_disability_weight(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = _get_raw_demographic_dimensions(location)
    data = pd.DataFrame(0, columns=vi_globals.DRAW_COLUMNS, index=data.index)
    # Cause-level disability weight is the prevalence-weighted average of the
    # sequela disability weights.
    for sequela in causes.diarrheal_diseases.sequelae:
        prevalence = _load_prevalence(sequela, location_id, 'sequela')
        disability = _load_diarrhea_sequela_disability_weight(sequela, location_id)
        disability.index = disability.index.set_levels([location_id], level='location_id')
        data += prevalence * disability
    diarrhea_prevalence = _load_prevalence(causes.diarrheal_diseases, location_id, 'cause')
    data = (data / diarrhea_prevalence).fillna(0).reset_index()
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

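# Illustrative sketch (not in the original source): the weighted average above is
#   DW_cause = sum_i(prevalence_i * DW_i) / prevalence_cause,
# with 0/0 cells coerced to 0. Worked toy example with one draw column:
def _demo_prevalence_weighted_dw():
    import pandas as pd

    sequela_prev = [pd.Series([0.02]), pd.Series([0.03])]
    sequela_dw = [pd.Series([0.1]), pd.Series([0.3])]
    cause_prev = pd.Series([0.05])
    weighted = sum(p * d for p, d in zip(sequela_prev, sequela_dw))
    dw = (weighted / cause_prev).fillna(0)
    # (0.02 * 0.1 + 0.03 * 0.3) / 0.05 = 0.011 / 0.05 = 0.22
    assert abs(dw.iloc[0] - 0.22) < 1e-9
    return dw
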
def get_population_structure(location: str) -> pd.DataFrame:
    """Pull GBD population data for the given location and standardize to the
    expected simulation input format, including scrubbing all GBD conventions
    to replace IDs with meaningful values or ranges and expanding over all
    demographic dimensions.

    Parameters
    ----------
    location
        Location for which to pull population data.

    Returns
    -------
    pandas.DataFrame
        Dataframe of population data for `location`, standardized to the format
        expected by `vivarium` simulations.

    """
    pop = Population()
    data = core.get_data(pop, "structure", location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, pop, "structure", location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)

def get_theoretical_minimum_risk_life_expectancy() -> pd.DataFrame:
    """Pull GBD theoretical minimum risk life expectancy data and standardize
    to the expected simulation input format, including binning age parameters
    as expected by simulations.

    Returns
    -------
    pandas.DataFrame
        Dataframe of theoretical minimum risk life expectancy data, standardized
        to the format expected by `vivarium` simulations with binned age parameters.

    """
    pop = Population()
    data = core.get_data(pop, "theoretical_minimum_risk_life_expectancy", "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(
        data, pop, "theoretical_minimum_risk_life_expectancy", "Global")
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)

def write_hypertension_medication_data(artifact, location):
    external_data_specification = {
        'adherence': {
            'seed_columns': ['location'],  # all adherence will use the same seeds
            'distribution': 'beta',
        },
        'medication_probabilities': {
            'seed_columns': ['location', 'measure', 'thiazide_type_diuretics', 'beta_blockers',
                             'ace_inhibitors', 'angiotensin_ii_blockers', 'calcium_channel_blockers'],
            'distribution': 'beta',
        },
        'therapy_category': {
            'seed_columns': ['location', 'therapy_category'],
            'distribution': 'beta',
        },
        'treatment_coverage': {
            'seed_columns': ['location', 'measure'],
            'distribution': 'beta',
        },
        'drug_efficacy': {
            # don't include dosage so all dosages of same drug will use same seeds
            'seed_columns': ['location', 'drug'],
            'distribution': 'normal',
        },
    }

    for k, spec in external_data_specification.items():
        data = load_external_data(k, location)
        data = generate_draws(data, spec['seed_columns'], spec['distribution'])

        if set(data.location) == {'Global'}:
            # do this post draw generation so all locations use the same draws if data is global
            data.location = location

        if k == 'medication_probabilities':
            # drop ACE + ARB single pill because not used
            data = data.loc[~((data.ace_inhibitors == 1) & (data.angiotensin_ii_blockers == 1))]

        data = utilities.sort_hierarchical_data(utilities.reshape(data))

        if k == 'therapy_category':
            # normalize so that sum of all categories = 1
            data = data.divide(data.sum(axis=0), axis=1)

        key = f'health_technology.hypertension_medication.{k}'
        data = split_interval(data, interval_column='age', split_column_prefix='age')
        data = split_interval(data, interval_column='year', split_column_prefix='year')
        artifact.write(key, data)

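# Illustrative sketch (hypothetical; the project's `generate_draws` is defined
# elsewhere and may differ): the `seed_columns` in the spec above determine which
# rows share a random seed, so that, e.g., all dosages of the same drug draw
# correlated values. One plausible shape, assuming hypothetical 'mean' and 'sd'
# columns and a method-of-moments beta fit:
def _demo_generate_draws(data, seed_columns, num_draws=1000):
    import hashlib
    import numpy as np
    import pandas as pd

    draws = []
    for _, row in data.iterrows():
        # Hash the seed-column values so identical keys always get identical draws.
        seed_key = '_'.join(str(row[c]) for c in seed_columns)
        seed = int(hashlib.md5(seed_key.encode()).hexdigest(), 16) % 2**32
        rng = np.random.RandomState(seed)
        mean, sd = row['mean'], row['sd']  # hypothetical column names
        # Method-of-moments beta parameters; assumes 0 < mean < 1 and a small sd.
        common = mean * (1 - mean) / sd**2 - 1
        draws.append(rng.beta(mean * common, (1 - mean) * common, size=num_draws))
    draw_cols = pd.DataFrame(draws, columns=[f'draw_{i}' for i in range(num_draws)],
                             index=data.index)
    return pd.concat([data, draw_cols], axis=1)
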
def load_demographic_dimensions(key: EntityKey, location: str):
    data = _get_raw_demographic_dimensions(location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def get_measure(entity: ModelableEntity, measure: str, location: str) -> pd.DataFrame:
    """Pull GBD data for measure and entity and prep for simulation input,
    including scrubbing all GBD conventions to replace IDs with meaningful
    values or ranges and expanding over all demographic dimensions.

    To pull data using this function, please have at least 50GB of memory available.

    Available measures:
        For entity kind 'sequela':
            incidence_rate, prevalence, birth_prevalence, disability_weight
        For entity kind 'cause':
            incidence_rate, prevalence, birth_prevalence, disability_weight,
            remission_rate, cause_specific_mortality_rate, excess_mortality_rate
        For entity kind 'risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights,
            relative_risk, population_attributable_fraction, mediation_factors
        For entity kind 'etiology':
            population_attributable_fraction
        For entity kind 'alternative_risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights
        For entity kind 'covariate':
            estimate

    Parameters
    ----------
    entity
        Entity for which to pull `measure`.
    measure
        Measure for which to pull data, should be a measure available for the
        kind of entity which `entity` is.
    location
        Location for which to pull data.

    Returns
    -------
    pandas.DataFrame
        Dataframe standardized to the format expected by `vivarium` simulations.

    """
    data = core.get_data(entity, measure, location)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, entity, measure, location)
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)

def _load_em_from_meid(location, meid, measure):
    location_id = utility_data.get_location_id(location)
    data = gbd.get_modelable_entity_draws(meid, location_id)
    data = data[data.measure_id == vi_globals.MEASURES[measure]]
    data = vi_utils.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = vi_utils.reshape(data)
    data = vi_utils.scrub_gbd_conventions(data, location)
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    return vi_utils.sort_hierarchical_data(data)

def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    # We filter paf age groups by cause level restrictions.
    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure,
                                                   utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def load_location_specific_life_expectancy(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = extract.get_location_specific_life_expectancy(location_id)
    data = data.rename(columns={'age': 'age_start'})
    data['age_end'] = data.age_start.shift(-1).fillna(5.01)

    # Backfill years before the earliest available year with that year's values.
    earliest_year = data[data.year_id == 2025]
    out = []
    for year in range(project_globals.MIN_YEAR, 2025):
        df = earliest_year.copy()
        df['year_id'] = year
        out.append(df)
    data = pd.concat(out + [data], ignore_index=True)

    data = utilities.normalize_sex(data, None, ['value'])
    data = standardize.normalize_year(data)
    data = utilities.reshape(data, value_cols=['value'])
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def load_lri_birth_prevalence_from_meid(_, location):
    """Ignore the first argument to fit into the get_data model."""
    location_id = utility_data.get_location_id(location)
    data = get_draws('modelable_entity_id', project_globals.LRI_BIRTH_PREVALENCE_MEID,
                     source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                     age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                     measure_id=vi_globals.MEASURES['Prevalence'],
                     gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                     location_id=location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.normalize(data, fill_value=0)
    idx_columns = list(vi_globals.DEMOGRAPHIC_COLUMNS)
    idx_columns.remove('age_group_id')
    data = data.filter(idx_columns + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION,
                       dtype={'location_id': np.int64,
                              'sex_id': np.int64,
                              'age_group_id': np.int64,
                              'year_id': np.int64,
                              'outpatient_visits_per_cap_mean': np.float64,
                              'outpatient_visits_per_cap_95_upper': np.float64,
                              'outpatient_visits_per_cap_95_lower': np.float64})
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    # Log-scale spread derived from the 95% UI bounds.
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'],
                                    size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

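# Illustrative sketch (not in the original source): the utilization loaders above
# treat the per-capita visit rate as lognormal, taking the log of the point
# estimate and deriving a log-scale spread from the 95% UI bounds before sampling
# draws with np.exp(np.random.normal(...)). Round-trip on toy numbers; note a
# conventional SD estimate divides the log-scale UI width by 2 * 1.96, whereas the
# loaders above divide by 1.96, which widens the sampled uncertainty:
def _demo_lognormal_draws():
    import numpy as np

    mean, lower, upper = 1.2, 0.9, 1.6
    log_mean = np.log(mean)
    log_sd = (np.log(upper) - np.log(lower)) / (2 * 1.96)
    draws = np.exp(np.random.RandomState(0).normal(log_mean, log_sd, size=100_000))
    # The median of a lognormal is exp(log_mean), i.e. the original point estimate.
    assert abs(np.median(draws) - mean) < 0.01
    return draws
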
def load_shigella_remission_rate(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = extract.get_modelable_entity_draws(causes.diarrheal_diseases.dismod_id, location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Remission rate']]
    data = utilities.filter_data_by_restrictions(data, causes.diarrheal_diseases,
                                                 'yld', utility_data.get_age_group_ids())
    data = data[data.year_id == 2016].drop(columns='year_id')  # Use latest GBD results for all data
    data = standardize.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def get_age_bins() -> pd.DataFrame:
    """Pull GBD age bin data and standardize to the expected simulation input format.

    Returns
    -------
    pandas.DataFrame
        Dataframe of age bin data, with bin start and end values as well as bin names.

    """
    pop = Population()
    data = core.get_data(pop, "age_bins", "Global")
    data = utilities.set_age_interval(data)
    validation.validate_for_simulation(data, pop, "age_bins", "Global")
    data = utilities.split_interval(data, interval_column="age", split_column_prefix="age")
    data = utilities.split_interval(data, interval_column="year", split_column_prefix="year")
    return utilities.sort_hierarchical_data(data)

def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location

    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)
    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')
    data = utilities.filter_data_by_restrictions(data, entity, 'outer',
                                                 utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill 1 as exposure of tmrel category, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0),
                      utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)
    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def load_live_births_by_year(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    asfr_key = EntityKey('covariate.age_specific_fertility_rate.estimate')
    pop_key = EntityKey(project_globals.POPULATION_STRUCTURE)

    asfr_data = extract.load_forecast_from_xarray(paths.forecast_data_path(asfr_key), location_id)
    asfr_data = asfr_data[(asfr_data.scenario == project_globals.FORECASTING_SCENARIO)
                          & (asfr_data.year_id >= project_globals.MIN_YEAR)].drop(columns='scenario')
    asfr_data = asfr_data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    asfr_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])

    pop_data = extract.load_forecast_from_xarray(paths.forecast_data_path(pop_key), location_id)
    pop_data = pop_data[pop_data.scenario == project_globals.FORECASTING_SCENARIO].drop(columns='scenario')
    pop_data = pop_data.set_index(['location_id', 'age_group_id', 'sex_id', 'year_id', 'draw']).unstack()
    pop_data.columns = pd.Index([f'draw_{i}' for i in range(1000)])
    pop_data = pop_data.loc[asfr_data.index]

    live_births = asfr_data * pop_data
    live_births = (live_births
                   .reset_index()
                   .drop(columns=['sex_id', 'age_group_id'])
                   .groupby(['location_id', 'year_id'])
                   .sum()
                   .reset_index())

    data = standardize.normalize(live_births)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

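# Illustrative sketch (not in the original source): live births above are the
# product of age-specific fertility rates and population counts, aligned on the
# shared (location, age, sex, year) index, then summed over age and sex. Toy:
def _demo_live_births():
    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [(1, 8, 2, 2020), (1, 9, 2, 2020)],
        names=['location_id', 'age_group_id', 'sex_id', 'year_id'])
    asfr = pd.DataFrame({'draw_0': [0.05, 0.10]}, index=idx)
    population = pd.DataFrame({'draw_0': [1000.0, 2000.0]}, index=idx)
    live_births = (asfr * population).reset_index()
    live_births = (live_births
                   .drop(columns=['sex_id', 'age_group_id'])
                   .groupby(['location_id', 'year_id'])
                   .sum())
    # 0.05 * 1000 + 0.10 * 2000 = 250 births in (location 1, year 2020).
    assert abs(live_births.loc[(1, 2020), 'draw_0'] - 250.0) < 1e-6
    return live_births
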
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)
    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    # Snap TMREL draws that are numerically close to 1 to exactly 1.
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )
    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)

def build_joint_pafs(location: str, draws: str, verbose: int, queue: str):
    # Local import to avoid data dependencies
    from vivarium_inputs import globals as vi_globals, utilities

    output_dir = paths.JOINT_PAF_DIR
    locations = project_globals.LOCATIONS if location == 'all' else [location]

    from vivarium_cluster_tools.psimulate.utilities import get_drmaa
    drmaa = get_drmaa()
    jobs = {}
    draw_list = {'all': range(1000), 'none': []}.get(draws, draws.split(','))

    with drmaa.Session() as session:
        for location in locations:
            build_joint_pafs_single_location(drmaa, queue, jobs, location, draw_list,
                                             output_dir, session)

        if verbose:
            logger.info('Entering monitoring loop.')
            logger.info('-------------------------')
            logger.info('')

            while any([job[1] not in [drmaa.JobState.DONE, drmaa.JobState.FAILED]
                       for job in jobs.values()]):
                for location, (job_id, status) in jobs.items():
                    jobs[location] = (job_id, session.jobStatus(job_id))
                    logger.info(f'{location:<35}: {decode_status(drmaa, jobs[location][1]):>15}')
                logger.info('')
                time.sleep(project_globals.MAKE_ARTIFACT_SLEEP)
                logger.info('Checking status again')
                logger.info('---------------------')
                logger.info('')

    for location in locations:
        logger.info(f'Merging data for location - {location}')
        sanitized_location = sanitize_location(location)
        location_dir = paths.JOINT_PAF_DIR / sanitized_location
        existing_data_path = output_dir / f'{sanitized_location}.hdf'
        joint_pafs = []
        if existing_data_path.exists():
            joint_pafs.append(pd.read_hdf(output_dir / f'{sanitized_location}.hdf'))
            joint_pafs[0].to_hdf(output_dir / f'{sanitized_location}-old.hdf', 'data')

        for file_path in location_dir.iterdir():
            draw = file_path.parts[-1].split('.')[0]
            draw_joint_paf = pd.read_hdf(file_path).rename(columns={0: draw})
            draw_joint_paf['affected_measure'] = 'incidence_rate'
            draw_joint_paf = draw_joint_paf.set_index(list(draw_joint_paf.columns.drop(draw)))
            joint_pafs.append(draw_joint_paf)

        joint_paf_data = pd.concat(joint_pafs, axis=1)
        joint_paf_data = joint_paf_data[vi_globals.DRAW_COLUMNS]  # sort the columns
        joint_paf_data = utilities.sort_hierarchical_data(joint_paf_data).convert_objects()
        joint_paf_data.to_hdf(output_dir / f'{sanitized_location}.hdf', 'data')
        shutil.rmtree(location_dir)

    logger.info('**Done**')

def write_sbp_data(artifact, location):
    load = get_load(location)
    affected_entity_map = {
        'ischemic_heart_disease': 'acute_myocardial_infarction',
        'ischemic_stroke': 'acute_ischemic_stroke',
        'intracerebral_hemorrhage': 'acute_intracerebral_hemorrhage',
        'subarachnoid_hemorrhage': 'acute_subarachnoid_hemorrhage',
        'chronic_kidney_disease': 'chronic_kidney_disease',
    }
    prefix = 'risk_factor.high_systolic_blood_pressure.'
    measures = ['restrictions', 'distribution', 'tmred', 'exposure', 'exposure_standard_deviation',
                'relative_risk_scalar', 'exposure_distribution_weights']
    for m in measures:
        key = prefix + m
        artifact.write(key, load(key))

    sbp = risk_factors.high_systolic_blood_pressure

    # Population attributable fraction
    data = gbd.get_paf(sbp.gbd_id, utility_data.get_location_id(location))
    data = data[data.metric_id == globals.METRICS['Percent']]
    data = data[data.measure_id == globals.MEASURES['YLDs']]
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    key = prefix + 'population_attributable_fraction'
    artifact.write(key, data)

    # Relative risk
    data = gbd.get_relative_risk(sbp.gbd_id, utility_data.get_location_id(location))
    data = utilities.convert_affected_entity(data, 'cause_id')
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality'
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure', 'parameter']
                       + globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.sort_hierarchical_data(data)
    data = split_interval(data, interval_column='age', split_column_prefix='age')
    data = split_interval(data, interval_column='year', split_column_prefix='year')

    # Append the custom CKD relative risk (continuous, per unit of SBP).
    loc = location.lower().replace(' ', '_')
    ckd_rr = pd.read_hdf(f'/share/costeffectiveness/artifacts/vivarium_csu_hypertension_sdc/ckd_rr/{loc}.hdf')
    ckd_rr = ckd_rr.reset_index()
    ckd_rr['parameter'] = 'per unit'
    ckd_rr['affected_entity'] = 'chronic_kidney_disease'
    ckd_rr['affected_measure'] = 'incidence_rate'
    ckd_rr = ckd_rr.set_index(['location', 'sex', 'age_start', 'year_start',
                               'affected_entity', 'affected_measure', 'parameter',
                               'age_end', 'year_end'])
    data = pd.concat([data, ckd_rr])
    key = prefix + 'relative_risk'
    artifact.write(key, data)