Example #1
def get_data(key: EntityKey,
             entity: ModelableEntity,
             location: str,
             source: str,
             gbd_id_type: str,
             age_group_ids: Set[int],
             gbd_round_id: int,
             decomp_step: str = 'iterative') -> pd.DataFrame:
    age_group_ids = list(age_group_ids)

    # from interface.get_measure
    # from vivarium_inputs.core.get_data
    location_id = utility_data.get_location_id(location) if isinstance(
        location, str) else location

    # from vivarium_inputs.core.get_{measure}
    # from vivarium_inputs.extract.extract_data
    check_metadata(entity, key.measure)

    # from vivarium_inputs.extract.extract_{measure}
    # from vivarium_gbd_access.gbd.get_{measure}
    data = get_draws(gbd_id_type=gbd_id_type,
                     gbd_id=entity.gbd_id,
                     source=source,
                     location_id=location_id,
                     sex_id=gbd_constants.SEX.MALE + gbd_constants.SEX.FEMALE,
                     age_group_id=age_group_ids,
                     gbd_round_id=gbd_round_id,
                     decomp_step=decomp_step,
                     status='best')
    return data
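A hypothetical invocation of this wrapper, for illustration only; the key, source, age groups, and GBD round below are assumed values, not settings from the original project.
from gbd_mapping import causes

df = get_data(key=EntityKey('cause.diarrheal_diseases.prevalence'),
              entity=causes.diarrheal_diseases,
              location='India',
              source='como',  # 'como' is a standard get_draws source for prevalence
              gbd_id_type='cause_id',
              age_group_ids={2, 3, 4, 5},  # illustrative age-group ids
              gbd_round_id=5)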
Example #2
def write_utilization_rate(artifact, location):
    key = 'healthcare_entity.outpatient_visits.utilization_rate'
    from vivarium_csu_hypertension_sdc import external_data
    data_dir = Path(external_data.__file__).parent
    data = pd.read_csv(data_dir / 'outpatient_utilization.csv')
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
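    # Derive log-space parameters from the reported mean and 95% uncertainty
    # interval, then sample 1,000 lognormal draws per demographic row.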
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (
        np.log(data['outpatient_visits_per_cap_95_upper']) -
        np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(
        np.random.normal(loc=data['log_mean'],
                         scale=data['log_sd'],
                         size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=globals.DRAW_COLUMNS)
    data = pd.concat(
        [data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws],
        axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    artifact.write(key, data)
Example #3
def test_core_causelike(entity, measure, location):
    entity_name, entity_expected_measure_ids = entity
    measure_name, measure_id = measure
    tester = success_expected if (entity_expected_measure_ids
                                  & measure_id) else fail_expected
    df = tester(entity_name, measure_name,
                utility_data.get_location_id(location))
Example #4
def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld',
        utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds,
                          interval_column='age',
                          split_column_prefix='age')
    ylds = split_interval(ylds,
                          interval_column='year',
                          split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
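    # Disability weight is the YLD rate divided by prevalence, i.e. the
    # average disability per prevalent case.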
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate',
                         location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    # Russia has absurdly high values in some of the data and it breaks validation.
    data[data > 50] = 50
    artifact.write(key, data)
Example #5
def _load_em_from_meid(location, meid, measure):
    location_id = utility_data.get_location_id(location)
    data = gbd.get_modelable_entity_draws(meid, location_id)
    data = data[data.measure_id == vi_globals.MEASURES[measure]]
    data = vi_utils.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = vi_utils.reshape(data)
    data = vi_utils.scrub_gbd_conventions(data, location)
    data = vi_utils.split_interval(data, interval_column='age', split_column_prefix='age')
    data = vi_utils.split_interval(data, interval_column='year', split_column_prefix='year')
    return vi_utils.sort_hierarchical_data(data)
Example #6
def get_raw_data(entity: ModelableEntity, measure: str,
                 location: str) -> Union[pd.Series, pd.DataFrame]:
    """Pull raw data from GBD for the requested entity, measure, and location.
    Skip standard raw validation checks in order to return data that can be
    investigated for oddities. The only filter that occurs is by applicable
    measure id, metric id, or to most detailed causes where relevant.

    Available measures:

        For entity kind 'sequela':
            incidence_rate, prevalence, birth_prevalence, disability_weight

        For entity kind 'cause':
            incidence_rate, prevalence, birth_prevalence, disability_weight,
            remission_rate, deaths

        For entity kind 'risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights,
            relative_risk, population_attributable_fraction, mediation_factors

        For entity kind 'etiology':
            population_attributable_fraction

        For entity kind 'alternative_risk_factor':
            exposure, exposure_standard_deviation, exposure_distribution_weights

        For entity kind 'covariate':
            estimate

        For entity kind 'population':
            structure, theoretical_minimum_risk_life_expectancy

    Parameters
    ----------
    entity
        Entity for which to extract data.
    measure
        Measure for which to extract data.
    location
        Location for which to extract data.

    Returns
    -------
    Union[pandas.Series, pandas.DataFrame]
        Data for the entity-measure pair and specific location requested, with no
        formatting or reshaping.
    """
    location_id = utility_data.get_location_id(location)
    data = extract.extract_data(entity, measure, location_id, validate=False)
    return data
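A minimal usage sketch, assuming the entity comes from the gbd_mapping package and the measure is one of the combinations listed in the docstring above.
from gbd_mapping import causes

# Pull unvalidated prevalence data for inspection.
raw = get_raw_data(causes.diarrheal_diseases, 'prevalence', 'India')
print(raw.head())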
Example #7
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter paf age groups by cause level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #8
def load_lri_birth_prevalence_from_meid(_, location):
    """Ignore the first argument to fit into the get_data model."""
    location_id = utility_data.get_location_id(location)
    data = get_draws('modelable_entity_id', project_globals.LRI_BIRTH_PREVALENCE_MEID,
                     source=project_globals.LRI_BIRTH_PREVALENCE_DRAW_SOURCE,
                     age_group_id=project_globals.LRI_BIRTH_PREVALENCE_AGE_ID,
                     measure_id=vi_globals.MEASURES['Prevalence'],
                     gbd_round_id=project_globals.LRI_BIRTH_PREVALENCE_GBD_ROUND,
                     location_id=location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.normalize(data, fill_value=0)

    idx_columns = list(vi_globals.DEMOGRAPHIC_COLUMNS)
    idx_columns.remove('age_group_id')
    data = data.filter(idx_columns + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #9
def load_healthcare_utilization(key: str, location: str) -> pd.DataFrame:
    data = pd.read_csv(paths.HEALTHCARE_UTILIZATION,
                       dtype={'location_id': np.int64, 'sex_id': np.int64, 'age_group_id': np.int64,
                              'year_id': np.int64, 'outpatient_visits_per_cap_mean': np.float64,
                              'outpatient_visits_per_cap_95_upper': np.float64,
                              'outpatient_visits_per_cap_95_lower': np.float64})
    loc_id = utility_data.get_location_id(location)
    data = data[data.location_id == loc_id].reset_index(drop=True)
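    # Reconstruct log-space parameters from the mean and 95% interval, then
    # sample 1,000 lognormal draws for each demographic row.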
    data['log_mean'] = np.log(data['outpatient_visits_per_cap_mean'])
    data['log_sd'] = (np.log(data['outpatient_visits_per_cap_95_upper'])
                      - np.log(data['outpatient_visits_per_cap_95_lower'])) / 1.96
    draws = np.exp(np.random.normal(loc=data['log_mean'], scale=data['log_sd'], size=(1000, len(data)))).T
    draws = pd.DataFrame(data=draws, columns=vi_globals.DRAW_COLUMNS)
    data = pd.concat([data[['location_id', 'sex_id', 'age_group_id', 'year_id']], draws], axis=1)
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #10
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    # FIXME: We fill the exposure of the tmrel category with 1, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #11
def load_ikf_relative_risk(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'relative_risk', location_id, validate=False)
    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]

    data = utilities.convert_affected_entity(data, 'cause_id')
    data = data[data['affected_entity'].isin(project_globals.DISEASE_MODELS)]
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'
    data = core.filter_relative_risk_to_cause_restrictions(data)

    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure', 'parameter']
                       + vi_globals.DRAW_COLUMNS)

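    # The TMREL category carries no excess risk, so snap draws that are
    # within floating-point tolerance of 1.0 to exactly 1.0.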
    tmrel_cat = utility_data.get_tmrel_category(entity)
    tmrel_mask = data.parameter == tmrel_cat
    data.loc[tmrel_mask, value_cols] = (
        data.loc[tmrel_mask, value_cols].mask(np.isclose(data.loc[tmrel_mask, value_cols], 1.0), 1.0)
    )

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #12
def test_extract_population(measures):
    pop = ModelableEntity("ignored", "population", None)
    df = extract.extract_data(
        pop, measures, utility_data.get_location_id("India"), validate=VALIDATE_FLAG
    )
Example #13
def test_extract_covariatelike(entity, measure, location):
    df = extract.extract_data(
        entity, measure, utility_data.get_location_id(location), validate=VALIDATE_FLAG
    )
Example #14
def test_get_measure_covariatelike(entity, measure, location):
    df = get_measure(entity, measure, utility_data.get_location_id(location))
Example #15
def test_core_healthsystem(entity, measure, location):
    df = core.get_data(entity, measure, utility_data.get_location_id(location))
Example #16
def write_sbp_data(artifact, location):
    load = get_load(location)
    affected_entity_map = {
        'ischemic_heart_disease': 'acute_myocardial_infarction',
        'ischemic_stroke': 'acute_ischemic_stroke',
        'intracerebral_hemorrhage': 'acute_intracerebral_hemorrhage',
        'subarachnoid_hemorrhage': 'acute_subarachnoid_hemorrhage',
        'chronic_kidney_disease': 'chronic_kidney_disease'
    }

    prefix = 'risk_factor.high_systolic_blood_pressure.'
    measures = [
        "restrictions", "distribution", "tmred", "exposure",
        "exposure_standard_deviation", "relative_risk_scalar",
        "exposure_distribution_weights"
    ]
    for m in measures:
        key = prefix + m
        artifact.write(key, load(key))

    sbp = risk_factors.high_systolic_blood_pressure

    data = gbd.get_paf(sbp.gbd_id, utility_data.get_location_id(location))
    data = data[data.metric_id == globals.METRICS['Percent']]
    data = data[data.measure_id == globals.MEASURES['YLDs']]
    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == globals.MEASURES['YLDs'],
             'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure'] +
                       globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)

    key = prefix + 'population_attributable_fraction'
    artifact.write(key, data)

    data = gbd.get_relative_risk(sbp.gbd_id,
                                 utility_data.get_location_id(location))
    data = utilities.convert_affected_entity(data, 'cause_id')
    morbidity = data.morbidity == 1
    mortality = data.mortality == 1
    data.loc[morbidity & mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[morbidity & ~mortality, 'affected_measure'] = 'incidence_rate'
    data.loc[~morbidity & mortality, 'affected_measure'] = 'excess_mortality_rate'

    data = data.loc[data.affected_entity.isin(affected_entity_map.keys())]
    data = core.filter_relative_risk_to_cause_restrictions(data)
    data.affected_entity.replace(to_replace=affected_entity_map, inplace=True)
    data = data.filter(globals.DEMOGRAPHIC_COLUMNS +
                       ['affected_entity', 'affected_measure', 'parameter'] +
                       globals.DRAW_COLUMNS)
    data = (data.groupby(['affected_entity', 'parameter'])
            .apply(utilities.normalize, fill_value=1)
            .reset_index(drop=True))
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = split_interval(data,
                          interval_column='age',
                          split_column_prefix='age')
    data = split_interval(data,
                          interval_column='year',
                          split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    loc = location.lower().replace(' ', '_')
    ckd_rr = pd.read_hdf(
        f'/share/costeffectiveness/artifacts/vivarium_csu_hypertension_sdc/ckd_rr/{loc}.hdf'
    )
    ckd_rr = ckd_rr.reset_index()
    ckd_rr['parameter'] = 'per unit'
    ckd_rr['affected_entity'] = 'chronic_kidney_disease'
    ckd_rr['affected_measure'] = 'incidence_rate'
    ckd_rr = ckd_rr.set_index([
        'location', 'sex', 'age_start', 'year_start', 'affected_entity',
        'affected_measure', 'parameter', 'age_end', 'year_end'
    ])
    data = pd.concat([data, ckd_rr])
    key = prefix + 'relative_risk'
    artifact.write(key, data)
Example #17
def get_data(entity, measure: str, location: Union[str, int]):
    measure_handlers = {
        # Cause-like measures
        "incidence_rate": (get_incidence_rate, ("cause", "sequela")),
        "raw_incidence_rate": (get_raw_incidence_rate, ("cause", "sequela")),
        "prevalence": (get_prevalence, ("cause", "sequela")),
        "birth_prevalence": (get_birth_prevalence, ("cause", "sequela")),
        "disability_weight": (get_disability_weight, ("cause", "sequela")),
        "remission_rate": (get_remission_rate, ("cause",)),
        "cause_specific_mortality_rate": (get_cause_specific_mortality_rate, ("cause",)),
        "excess_mortality_rate": (get_excess_mortality_rate, ("cause",)),
        "deaths": (get_deaths, ("cause",)),
        # Risk-like measures
        "exposure": (
            get_exposure,
            (
                "risk_factor",
                "alternative_risk_factor",
            ),
        ),
        "exposure_standard_deviation": (
            get_exposure_standard_deviation,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "exposure_distribution_weights": (
            get_exposure_distribution_weights,
            ("risk_factor", "alternative_risk_factor"),
        ),
        "relative_risk": (get_relative_risk, ("risk_factor",)),
        "population_attributable_fraction": (
            get_population_attributable_fraction,
            ("risk_factor", "etiology"),
        ),
        # Covariate measures
        "estimate": (get_estimate, ("covariate",)),
        # Population measures
        "structure": (get_structure, ("population",)),
        "theoretical_minimum_risk_life_expectancy": (
            get_theoretical_minimum_risk_life_expectancy,
            ("population",),
        ),
        "age_bins": (get_age_bins, ("population",)),
        "demographic_dimensions": (get_demographic_dimensions, ("population",)),
    }

    if measure not in measure_handlers:
        raise InvalidQueryError(f"No functions available to pull data for measure {measure}.")

    handler, entity_types = measure_handlers[measure]

    if entity.kind not in entity_types:
        raise InvalidQueryError(f"{measure.capitalize()} not available for {entity.kind}.")

    location_id = (
        utility_data.get_location_id(location) if isinstance(location, str) else location
    )
    data = handler(entity, location_id)

    if measure in [
        "structure",
        "theoretical_minimum_risk_life_expectancy",
        "estimate",
        "exposure_distribution_weights",
    ]:
        value_cols = ["value"]
    else:
        value_cols = DRAW_COLUMNS

    data = utilities.reshape(data, value_cols=value_cols)

    return data
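A sketch of a typical call into this dispatcher, assuming entities from the gbd_mapping package; the measure string and entity kind must match one of the pairs registered in measure_handlers.
from gbd_mapping import causes, risk_factors

# Location may be a name or a GBD location_id; both forms dispatch identically.
prevalence = get_data(causes.ischemic_heart_disease, 'prevalence', 'India')
exposure = get_data(risk_factors.high_systolic_blood_pressure, 'exposure', 163)  # 163: India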
Example #18
def test_core_population(measures):
    pop = ModelableEntity("ignored", "population", None)
    df = core.get_data(pop, measures, utility_data.get_location_id("India"))