Example #1
def load_lbwsg_exposure(key: str, location: str):
    path = paths.lbwsg_data_path('exposure', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id

    data = data.drop(columns='modelable_entity_id')
    data = data[data.parameter != 'cat124']  # LBWSG data has an extra residual category added by get_draws.
    data = utilities.filter_data_by_restrictions(data, risk_factors.low_birth_weight_and_short_gestation,
                                                 'outer', utility_data.get_age_group_ids())
    tmrel_cat = utility_data.get_tmrel_category(risk_factors.low_birth_weight_and_short_gestation)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]
    # FIXME: We fill the TMREL category's exposure with 1, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data.groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'exposure', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
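A minimal, self-contained sketch of the renormalization step above, assuming toy demographic columns and hypothetical draw columns (draw_0, draw_1) in place of vi_globals.DRAW_COLUMNS. The groupby/divide pattern is the same one load_lbwsg_exposure uses:

import pandas as pd

# Hypothetical stand-ins for the project's draw columns.
DRAW_COLUMNS = ['draw_0', 'draw_1']
data = pd.DataFrame({
    'age': [1, 1, 2, 2],
    'sex': ['Female'] * 4,
    'parameter': ['cat1', 'cat2', 'cat1', 'cat2'],
    'draw_0': [0.3, 0.9, 0.2, 0.6],
    'draw_1': [0.5, 0.5, 0.1, 0.3],
})

# Sum draws within each demographic cell, then divide each category's draws
# by that sum so the categories in every cell sum to 1.
cols = list(set(data.columns).difference(DRAW_COLUMNS + ['parameter']))
sums = data.groupby(cols)[DRAW_COLUMNS].sum()
data = (data.groupby('parameter')
        .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
        .reset_index())

print(data.groupby(cols)[DRAW_COLUMNS].sum())  # every entry is 1.0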
Example #2
def load_lbwsg_paf(key: str, location: str):
    path = paths.lbwsg_data_path('population_attributable_fraction', location)
    data = pd.read_hdf(path)  # type: pd.DataFrame
    data['rei_id'] = risk_factors.low_birth_weight_and_short_gestation.gbd_id
    data = data[data.metric_id == vi_globals.METRICS['Percent']]
    # All LBWSG risk effects are on mortality, so keep only YLL-based PAFs.
    data = data[data.measure_id.isin([vi_globals.MEASURES['YLLs']])]

    temp = []
    causes_map = {c.gbd_id: c for c in causes}
    # We filter PAF age groups by cause-level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS
                       + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    validation.validate_for_simulation(data, risk_factors.low_birth_weight_and_short_gestation,
                                       'population_attributable_fraction', location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
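The loop above is a split-filter-recombine pattern: group the PAF data by cause and measure, filter each group by that cause's restrictions, and concatenate. A sketch under stated assumptions, with a fake per-cause age map standing in for utilities.filter_data_by_restrictions:

import pandas as pd

data = pd.DataFrame({
    'cause_id': [302, 302, 322, 322],
    'measure_id': [4, 4, 4, 4],  # 4 is GBD's measure_id for YLLs
    'age_group_id': [2, 3, 2, 3],
    'draw_0': [0.1, 0.2, 0.3, 0.4],
})

# Hypothetical stand-in: the real helper drops age groups outside each
# cause's GBD age/sex restrictions.
fake_restrictions = {302: [2, 3], 322: [3]}

temp = []
for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
    df = df[df.age_group_id.isin(fake_restrictions[c_id])]
    temp.append(df)
data = pd.concat(temp, ignore_index=True)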
Example #3
def get_deaths(entity: Cause, location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "deaths", location_id)
    data = utilities.filter_data_by_restrictions(
        data, entity, "yll", utility_data.get_age_group_ids()
    )
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
Example #4
def write_ckd_data(artifact, location):
    load = get_load(location)

    # Metadata
    key = 'cause.chronic_kidney_disease.restrictions'
    artifact.write(key, load(key))

    # Measures for Disease Model
    key = 'cause.chronic_kidney_disease.cause_specific_mortality_rate'
    csmr = load(key)
    artifact.write(key, csmr.copy())

    # Measures for Disease States
    key = 'cause.chronic_kidney_disease.prevalence'
    prevalence = load(key)
    artifact.write(key, prevalence.copy())

    key = 'cause.chronic_kidney_disease.disability_weight'
    df = gbd.get_incidence_prevalence(causes.chronic_kidney_disease.gbd_id,
                                      utility_data.get_location_id(location))
    ylds = df[df.measure_id == globals.MEASURES['YLDs']]
    ylds = utilities.filter_data_by_restrictions(
        ylds, causes.chronic_kidney_disease, 'yld',
        utility_data.get_age_group_ids())
    ylds = utilities.normalize(ylds, fill_value=0)
    ylds = ylds.filter(globals.DEMOGRAPHIC_COLUMNS + globals.DRAW_COLUMNS)
    ylds = utilities.reshape(ylds, value_cols=globals.DRAW_COLUMNS)
    ylds = utilities.scrub_gbd_conventions(ylds, location)
    ylds = split_interval(ylds,
                          interval_column='age',
                          split_column_prefix='age')
    ylds = split_interval(ylds,
                          interval_column='year',
                          split_column_prefix='year')
    ylds = utilities.sort_hierarchical_data(ylds)
    dw = (ylds / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, dw)

    key = 'cause.chronic_kidney_disease.excess_mortality_rate'
    emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
    artifact.write(key, emr)

    # Measures for Transitions
    key = 'cause.chronic_kidney_disease.incidence_rate'
    data = core.get_data(causes.chronic_kidney_disease, 'incidence_rate',
                         location)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    data = utilities.sort_hierarchical_data(data)
    data[data > 50] = 50  # Russia has implausibly high values in some of the data, which breaks validation.
    artifact.write(key, data)
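Both ratio measures above (disability weight = YLDs / prevalence, excess mortality = CSMR / prevalence) divide identically indexed tables, so cells with zero prevalence yield NaN (0/0) or inf (x/0). A self-contained sketch of that guard with one toy draw column:

import numpy as np
import pandas as pd

idx = pd.Index([0, 1, 2], name='row')
csmr = pd.DataFrame({'draw_0': [0.02, 0.00, 0.01]}, index=idx)
prevalence = pd.DataFrame({'draw_0': [0.10, 0.00, 0.00]}, index=idx)

# 0/0 -> NaN and x/0 -> inf; both get replaced with 0, as in write_ckd_data.
emr = (csmr / prevalence).fillna(0).replace([np.inf, -np.inf], 0)
print(emr)  # rows with zero prevalence come out as 0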
Example #5
def _load_prevalence(entity, location_id: int, entity_type: str):
    logger.info(f'Loading prevalence for {entity.name} from GBD 2016.')
    data = extract.get_como_draws(entity.gbd_id, location_id, entity_type)
    data = data[data.measure_id == vi_globals.MEASURES['Prevalence']]
    data = utilities.filter_data_by_restrictions(
        data, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    # Use the latest GBD results for all years.
    data = data[data.year_id == 2016].drop(columns='year_id')
    data = standardize.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                       vi_globals.DRAW_COLUMNS)
    return utilities.reshape(data)
Example #6
def get_prevalence(entity: Union[Cause, Sequela], location_id: int) -> pd.DataFrame:
    data = extract.extract_data(entity, "prevalence", location_id)
    if entity.kind == "cause":
        restrictions_entity = entity
    else:  # sequela
        cause = [c for c in causes if c.sequelae and entity in c.sequelae][0]
        restrictions_entity = cause

    data = utilities.filter_data_by_restrictions(
        data, restrictions_entity, "yld", utility_data.get_age_group_ids()
    )
    data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS)
    return data
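The sequela branch above locates the parent cause with a list comprehension and [0], which raises a bare IndexError if no cause owns the sequela. A sketch of the same lookup with next() and an explicit error, using SimpleNamespace stand-ins for gbd_mapping entities:

from types import SimpleNamespace

# Hypothetical stand-ins for gbd_mapping objects.
anemia = SimpleNamespace(name='mild_anemia')
causes = [
    SimpleNamespace(name='tuberculosis', sequelae=None),
    SimpleNamespace(name='dietary_iron_deficiency', sequelae=[anemia]),
]

def parent_cause(entity):
    # Same scan as in get_prevalence, but with a clear failure message.
    try:
        return next(c for c in causes if c.sequelae and entity in c.sequelae)
    except StopIteration:
        raise ValueError(f'No cause found for sequela {entity.name}')

print(parent_cause(anemia).name)  # dietary_iron_deficiency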
Example #7
def load_ikf_paf(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    value_cols = vi_globals.DRAW_COLUMNS
    location_id = utility_data.get_location_id(location)

    data = extract.extract_data(entity, 'population_attributable_fraction', location_id, validate=False)
    relative_risk = extract.extract_data(entity, 'relative_risk', location_id, validate=False)

    yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
    data = data[~data.cause_id.isin(yll_only_causes)]
    relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

    data = (data.groupby('cause_id', as_index=False)
            .apply(core.filter_by_relative_risk, relative_risk)
            .reset_index(drop=True))

    causes_map = {c.gbd_id: c for c in causes}
    temp = []
    # We filter PAF age groups by cause-level restrictions.
    for (c_id, measure), df in data.groupby(['cause_id', 'measure_id']):
        cause = causes_map[c_id]
        measure = 'yll' if measure == vi_globals.MEASURES['YLLs'] else 'yld'
        df = utilities.filter_data_by_restrictions(df, cause, measure, utility_data.get_age_group_ids())
        temp.append(df)
    data = pd.concat(temp, ignore_index=True)

    data = utilities.convert_affected_entity(data, 'cause_id')
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLLs'], 'affected_measure'] = 'excess_mortality_rate'
    data.loc[data['measure_id'] == vi_globals.MEASURES['YLDs'], 'affected_measure'] = 'incidence_rate'
    data = (data.groupby(['affected_entity', 'affected_measure'])
            .apply(utilities.normalize, fill_value=0)
            .reset_index(drop=True))
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + ['affected_entity', 'affected_measure']
                       + vi_globals.DRAW_COLUMNS)

    data = utilities.reshape(data, value_cols=value_cols)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
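The filter_by_relative_risk step above uses groupby(...).apply(func, extra), which forwards extra positional arguments to func for each group. A sketch with a hypothetical stand-in filter that drops whole cause groups based on a second frame:

import pandas as pd

data = pd.DataFrame({'cause_id': [1, 1, 2], 'draw_0': [0.1, 0.2, 0.3]})
reference = pd.DataFrame({'cause_id': [1, 2], 'keep': [True, False]})

def filter_group(df, ref):
    # Stand-in for core.filter_by_relative_risk: keep or drop the group
    # according to a flag looked up in the second frame.
    keep = ref.loc[ref.cause_id == df.cause_id.iloc[0], 'keep'].iloc[0]
    return df if keep else df.iloc[0:0]

data = (data.groupby('cause_id', as_index=False)
        .apply(filter_group, reference)
        .reset_index(drop=True))
print(data)  # only cause_id 1 survives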
Example #8
def get_exposure(
    entity: Union[RiskFactor, AlternativeRiskFactor], location_id: int
) -> pd.DataFrame:
    data = extract.extract_data(entity, "exposure", location_id)
    data = data.drop(columns="modelable_entity_id")

    if entity.name in EXTRA_RESIDUAL_CATEGORY:
        cat = EXTRA_RESIDUAL_CATEGORY[entity.name]
        data = data.drop(labels=data.query("parameter == @cat").index)
        data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)

    if entity.kind in ["risk_factor", "alternative_risk_factor"]:
        data = utilities.filter_data_by_restrictions(
            data, entity, "outer", utility_data.get_age_group_ids()
        )

    if entity.distribution in ["dichotomous", "ordered_polytomous", "unordered_polytomous"]:
        tmrel_cat = utility_data.get_tmrel_category(entity)
        exposed = data[data.parameter != tmrel_cat]
        unexposed = data[data.parameter == tmrel_cat]

        # FIXME: We fill the TMREL category's exposure with 1, which is not correct.
        data = pd.concat(
            [
                utilities.normalize(exposed, fill_value=0),
                utilities.normalize(unexposed, fill_value=1),
            ],
            ignore_index=True,
        )

        # normalize so all categories sum to 1
        cols = list(set(data.columns).difference(DRAW_COLUMNS + ["parameter"]))
        sums = data.groupby(cols)[DRAW_COLUMNS].sum()
        data = (
            data.groupby("parameter")
            .apply(lambda df: df.set_index(cols).loc[:, DRAW_COLUMNS].divide(sums))
            .reset_index()
        )
    else:
        data = utilities.normalize(data, fill_value=0)
    data = data.filter(DEMOGRAPHIC_COLUMNS + DRAW_COLUMNS + ["parameter"])
    return data
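A compact sketch of the residual-category handling at the top of get_exposure. EXTRA_RESIDUAL_CATEGORY and MINIMUM_EXPOSURE_VALUE are project constants, so the category name and floor below are stand-ins:

import pandas as pd

DRAW_COLUMNS = ["draw_0"]
MINIMUM_EXPOSURE_VALUE = 1e-10  # stand-in for the project constant

data = pd.DataFrame({
    "parameter": ["cat1", "cat2", "cat125"],
    "draw_0": [0.7, 0.0, 0.3],
})

cat = "cat125"  # hypothetical residual category for this risk
data = data.drop(labels=data.query("parameter == @cat").index)
# The floor keeps every remaining category strictly positive so the later
# renormalization never produces a category with exactly zero exposure.
data[DRAW_COLUMNS] = data[DRAW_COLUMNS].clip(lower=MINIMUM_EXPOSURE_VALUE)
print(data)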
Example #9
def load_ikf_exposure(key: str, location: str) -> pd.DataFrame:
    key = EntityKey(key)
    entity = get_entity(key)

    location_id = utility_data.get_location_id(location) if isinstance(location, str) else location
    measure = 'exposure'
    raw_validation.check_metadata(entity, measure)

    data = gbd.get_exposure(entity.gbd_id, location_id)
    data = normalize_ikf_exposure_distribution(data)
    raw_validation.validate_raw_data(data, entity, measure, location_id)

    data = data.drop(columns='modelable_entity_id')

    data = utilities.filter_data_by_restrictions(data, entity, 'outer', utility_data.get_age_group_ids())

    tmrel_cat = utility_data.get_tmrel_category(entity)
    exposed = data[data.parameter != tmrel_cat]
    unexposed = data[data.parameter == tmrel_cat]

    # FIXME: We fill the TMREL category's exposure with 1, which is not correct.
    data = pd.concat([utilities.normalize(exposed, fill_value=0), utilities.normalize(unexposed, fill_value=1)],
                     ignore_index=True)

    # normalize so all categories sum to 1
    cols = list(set(data.columns).difference(vi_globals.DRAW_COLUMNS + ['parameter']))
    sums = data.groupby(cols)[vi_globals.DRAW_COLUMNS].sum()
    data = (data
            .groupby('parameter')
            .apply(lambda df: df.set_index(cols).loc[:, vi_globals.DRAW_COLUMNS].divide(sums))
            .reset_index())

    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS + vi_globals.DRAW_COLUMNS + ['parameter'])
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    sim_validation.validate_for_simulation(data, entity, key.measure, location)
    data = utilities.split_interval(data, interval_column='age', split_column_prefix='age')
    data = utilities.split_interval(data, interval_column='year', split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #10
def load_shigella_remission_rate(key: EntityKey, location: str):
    location_id = extract.get_location_id(location)
    data = extract.get_modelable_entity_draws(
        causes.diarrheal_diseases.dismod_id, location_id)
    data = data[data.measure_id == vi_globals.MEASURES['Remission rate']]
    data = utilities.filter_data_by_restrictions(
        data, causes.diarrheal_diseases, 'yld',
        utility_data.get_age_group_ids())
    # Use the latest GBD results for all years.
    data = data[data.year_id == 2016].drop(columns='year_id')
    data = standardize.normalize(data, fill_value=0)
    data = data.filter(vi_globals.DEMOGRAPHIC_COLUMNS +
                       vi_globals.DRAW_COLUMNS)
    data = utilities.reshape(data)
    data = utilities.scrub_gbd_conventions(data, location)
    data = utilities.split_interval(data,
                                    interval_column='age',
                                    split_column_prefix='age')
    data = utilities.split_interval(data,
                                    interval_column='year',
                                    split_column_prefix='year')
    return utilities.sort_hierarchical_data(data)
Example #11
def get_population_attributable_fraction(
    entity: Union[RiskFactor, Etiology], location_id: int
) -> pd.DataFrame:
    causes_map = {c.gbd_id: c for c in causes}
    if entity.kind == "risk_factor":
        data = extract.extract_data(entity, "population_attributable_fraction", location_id)
        relative_risk = extract.extract_data(entity, "relative_risk", location_id)

        # FIXME: we don't currently support yll-only causes so I'm dropping them because the data in some cases is
        #  very messed up, with mort = morb = 1 (e.g., aortic aneurysm in the RR data for high systolic bp) -
        #  2/8/19 K.W.
        yll_only_causes = {c.gbd_id for c in causes if c.restrictions.yll_only}
        data = data[~data.cause_id.isin(yll_only_causes)]
        relative_risk = relative_risk[~relative_risk.cause_id.isin(yll_only_causes)]

        data = (
            data.groupby("cause_id", as_index=False)
            .apply(filter_by_relative_risk, relative_risk)
            .reset_index(drop=True)
        )

        temp = []
        # We filter PAF age groups by cause-level restrictions.
        for (c_id, measure), df in data.groupby(["cause_id", "measure_id"]):
            cause = causes_map[c_id]
            measure = "yll" if measure == MEASURES["YLLs"] else "yld"
            df = utilities.filter_data_by_restrictions(
                df, cause, measure, utility_data.get_age_group_ids()
            )
            temp.append(df)
        data = pd.concat(temp, ignore_index=True)

    else:  # etiology
        data = extract.extract_data(
            entity, "etiology_population_attributable_fraction", location_id
        )
        cause = [c for c in causes if entity in c.etiologies][0]
        data = utilities.filter_data_by_restrictions(
            data, cause, "inner", utility_data.get_age_group_ids()
        )
        if np.any(data[DRAW_COLUMNS] < 0):
            logger.warning(
                f"{entity.name.capitalize()} has negative values for paf. These will be replaced with 0."
            )
            other_cols = [c for c in data.columns if c not in DRAW_COLUMNS]
            data.set_index(other_cols, inplace=True)
            data = data.where(data[DRAW_COLUMNS] > 0, 0).reset_index()

    data = utilities.convert_affected_entity(data, "cause_id")
    data.loc[
        data["measure_id"] == MEASURES["YLLs"], "affected_measure"
    ] = "excess_mortality_rate"
    data.loc[data["measure_id"] == MEASURES["YLDs"], "affected_measure"] = "incidence_rate"
    data = (
        data.groupby(["affected_entity", "affected_measure"])
        .apply(utilities.normalize, fill_value=0)
        .reset_index(drop=True)
    )
    data = data.filter(
        DEMOGRAPHIC_COLUMNS + ["affected_entity", "affected_measure"] + DRAW_COLUMNS
    )
    return data
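A minimal sketch of the negative-PAF guard in the etiology branch above, with toy columns: .where keeps values where the condition holds and substitutes 0 everywhere else:

import pandas as pd

DRAW_COLUMNS = ["draw_0", "draw_1"]
data = pd.DataFrame({
    "age_group_id": [2, 3],
    "draw_0": [0.4, -0.1],
    "draw_1": [-0.2, 0.5],
})

other_cols = [c for c in data.columns if c not in DRAW_COLUMNS]
data = data.set_index(other_cols)
# Keep positive draws; replace negatives (and anything else failing the
# condition) with 0, then restore the demographic columns.
data = data.where(data[DRAW_COLUMNS] > 0, 0).reset_index()
print(data)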