Example 1
def _read_regional_scalars(parent_dir: str) -> pd.DataFrame:
    """
    Read all regional scalars.
    There aren't many of them, so it's fine to keep all of them in memory.
    """
    return read_cached_hdf(
        _get_regional_scalars_path(parent_dir),
        Keys.REGIONAL_SCALARS,
        columns=[Columns.YEAR_ID, Columns.LOCATION_ID, Columns.MEAN])
Example 2
def _compute_cause_fractions(
        data: pd.DataFrame,
        parent_dir: str,
        location_id: int,
        year_id: int
) -> pd.DataFrame:
    """
    Creates the cause fraction estimates for the gbd database and returns as a
    new dataframe.

    :param data: pd.DataFrame
    :return pd.DataFrame
    """
    cause_fractions = data.copy()
    # Envelope may be carried over from CODEm draws, remove it.
    if Columns.ENVELOPE in cause_fractions.columns:
        cause_fractions.drop(labels=[Columns.ENVELOPE], axis=1, inplace=True)
    # Read in the envelope from disk
    envelope: pd.DataFrame = io.read_cached_hdf(
        filepath=os.path.join(
            parent_dir,
            FilePaths.INPUT_FILES_DIR,
            FilePaths.ENVELOPE_SUMMARY_FILE
        ),
        key=Keys.ENVELOPE,
        where=[f"'location_id'=={location_id} and 'year_id'=={year_id}"]
    )
    # merge together
    cause_fractions = pd.merge(
        cause_fractions,
        envelope,
        on=[
            Columns.LOCATION_ID, Columns.YEAR_ID, Columns.SEX_ID,
            Columns.AGE_GROUP_ID
        ],
        how='left',
        indicator=True
    )
    if not (cause_fractions["_merge"] == "both").all():
        missing = cause_fractions.loc[cause_fractions["_merge"] != "both"]
        raise ValueError(
            f"There are demographics missing cause_fraction information:\n"
            f"{missing}"
        )
    cause_fractions = cause_fractions.drop("_merge", axis=1)

    # Compute: draws / envelope_mean
    cause_fractions[Columns.DRAWS] = cause_fractions[Columns.DRAWS].div(
        cause_fractions[Columns.ENVELOPE].values, axis='index'
    ).reset_index(drop=True)
    cause_fractions.drop([Columns.ENVELOPE], axis=1, inplace=True)
    return cause_fractions
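For reference, the merge-with-indicator validation and the draw/envelope division used above can be reproduced with plain pandas. The following is a minimal, self-contained sketch with toy data and hypothetical column names, not the production helper:

import pandas as pd

# Hypothetical draws and envelope means for two demographics.
draws = pd.DataFrame({
    "location_id": [1, 1], "year_id": [2020, 2020],
    "sex_id": [1, 2], "age_group_id": [22, 22],
    "draw_0": [10.0, 20.0], "draw_1": [12.0, 18.0],
})
envelope = pd.DataFrame({
    "location_id": [1, 1], "year_id": [2020, 2020],
    "sex_id": [1, 2], "age_group_id": [22, 22],
    "envelope": [100.0, 200.0],
})

# Merge with indicator so demographics missing an envelope can be detected.
merged = pd.merge(
    draws, envelope,
    on=["location_id", "year_id", "sex_id", "age_group_id"],
    how="left", indicator=True,
)
if not (merged["_merge"] == "both").all():
    raise ValueError("There are demographics missing envelope information")
merged = merged.drop("_merge", axis=1)

# Divide every draw column by the envelope mean to get cause fractions.
draw_cols = ["draw_0", "draw_1"]
merged[draw_cols] = merged[draw_cols].div(merged["envelope"], axis="index")
merged = merged.drop("envelope", axis=1)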
Example 3
def _read_spacetime_restrictions(parent_dir: str) -> pd.DataFrame:
    """
    Read all spacetime restrictions.
    There aren't many of them, so it's fine to keep all of them in memory.
    """
    return io.read_cached_hdf(
        _get_spacetime_restrictions_path(parent_dir),
        constants.Keys.SPACETIME_RESTRICTIONS,
        columns=[
            constants.Columns.CAUSE_ID,
            constants.Columns.LOCATION_ID,
            constants.Columns.YEAR_ID
        ]
    )
Example 4
def _read_pred_ex_filtered(
        parent_dir: str,
        location_id: int,
        sex_id: int
) -> pd.DataFrame:
    """
    Read predicted life expectancy filtered by location_id, sex_id, and year_id
    """
    pred_ex_filter = [
        f'{Columns.LOCATION_ID}=={location_id}',
        f'{Columns.SEX_ID}=={sex_id}'
    ]
    pred_ex_columns = Columns.INDEX + [Columns.PRED_EX]
    pred_ex_path = _get_pred_ex_path(parent_dir)
    return read_cached_hdf(
        pred_ex_path, Keys.PRED_EX, pred_ex_filter, pred_ex_columns
    )
Example 5
def _read_scalars_filtered(
        scalar_location_id: int,
        sex_id: int,
        year_ids: List[int],
        scalar_version_id: int,
        draw_location_id: int
) -> pd.DataFrame:
    """Read scalars filtered by location_id, sex_id, and year_id"""
    scalars_filter = [
        f'{constants.Columns.LOCATION_ID}=={scalar_location_id}',
        f'{constants.Columns.SEX_ID}=={sex_id}',
        f'{constants.Columns.YEAR_ID} in {year_ids}'
    ]
    scalars_columns = constants.Columns.INDEX + [constants.Columns.SCALAR]
    scalars_path = _get_scalars_path(scalar_version_id)
    scalar_data = io.read_cached_hdf(
        scalars_path, constants.Keys.SCALARS, scalars_filter, scalars_columns
    )
    scalar_data[constants.Columns.LOCATION_ID] = draw_location_id
    return scalar_data
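The filter lists built in these readers are ordinary pandas HDF "where" clauses. A minimal, self-contained sketch of that pattern follows (the file name and data are hypothetical, and it assumes read_cached_hdf ultimately forwards the filters and columns to pandas' HDF query support):

import pandas as pd

# Toy scalar table stored in the queryable "table" format (requires PyTables).
scalars = pd.DataFrame({
    "location_id": [10, 10, 20],
    "sex_id": [1, 2, 1],
    "year_id": [2020, 2020, 2020],
    "scalar": [1.1, 0.9, 1.0],
})
scalars.to_hdf("scalars.h5", key="scalars", format="table", data_columns=True)

# Each filter string is applied on disk, so only matching rows are read;
# list filters such as "year_id in [2019, 2020]" follow the same pattern.
subset = pd.read_hdf(
    "scalars.h5",
    key="scalars",
    where=["location_id==10", "sex_id==1"],
    columns=["location_id", "sex_id", "year_id", "scalar"],
)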
Example 6
def _read_envelope_summary(
        parent_dir: str,
        location_id: int,
        sex_id: int
) -> pd.DataFrame:
    """Read in envelope summary filtered by location_id and sex_id."""
    envelope_filter = [
        f'{constants.Columns.LOCATION_ID}=={location_id}',
        f'{constants.Columns.SEX_ID}=={sex_id}'
    ]
    envelope_columns = (
        constants.Columns.DEMOGRAPHIC_INDEX + constants.Columns.ENVELOPE_DRAWS
    )
    envelope_path = _get_envelope_summary_path(parent_dir)
    envelope_summ = io.read_cached_hdf(
        envelope_path,
        constants.Keys.ENVELOPE_SUMMARY,
        envelope_filter,
        envelope_columns
    )
    return envelope_summ
Example 7
def summarize_cod(
        parent_dir: str,
        gbd_round_id: int,
        location_id: int,
        year_id: int,
        version: MachineParameters
) -> None:
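    """
    Summarize deaths for the COD database for a single location and year.

    Reads scaled draws with and without shocks, adds sex and age aggregates,
    computes age-standardized rates and cause fractions, merges the no-shock
    and with-shock summaries, attaches model version ids for most-detailed
    locations, and saves the result.
    """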
    measure_id = Measures.Ids.DEATHS
    logging.info("Read in scaled draws with shocks from disk.")
    data_with_shocks = _read_scaled_draws(version.process,
                                          parent_dir,
                                          location_id,
                                          year_id,
                                          measure_id)
    check_duplicates(data_with_shocks, subset=Columns.INDEX)
    if Columns.MEASURE_ID in data_with_shocks:
        data_with_shocks = data_with_shocks.drop(Columns.MEASURE_ID, axis=1)

    logging.info("Read in scaled draws without shocks from disk.")
    data_no_shocks = io.read_aggregated_rescaled_draws_for_summaries(
        parent_dir, location_id, year_id, measure_id=measure_id)
    check_duplicates(data_no_shocks, subset=Columns.INDEX)
    if Columns.MEASURE_ID in data_no_shocks:
        data_no_shocks = data_no_shocks.drop(Columns.MEASURE_ID, axis=1)

    logging.info("Read in population.")
    population: pd.DataFrame = io.read_cached_hdf(
        filepath=os.path.join(
            parent_dir,
            FilePaths.INPUT_FILES_DIR,
            FilePaths.POPULATION_FILE
        ),
        key=Keys.POPULATION,
        where=[f"'location_id'=={location_id} and 'year_id'=={year_id}"]
    )
    population = population[Columns.DEMOGRAPHIC_INDEX + [Columns.POPULATION]]
    population = _compute_population_aggregates(population, gbd_round_id)

    df_dict = {'data_with_shocks': data_with_shocks,
               'data_no_shocks': data_no_shocks}
    summaries_dict = {}
    for df_name, df in df_dict.items():
        # create sex_id 3
        logging.info(f"Compute sex aggregates for {df_name}.")
        sex_aggregate = _compute_sex_aggregate(df)
        df = pd.concat(
            [df, sex_aggregate],
            sort=True
        ).reset_index(drop=True)
        # create all age
        logging.info(f"Compute age aggregates for {df_name}.")
        age_aggregate = _compute_age_aggregates(df, gbd_round_id)
        # merge on population
        logging.info("Merge population on demographic indices.")
        df = _merge_population(df, population)
        # create age standardized
        logging.info(f"Compute age standardized for {df_name}.")
        age_standardized_rates = _compute_age_standardized_rate(
            df,
            gbd_round_id=gbd_round_id
        ).drop(Columns.POPULATION, axis=1)
        # Drop population column, add in our age aggregates before
        # calculating cause fractions for COD database
        logging.info(f"Add age aggregates to {df_name}.")
        df = df.drop(Columns.POPULATION, axis=1)
        df = pd.concat(
            [df, age_aggregate],
            sort=True
        ).reset_index(drop=True)

        # Do not add back into the unscaled data, we need only count space for
        # cause fraction calculation.
        logging.info(f"Compute cause fractions for {df_name}.")
        cause_fractions = _compute_cause_fractions_codcorrect(df)

        # add age-standardized to our count-space df
        logging.info(f"Add age standardized to {df_name}.")
        df = pd.concat([df, age_standardized_rates], sort=True)
        rename = {Columns.COD_MEAN: Columns.CAUSE_FRACTION_MEAN,
                  Columns.COD_LOWER: Columns.CAUSE_FRACTION_LOWER,
                  Columns.COD_UPPER: Columns.CAUSE_FRACTION_UPPER}
        rename_values = list(rename.values())
        logging.info("Summarizing data.")
        df_summary = _generate_summaries(df, DataBases.COD)[
            Columns.INDEX + list(rename.keys())]
        cf_summary = _generate_summaries(
            cause_fractions, DataBases.COD
        ).rename(columns=rename)[Columns.INDEX + rename_values]
        df_summary = df_summary.merge(
            cf_summary, on=Columns.INDEX, how='left')
        df_summary[rename_values] = df_summary[rename_values].fillna(0)
        summaries_dict[df_name] = df_summary
    summary_val_cols = Columns.COD_SUMMARY + Columns.CAUSE_FRACTION_SUMMARY
    shocks_rename_cols = [col + '_with_shocks' for col in summary_val_cols]
    rename = dict(zip(summary_val_cols, shocks_rename_cols))
    summaries_dict['data_with_shocks'] = (
        summaries_dict['data_with_shocks'].rename(
            columns=rename)
    )
    summary = pd.merge(summaries_dict['data_no_shocks'],
                       summaries_dict['data_with_shocks'],
                       on=Columns.INDEX, how='outer')
    # There will be NaNs for mean/upper/lower_death for any shocks because of
    # the outer merge on cause_id, since there is no non-shock data for a
    # shock cause.
    summary[shocks_rename_cols] = (
        summary[shocks_rename_cols].fillna(0))
    if _is_most_detailed_location(location_id, version):
        model_version_ids = _get_model_version_ids(version)
        summary = summary.merge(
            model_version_ids,
            on=[Columns.CAUSE_ID, Columns.SEX_ID, Columns.AGE_GROUP_ID],
            how='left')
        summary[Columns.MODEL_VERSION_ID] = summary[
            Columns.MODEL_VERSION_ID].fillna(0)
    else:
        summary[Columns.MODEL_VERSION_ID] = 0

    # Add in measure_id
    summary[Columns.MEASURE_ID] = measure_id
    # Division by 0 can create inf, replace all inf with na and replace
    # na with 0
    summary = summary.replace([np.inf, -np.inf], np.nan).fillna(0)
    logging.info("Saving summaries.")
    _save_cod_summaries(summary, parent_dir, location_id, year_id,
                        version.version_id)
Example 8
def prep_summarize_gbd(
        df: pd.DataFrame,
        tool_name: str,
        parent_dir: str,
        gbd_round_id: int,
        location_id: int,
        year_id: int
) -> pd.DataFrame:
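    """
    Prepare scaled estimates for GBD summarization.

    Adds sex and age aggregates, age-standardized rates, GBD rates, and cause
    fractions (FauxCorrect- or CoDCorrect-style depending on tool_name), then
    returns the scaled estimates concatenated with the new demographics.
    """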
    # Read population from disk
    logging.info("Read in population cache")
    population: pd.DataFrame = io.read_cached_hdf(
        filepath=os.path.join(
            parent_dir,
            FilePaths.INPUT_FILES_DIR,
            FilePaths.POPULATION_FILE
        ),
        key=Keys.POPULATION,
        where=[f"location_id=={location_id} and year_id=={year_id}"]
    )
    population = population[Columns.DEMOGRAPHIC_INDEX + [Columns.POPULATION]]
    population = _compute_population_aggregates(population, gbd_round_id)

    # Compute sex aggregates
    logging.info("Compute sex aggregates and combine with scaled estimates.")
    sex_aggregate = _compute_sex_aggregate(df)
    df = pd.concat(
        [df, sex_aggregate],
        sort=True
    ).reset_index(drop=True)

    # Compute age aggregates
    logging.info("Compute age aggregates and combine with scaled estimates.")
    age_aggregate = _compute_age_aggregates(df, gbd_round_id)

    # Compute ASR
    # First add a metric id to the existing scaled estimates,
    # then merge on a population column, then compute ASR
    logging.info("Compute age standardized rate.")
    df[Columns.METRIC_ID] = gbd.metrics.NUMBER
    logging.info("Merge population on demographic indices.")
    df = _merge_population(df, population)
    age_standardized_rates = _compute_age_standardized_rate(
        df,
        gbd_round_id=gbd_round_id
    )

    # Compute GBD rates
    # Drop pop column from scaled estimates, add a metric_id to the age
    # aggregates, combine with scaled estimates,
    # and then merge on a new population before computing rates for GBD
    # database
    logging.info("Compute GBD rates.")
    df = df.drop(Columns.POPULATION, axis=1)
    age_aggregate[Columns.METRIC_ID] = gbd.metrics.NUMBER
    df = pd.concat(
        [df, age_aggregate],
        sort=True
    ).reset_index(drop=True)
    # merge on a new population column that is not age or sex restricted
    df = _merge_population(df, population)
    # Do not add back into the unscaled data, we need only count space for
    # cause fraction calculation.
    rate_estimates = _compute_rates(df)

    logging.info("Compute GBD cause fractions.")
    if tool_name == GBD.Process.Name.FAUXCORRECT:
        cause_fractions = _compute_cause_fractions(
            df,
            parent_dir=parent_dir,
            location_id=location_id,
            year_id=year_id
        )
    else:
        cause_fractions = _compute_cause_fractions_codcorrect(df)
    cause_fractions[Columns.METRIC_ID] = gbd.metrics.PERCENT

    logging.info("Bringing all our newly created demographics together.")
    df = pd.concat(
        [
            df, age_standardized_rates, rate_estimates,
            cause_fractions
        ],
        sort=True
    )
    if Columns.POPULATION in df.columns:
        df = df.drop(columns=Columns.POPULATION)
    return df
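The sex and age aggregation helpers referenced throughout (_compute_sex_aggregate, _compute_age_aggregates) are not shown in these examples. As a rough illustration only, a plausible sex aggregate could look like the sketch below, assuming draws are summed across sexes and the result is labeled with sex_id 3 (both sexes); the real helpers may differ:

import pandas as pd

def compute_sex_aggregate_sketch(
        df: pd.DataFrame, draw_cols: list) -> pd.DataFrame:
    """Sum draw columns across sexes and label the result sex_id 3."""
    group_cols = [
        col for col in df.columns if col not in draw_cols + ["sex_id"]
    ]
    both_sexes = df.groupby(group_cols, as_index=False)[draw_cols].sum()
    both_sexes["sex_id"] = 3
    return both_sexes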