# Example #1
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate subnational rows to country level and append them.

    Rows whose location_id differs from their country-level location_id are
    summed over VAL_COLS within every other column, relabelled with the
    country's location_id and flagged with loc_agg = 1. The original rows
    are appended unchanged with loc_agg = 0.

    Args:
        orig_df (pandas.DataFrame): data with a 'location_id' column and the
            value columns named in the module-level VAL_COLS. It is not
            modified by this function.
        location_set_version_id: handed to get_current_location_hierarchy to
            select the location hierarchy used for the country lookup.

    Returns:
        pandas.DataFrame: the national aggregates followed by the original
        rows, each carrying a 'loc_agg' flag column.
    """
    import pandas as pd  # local import: only needed for the final concat

    df = orig_df.copy()

    # merge on country level location_ids
    # NOTE(review): the argument lists of the two helper calls below were
    # missing from the source. They are reconstructed by analogy with other
    # visible call sites of get_country_level_location_id -- confirm the
    # keyword name expected by get_current_location_hierarchy.
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = get_country_level_location_id(
        df['location_id'].unique(), location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate sub national locations to national level
    is_subnational = df['location_id'] != df['country_location_id']
    # explicit copy so the column assignment below does not write to a view
    df = df.loc[is_subnational].copy()
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to original dataframe; assign() flags a copy so the
    # caller's frame is left untouched, and pd.concat replaces
    # DataFrame.append, which no longer exists in pandas >= 2.0
    return pd.concat([df, orig_df.assign(loc_agg=0)])
def get_country_loc_id_map(location_hierarchy):
    """Create a map of location_id -> country location_id.

    Countries are mapped to themselves and subnationals are mapped to their
    parent country, so that data can be aggregated up to the country level.

    Args:
        location_hierarchy (pandas.DataFrame): location metadata with at
            least 'level' and 'location_id' columns; it is also handed to
            get_country_level_location_id.

    Returns:
        dict: location_id -> country_location_id for every location whose
        level is >= 3.
    """
    # presumably level 3 is the country level, so this keeps countries and
    # everything below them -- confirm against the hierarchy definition
    deep_locs = location_hierarchy.query('level >= 3')['location_id']
    all_locs = list(deep_locs.unique())
    country_location_map = get_country_level_location_id(
        all_locs, location_hierarchy)
    # select the single column first so only that column is converted
    return country_location_map.set_index(
        'location_id')['country_location_id'].to_dict()
# Example #3
    def append_national_aggregates(self, df):
        """Aggregate subnationals to national and append.

        Subnational rows (those whose location_id differs from their
        country-level location_id) are aggregated separately for two groups
        of data_type_id: 9 and 10 through aggregate_national_vr, 8 and 12
        through aggregate_national_va. The aggregates are stacked on top of
        the incoming rows.

        Args:
            df (pandas.DataFrame): must contain 'location_id' and
                'data_type_id'. It is not modified by this method.

        Returns:
            pandas.DataFrame: VA aggregates, then VR aggregates, then the
            incoming rows, with a fresh integer index and without the
            temporary 'country_loc_id' column.
        """
        # NOTE(review): the tail of this expression was truncated in the
        # source after ".set_index("; the column names are reconstructed
        # from another visible use of the same helper's output -- confirm.
        country_ids = get_country_level_location_id(
            df.location_id.unique(), self.location_meta_df
        ).set_index('location_id')['country_location_id']
        # assign() instead of item assignment so the caller's frame does not
        # silently gain the helper column
        df = df.assign(country_loc_id=df["location_id"].map(country_ids))
        report_if_merge_fail(df, 'country_loc_id', 'location_id')

        # in DataFrame.query, & and | bind like `and` / `or`, so the
        # comparisons inside the parentheses are evaluated first
        nat_vr_df = df.query('country_loc_id != location_id & '
                             '(data_type_id == 9 | data_type_id == 10)')
        nat_vr_df = self.aggregate_national_vr(nat_vr_df)

        # Now aggregate VA + CHAMPS
        nat_va_df = df.query(
            'country_loc_id != location_id & data_type_id in [8, 12]')
        nat_va_df = self.aggregate_national_va(nat_va_df)

        df = pd.concat([nat_va_df, nat_vr_df, df], ignore_index=True)

        return df.drop('country_loc_id', axis=1)
# Example #4
    def simple_aggregate(self):
        """Aggregate location_ids to country level.

        Works on a copy of self.df: rows whose location_id differs from
        their country-level location_id are relabelled with the country's
        location_id, summed over self.val_cols within every other column,
        given site_id = 2, and returned together with the rows of self.df.

        Returns:
            pandas.DataFrame: the national aggregates followed by the
            unmodified rows of self.df.
        """
        import pandas as pd  # local import: only needed for the final concat

        df = self.df.copy()
        # NOTE(review): the right-hand side of this assignment was missing
        # from the source. The call is reconstructed by analogy with other
        # visible uses of get_country_level_location_id; the attribute
        # self.location_meta_df is an assumption -- confirm it exists on
        # this class.
        country_location_ids = get_country_level_location_id(
            df.location_id.unique(), self.location_meta_df)
        df = df.merge(country_location_ids, how='left', on='location_id')
        report_if_merge_fail(df, 'country_location_id', ['location_id'])
        is_subnational = df['location_id'] != df['country_location_id']
        # explicit copy so the column assignment below does not write to a view
        df = df.loc[is_subnational].copy()
        df['location_id'] = df['country_location_id']
        df = df.drop(['country_location_id'], axis=1)

        # want to collapse site_id for national level
        # NOTE(review): as written, site_id stays in group_cols whenever it
        # is a column of df and not in self.val_cols, so rows are not
        # actually collapsed across sites here -- confirm the intent.
        group_cols = [col for col in df.columns if col not in self.val_cols]
        df = df.groupby(group_cols, as_index=False)[self.val_cols].sum()

        # set site_id for national aggregates (cannot be missing)
        df['site_id'] = 2

        # append national aggregates to the incoming dataframe; pd.concat
        # replaces DataFrame.append, which no longer exists in pandas >= 2.0
        return pd.concat([df, self.df])
# Example #5
    def get_computed_dataframe(self, df, location_meta_df):
        """Split value_column into detailed age and sex groups.

        Applies a relative rate splitting algorithm with a K-multiplier that
        adjusts for the specific population that the data to be split applies

        Arguments and Attributes:
            df (pandas.DataFrame): must contain all columns needed to merge on
                    ['location_id', 'age_group_id', 'sex_id', 'year_id'].
                Must be unique on id_cols.
            id_cols (list): list of columns that must exist in df and identify
                observations. Used to preserve df in every way except for
                splitting value_column, age_group_id, and sex_id.
            pop_run_id (int): which population version to use
            cause_set_version_id (int): which cause set version id to use
            value_column (str): must be a column in df that contains values
                to be split
            gbd_round_id (int): which gbd_round is it
            gbd_team_for_ages (str): what gbd team to use to call the shared
                function db_queries.get_demographics

            split_df (pandas.DataFrame): contains all the columns passed
                in df, but all age_group_id values will be detailed, all
                sex_ids will be detailed (1, 2), and val will be split
                into these detailed ids.
        # NOTE(review): this method appears to have lost lines: the docstring
        # above is never terminated and several statements below are cut off
        # mid-expression. Every such spot is flagged with a NOTE(review) and
        # must be restored from the upstream source before this can run.
        # set cache options
        standard_cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': "standard",
            'cache_results': False
        # NOTE(review): the closing brace of standard_cache_options is missing.
        # copy configuration off the instance into locals used below
        verbose = self.verbose
        value_column = self.value_column
        pop_run_id = self.pop_run_id
        cause_set_version_id = self.cause_set_version_id
        gbd_round_id = self.conf.get_id('gbd_round')
        id_cols = self.id_cols
        gbd_team_for_ages = self.gbd_team_for_ages

        # remember the incoming total so the split result can be checked
        # against it near the end of the method
        orig_val_sum = df[self.value_column].sum()

        # pull in populations
        # get relevant populations
        if verbose:
            print("[{}] Prepping population".format(str(datetime.now())))

        locations_in_data = list(set(df.location_id))
        mapping_to_country_location_id = get_country_level_location_id(
            locations_in_data, location_meta_df)
        # Map subnational to its country
        # NOTE(review): the merge call below is truncated; its remaining
        # arguments and closing parenthesis are missing.
        df = df.merge(mapping_to_country_location_id,
        # keep the incoming location under orig_location_id; it is restored
        # after the split
        df.rename(columns={'location_id': 'orig_location_id'}, inplace=True)

        # from here on location_id holds the country-level location
        df['location_id'] = df['country_location_id']
        df.drop('country_location_id', axis=1, inplace=True)
        country_locations_in_data = list(df['location_id'].unique())
        years_in_data = list(set(df.year_id))
        pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
        # NOTE(review): the first operand of the "&" filter below is missing;
        # country_locations_in_data is otherwise unused, so it was presumably
        # a location filter -- confirm.
        pop_df = pop_df.loc[
            & (pop_df['year_id'].isin(years_in_data))]

        # what columns identify population data
        pop_id_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id']

        # population rows must be unique on their identifying columns
        assert not pop_df[pop_id_cols].duplicated().any()
        # pull causes table
        if verbose:
            print("[{}] Prepping cause metadata".format(str(datetime.now())))
        # NOTE(review): the arguments and closing parenthesis of this call
        # are missing.
        cause_meta_df = get_current_cause_hierarchy(

        # pull age sex weights
        if verbose:
            print("[{}] Prepping age sex weights".format(str(datetime.now())))
        # NOTE(review): the arguments and closing parenthesis of this call
        # are missing.
        dist_df = get_cause_age_sex_distributions(
        keep_cols = ['cause_id', 'age_group_id', 'sex_id', 'weight']
        dist_df = dist_df[keep_cols]
        # pull age detail map
        if verbose:
            # NOTE(review): the rest of this message and the end of the
            # print call are missing.
            print("[{}] Prepping age agg to detail "
        age_detail_map = getcache_age_aggregate_to_detail_map(
            gbd_round_id=gbd_round_id, **standard_cache_options)

        # create map from aggregate sex ids to detail sex ids
        if verbose:
            print("[{}] Prepping sex detail map".format(str(datetime.now())))
        sex_detail_map = AgeSexSplitter.prep_sex_aggregate_to_detail_map()

        detail_maps = {
            'age_group_id': age_detail_map,
            'sex_id': sex_detail_map
        # NOTE(review): the closing brace of detail_maps is missing.

        dist_causes = dist_df.cause_id.unique()

        if verbose:
            # NOTE(review): the rest of this message and the end of the
            # print call are missing.
            print("[{}] Prepping cause_id to weight cause "
        # NOTE(review): the name of the function being called here is
        # missing; only its arguments survive.
        cause_to_weight_cause_map = \
                cause_meta_df, dist_causes)

        val_to_dist_maps = {'cause_id': cause_to_weight_cause_map}
        # which columns are to be split
        split_cols = ['age_group_id', 'sex_id']

        split_inform_cols = ['cause_id']

        value_cols = [value_column]

        if verbose:
            # NOTE(review): the rest of this message and the end of the
            # print call are missing.
            print("[{}] Running RR splitting "
        # NOTE(review): every argument after df and the closing parenthesis
        # of this call are missing.
        split_df = relative_rate_split(df,

        # undo the country-level swap on df: drop the country location_id
        # and restore the incoming one
        df.drop('location_id', axis=1, inplace=True)
        df.rename(columns={'orig_location_id': 'location_id'}, inplace=True)
        if self.collect_diagnostics:
            # making this optional because of memory usage
            self.diag_df = split_df.copy()

        # group on the columns of the (restored) input frame
        group_columns = list(df.columns)
        if verbose:
            print("[{}] Collapsing result".format(str(datetime.now())))
        # NOTE(review): the groupby call below is truncated; its remaining
        # arguments and the aggregation applied to the groups are missing.
        split_df = split_df.groupby(group_columns,

        if verbose:
            print("[{}] Asserting valid results".format(str(datetime.now())))
        # splitting must preserve the total of value_column (within
        # np.allclose tolerance)
        val_diff = abs(split_df[value_column].sum() - orig_val_sum)
        if not np.allclose(split_df[value_column].sum(), orig_val_sum):
            text = "Difference of {} {} from age sex " \
                   "splitting".format(val_diff, value_column)
            raise AssertionError(text)

        # check that all age group ids are good
        good_age_group_ids = db_queries.get_demographics(
            gbd_team_for_ages, gbd_round_id=gbd_round_id)['age_group_id']
        bad = set(split_df.age_group_id) - set(good_age_group_ids)
        if len(bad) > 0:
            text = "Some age group ids still aggregate: {}".format(bad)
            raise AssertionError(text)

        # should be the same set of cause ids
        assert set(split_df.cause_id) == set(df.cause_id)

        return split_df