Example #1
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate sub nationals to country level."""
    df = orig_df.copy()

    # merge on country level location_ids
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = \
        get_country_level_location_id(df.location_id.unique(),
                                      location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate subnational locations to the national level
    df = df[df['location_id'] != df['country_location_id']]
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to original dataframe
    orig_df['loc_agg'] = 0
    df = pd.concat([df, orig_df])  # DataFrame.append was removed in pandas 2.0
    return df
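All of these examples call get_country_level_location_id, which is not shown on this page. For reference only, here is a minimal sketch of what such a helper could look like, assuming the location hierarchy frame carries the usual GBD path_to_top_parent column (comma-separated ancestor ids with the country at index 3); the real implementation may differ.

import pandas as pd

def get_country_level_location_id(location_ids, location_meta_df):
    """Hypothetical sketch: map each location_id to its country-level
    ancestor; countries (level 3) map to themselves."""
    meta = location_meta_df.loc[
        location_meta_df['location_id'].isin(location_ids),
        ['location_id', 'path_to_top_parent']
    ].copy()
    # path_to_top_parent looks like 'global,super_region,region,country,...';
    # the country id sits at index 3 for country and subnational locations
    meta['country_location_id'] = (
        meta['path_to_top_parent'].str.split(',').str[3].astype(int)
    )
    return meta[['location_id', 'country_location_id']]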
Example #2
def get_country_loc_id_map(location_hierarchy):
    """Creates a map of location_id -> country id, meaning countries are
    mapped to themselves and subnationals are mapped to their parent country.
    This is so that we can aggregate data up to the country level.
    """
    all_locs = list(location_hierarchy.query('level >= 3')['location_id'].unique())
    country_location_map = get_country_level_location_id(all_locs, location_hierarchy)
    country_location_map = country_location_map.set_index('location_id').to_dict()['country_location_id']
    return country_location_map
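A hedged usage sketch of the returned dict; the frame df here is illustrative and not part of the original example.

# map each row's location to its country; countries map to themselves
country_map = get_country_loc_id_map(location_hierarchy)
df['country_location_id'] = df['location_id'].map(country_map)

# locations above the country level (level < 3) are not in the map and
# would surface as NaN here
assert df['country_location_id'].notnull().all()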
Example #3
    def append_national_aggregates(self, df):
        """Aggregate subnationals to national and append."""
        country_ids = get_country_level_location_id(
            df.location_id.unique(), self.location_meta_df).set_index(
                'location_id')['country_location_id'].to_dict()
        df["country_loc_id"] = df["location_id"].map(country_ids)
        report_if_merge_fail(df, 'country_loc_id', 'location_id')

        nat_vr_df = df.query('country_loc_id != location_id & '
                             '(data_type_id == 9 | data_type_id == 10)')
        nat_vr_df = self.aggregate_national_vr(nat_vr_df)

        # Now aggregate VA + CHAMPS
        nat_va_df = df.query(
            'country_loc_id != location_id & data_type_id in [8, 12]')
        nat_va_df = self.aggregate_national_va(nat_va_df)

        df = pd.concat([nat_va_df, nat_vr_df, df], ignore_index=True)

        df = df.drop('country_loc_id', axis=1)

        return df
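report_if_merge_fail is also undefined in these snippets; judging from how it is called, it presumably raises when the mapped column contains nulls. A hypothetical sketch consistent with that usage:

def report_if_merge_fail(df, check_col, id_cols):
    """Hypothetical helper: fail loudly if a merge or map left nulls behind."""
    merge_fail = df[df[check_col].isnull()]
    if len(merge_fail) > 0:
        # show which identifiers failed so the hierarchy can be fixed upstream
        raise AssertionError(
            "Merge failed on {}:\n{}".format(
                id_cols, merge_fail[id_cols].drop_duplicates())
        )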
Example #4
    def simple_aggregate(self):
        """Aggregate location_ids to country level."""
        df = self.df.copy()
        country_location_ids = \
            get_country_level_location_id(df.location_id.unique(),
                                          self.location_meta_df)
        df = df.merge(country_location_ids, how='left', on='location_id')
        report_if_merge_fail(df, 'country_location_id', ['location_id'])
        df = df[df['location_id'] != df['country_location_id']]
        df['location_id'] = df['country_location_id']
        df = df.drop(['country_location_id'], axis=1)

        # want to collapse site_id for national level
        group_cols = [col for col in df.columns if col not in self.val_cols]
        group_cols.remove('site_id')
        df = df.groupby(group_cols, as_index=False)[self.val_cols].sum()

        # set site_id for national aggregates (cannot be missing)
        df['site_id'] = 2

        # append national aggregates to the incoming dataframe
        df = pd.concat([df, self.df])  # DataFrame.append was removed in pandas 2.0

        return df
Example #5
    def get_computed_dataframe(self, df, location_meta_df):
        """Split value_column into detailed age and sex groups.

        Applies a relative rate splitting algorithm with a K-multiplier that
        adjusts for the specific population that the data to be split applies
        to.

        Arguments and Attributes:
            df (pandas.DataFrame): must contain all columns needed to merge on
                population:
                    ['location_id', 'age_group_id', 'sex_id', 'year_id'].
                Must be unique on id_cols.
            id_cols (list): list of columns that must exist in df and identify
                observations. Used to preserve df in every way except for
                splitting value_column, age_group_id, and sex_id.
            pop_run_id (int): which population version to use
            cause_set_version_id (int): which cause set version id to use
            value_column (str): must be a column in df that contains values
                to be split
            gbd_round_id (int): which GBD round to use
            gbd_team_for_ages (str): which GBD team to pass to the shared
                function db_queries.get_demographics

        Returns:
            split_df (pandas.DataFrame): contains all the columns passed
                in df, but all age_group_id values will be detailed, all
                sex_ids will be detailed (1, 2), and val will be split
                into these detailed ids.
        """
        # set cache options
        standard_cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': "standard",
            'cache_results': False
        }
        verbose = self.verbose
        value_column = self.value_column
        pop_run_id = self.pop_run_id
        cause_set_version_id = self.cause_set_version_id
        gbd_round_id = self.conf.get_id('gbd_round')
        id_cols = self.id_cols
        gbd_team_for_ages = self.gbd_team_for_ages

        orig_val_sum = df[self.value_column].sum()

        # pull in the relevant populations
        if verbose:
            print("[{}] Prepping population".format(str(datetime.now())))

        locations_in_data = list(set(df.location_id))
        mapping_to_country_location_id = get_country_level_location_id(
            locations_in_data, location_meta_df)
        # Map each subnational location to its country
        df = df.merge(mapping_to_country_location_id,
                      how='left',
                      on='location_id')
        df.rename(columns={'location_id': 'orig_location_id'}, inplace=True)

        df['location_id'] = df['country_location_id']
        df.drop('country_location_id', axis=1, inplace=True)
        country_locations_in_data = list(df['location_id'].unique())
        years_in_data = list(set(df.year_id))
        pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
        pop_df = pop_df.loc[
            (pop_df['location_id'].isin(country_locations_in_data))
            & (pop_df['year_id'].isin(years_in_data))]

        # what columns identify population data
        pop_id_cols = ['location_id', 'age_group_id', 'sex_id', 'year_id']

        assert not pop_df[pop_id_cols].duplicated().any()
        # pull causes table
        if verbose:
            print("[{}] Prepping cause metadata".format(str(datetime.now())))
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=cause_set_version_id,
            **standard_cache_options)

        # pull age sex weights
        if verbose:
            print("[{}] Prepping age sex weights".format(str(datetime.now())))
        dist_df = get_cause_age_sex_distributions(
            distribution_set_version_id=self.distribution_set_version_id,
            **standard_cache_options)
        keep_cols = ['cause_id', 'age_group_id', 'sex_id', 'weight']
        dist_df = dist_df[keep_cols]
        # pull age detail map
        if verbose:
            print("[{}] Prepping age agg to detail "
                  "map".format(str(datetime.now())))
        age_detail_map = getcache_age_aggregate_to_detail_map(
            gbd_round_id=gbd_round_id, **standard_cache_options)

        # create map from aggregate sex ids to detail sex ids
        if verbose:
            print("[{}] Prepping sex detail map".format(str(datetime.now())))
        sex_detail_map = AgeSexSplitter.prep_sex_aggregate_to_detail_map()

        detail_maps = {
            'age_group_id': age_detail_map,
            'sex_id': sex_detail_map
        }

        dist_causes = dist_df.cause_id.unique()

        if verbose:
            print("[{}] Prepping cause_id to weight cause "
                  "map".format(str(datetime.now())))
        cause_to_weight_cause_map = \
            AgeSexSplitter.prep_cause_to_weight_cause_map(
                cause_meta_df, dist_causes)

        val_to_dist_maps = {'cause_id': cause_to_weight_cause_map}
        # which columns are to be split
        split_cols = ['age_group_id', 'sex_id']

        split_inform_cols = ['cause_id']

        value_cols = [value_column]

        if verbose:
            print("[{}] Running RR splitting "
                  "algorithm".format(str(datetime.now())))
        split_df = relative_rate_split(df,
                                       pop_df,
                                       dist_df,
                                       detail_maps,
                                       split_cols,
                                       split_inform_cols,
                                       pop_id_cols,
                                       value_cols,
                                       pop_val_name='population',
                                       val_to_dist_map_dict=val_to_dist_maps,
                                       verbose=verbose)

        df.drop('location_id', axis=1, inplace=True)
        df.rename(columns={'orig_location_id': 'location_id'}, inplace=True)
        if self.collect_diagnostics:
            # making this optional because of memory usage
            self.diag_df = split_df.copy()

        group_columns = list(df.columns)
        group_columns.remove(value_column)
        if verbose:
            print("[{}] Collapsing result".format(str(datetime.now())))
        split_df = split_df.groupby(group_columns,
                                    as_index=False)[value_column].sum()

        if verbose:
            print("[{}] Asserting valid results".format(str(datetime.now())))
        val_diff = abs(split_df[value_column].sum() - orig_val_sum)
        if not np.allclose(split_df[value_column].sum(), orig_val_sum):
            text = "Difference of {} {} from age sex " \
                   "splitting".format(val_diff, value_column)
            raise AssertionError(text)

        # check that all age group ids are good
        good_age_group_ids = db_queries.get_demographics(
            gbd_team_for_ages, gbd_round_id=gbd_round_id)['age_group_id']
        bad = set(split_df.age_group_id) - set(good_age_group_ids)
        if len(bad) > 0:
            text = "Some age group ids still aggregate: {}".format(bad)
            raise AssertionError(text)

        # should be the same set of cause ids
        assert set(split_df.cause_id) == set(df.cause_id)

        return split_df
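To make the relative rate split described in the docstring concrete, here is a toy illustration of the core idea only (it does not use the real relative_rate_split signature above): an aggregate value is distributed across detailed age/sex groups in proportion to weight * population, and the K-multiplier rescales the result so the detailed values sum back to the observed aggregate.

import pandas as pd

# one aggregate observation: 100 deaths in an all-ages, both-sex group
agg_val = 100.0

# detailed groups with reference rates (weights) and local population
detail = pd.DataFrame({
    'age_group_id': [8, 8, 9, 9],
    'sex_id':       [1, 2, 1, 2],
    'weight':       [0.002, 0.001, 0.004, 0.003],
    'population':   [50000, 52000, 48000, 51000],
})

# expected deaths implied by the reference rates
detail['expected'] = detail['weight'] * detail['population']

# the K-multiplier rescales the expectations to match the observed total
k = agg_val / detail['expected'].sum()
detail['val'] = k * detail['expected']

assert abs(detail['val'].sum() - agg_val) < 1e-9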