Example #1
from cod_prep.downloaders import (  # assumed source module; the snippet begins mid-import
    get_package_list,  # assumed: used by get_package_code_ids below but cut from the visible import
    get_country_level_location_id,
    get_cause_map,
    add_location_metadata,
    get_current_location_hierarchy
)
from cod_prep.utils import (
    print_log_message,
    report_duplicates,
    report_if_merge_fail,
    cod_timestamp
)
from cod_prep.claude.claude_io import get_claude_data, makedirs_safely
from cod_prep.claude.configurator import Configurator
from save_proportions_for_tableau import SharedPackage

CONF = Configurator()

MODEL_DATA_CODE_SYSTEMS = [1, 6]
RDP_REG_DIR = CONF.get_directory('rdp_regressions')


def get_package_code_ids(regression_specification, code_system_id):
    """Returns code_ids for garbage codes in package for given code system"""
    package_description = regression_specification[
        'package_descriptions'
    ][code_system_id]

    packages = get_package_list(code_system_id)
    package_id = packages.loc[
        packages['package_description'] == package_description,
        'package_id'
    ]  # the original snippet is truncated here
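
A minimal usage sketch, assuming the truncated function goes on to return the code_ids named in its docstring; the spec layout and package description strings below are hypothetical placeholders:

# Hypothetical regression specification; real ones live under RDP_REG_DIR.
regression_specification = {
    'package_descriptions': {
        1: 'Unspecified site cancer',   # ICD10
        6: 'Unspecified site cancer',   # ICD9
    },
}

for code_system_id in MODEL_DATA_CODE_SYSTEMS:
    code_ids = get_package_code_ids(regression_specification, code_system_id)
    print_log_message(f"code system {code_system_id}: {len(code_ids)} garbage code_ids")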
Example #2
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id, iso3):
    """Run the full pipeline, chaining together CodProcesses."""
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **cache_options)

    # get location hierarchy
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)

    # get envelope
    env_meta_df = get_env(env_run_id=env_run_id, **cache_options)

    # get env with HIV
    env_hiv_meta_df = get_env(env_run_id=env_run_id,
                              with_hiv=True,
                              **cache_options)

    # get age groups
    age_meta_df = get_ages(**cache_options)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    cause_map = get_cause_map(code_map_version_id=code_map_version_id,
                              **cache_options)

    package_map = get_package_map(code_system_id=code_system_id,
                                  **cache_options)

    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output("misdiagnosiscorrection", nid, extract_type_id)

    cause_package_hierarchy = get_cause_package_hierarchy(
        code_system_id, **cache_options)

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # aggregate location
    # defaults to simple location -> national aggregation
    # running full aggregation for India Survey data
    print_log_message("Aggregating location to country level")
    location_aggregator = LocationAggregator(df, location_meta_df)
    if (data_type_id == 7) & (iso3 == 'IND'):
        df = location_aggregator.get_computed_dataframe('full')
    else:
        df = location_aggregator.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        maternal_hiv_remover = MaternalHIVRemover(df, env_meta_df,
                                                  env_hiv_meta_df, source, nid)
        df = maternal_hiv_remover.get_computed_dataframe()

    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    print_log_message(log_statistic(df))

    print_log_message("Converting to cause fractions")
    df = df.loc[df['sample_size'] > 0]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    print_log_message(log_statistic(df))

    if data_type_id == VA_DATA_TYPE:
        # run VA anemia adjustment
        print_log_message("Running VA Anemia adjustment")
        va_anemia_adjuster = AnemiaAdjuster()
        df = va_anemia_adjuster.get_computed_dataframe(df)

    if data_type_id == POLICE_DATA_TYPE:
        if source == 'Various_RTI':
            rti_adjuster = RTIAdjuster(df, cause_meta_df, age_meta_df,
                                       location_meta_df)
            df = rti_adjuster.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # issue: rows with > 0 sample size are dropped
        # most common in maternal data, but relevant anywhere
        # we have only cc_code and one other cause and there
        # are 0 deaths for the other cause for a given age/sex
        cause_list = df.cause_id.unique()
        square_me = (len(cause_list) == 2) & (CC_CODE in cause_list)
        if (source in MATERNAL_SQUARED) or square_me:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    print_log_message(log_statistic(df))

    print_log_message("Splitting locations.")
    env_loc_splitter = EnvelopeLocationSplitter(df, env_meta_df, source)
    df = env_loc_splitter.get_computed_dataframe()
    print_log_message(log_statistic(df))

    # aggregate causes
    print_log_message("Aggregating causes")
    cause_aggregator = CauseAggregator(df, cause_meta_df, source)
    df = cause_aggregator.get_computed_dataframe()
    print_log_message(log_statistic(df))

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    parent_gbg_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy, cause_meta_df,
        package_map, cause_map, remove_decimal, disagg_df, misdc_df)
    df = parent_gbg_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    hmp = HIVMatPAFs()
    df = hmp.get_computed_dataframe(df, cause_meta_df, location_meta_df)
    print_log_message(log_statistic(df))

    # TODO
    # ** In the recode step for BTL some cancer deaths were moved to the
    # cancer parent. The squaring step created 0's. Get rid of the 0's in
    # country-years the recode was previously applied to.

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    hiv_shock_remover = SampleSizeCauseRemover(cause_meta_df)
    df = hiv_shock_remover.get_computed_dataframe(df)
    print_log_message(log_statistic(df))

    # not sure why we do this, but could use a comment of some kind.
    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    if dataset_has_redistribution_variance(data_type_id, source):
        # Determine the redistribution variance
        rdvar = RedistributionVarianceEstimator(
            nid,
            extract_type_id,
            cause_meta_df,
            remove_decimal,
            code_system_id,
            cause_map,
            package_map,
            code_map_version_id=code_map_version_id)
        df = rdvar.get_computed_dataframe(df, **cache_options)

    return df
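
A hedged sketch of how run_phase might be driven from a main(); the phase name on the first line and every ID value are placeholders for illustration, not real arguments:

def main():
    # Placeholder NID/extract pair for illustration only.
    nid, extract_type_id = 12345, 1
    # assumed upstream phase name; run_phase itself re-fetches the
    # disaggregation and misdiagnosiscorrection outputs it needs
    df = get_phase_output("redistribution", nid, extract_type_id)
    df = run_phase(
        df,
        cause_set_version_id=999,
        location_set_version_id=999,
        data_type_id=9,
        env_run_id=999,
        source="Example_Source",
        nid=nid,
        extract_type_id=extract_type_id,
        remove_decimal=True,
        code_map_version_id=999,
        iso3="USA",
    )
    print_log_message(f"Finished with {len(df)} rows")

if __name__ == '__main__':
    main()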
Example #3
class MCoDMapper:
    """Map ICD codes to code_ids, cause_ids.

    Arguments:
        int_cause (str): the intermediate cause of interest (e.g. sepsis)
        code_system_id (int): the ICD category, determines which map to use
        code_map_version_id (int): the version of the map to use
        df (dataframe): dataframe of formatted mcod data

    Returns:
        df (dataframe): dataframe with the underlying cause mapped to code_id and cause_id,
        and the causes in the chain flagged for containing the intermediate cause of interest.
    """
    cache_options = {'force_rerun': False, 'block_rerun': True}
    conf = Configurator()
    inj_causes = ['x59', 'y34']
    int_cause_name_dict = {
        'x59': ['unspecified external factor x59'],
        'y34': ['external causes udi,type unspecified-y34']
    }
    possible_int_causes = list(int_cause_name_dict.keys())

    def __init__(self, int_cause, code_system_id, code_map_version_id,
                 drop_p2):
        self.int_cause = int_cause
        self.code_system_id = code_system_id
        self.code_map_version_id = code_map_version_id
        self.drop_p2 = drop_p2
        assert self.int_cause in self.possible_int_causes, \
            f"{self.int_cause} is not a valid intermediate cause"
        self.full_cause_name = self.int_cause_name_dict[self.int_cause]
        if not isinstance(self.full_cause_name, list):
            self.full_cause_name = [self.full_cause_name]

    @staticmethod
    def get_code_columns(df):
        """Get a list of raw cause columns with ICD codes as values."""
        col_names = list(df.columns)
        code_cols = [
            x for x in col_names if "multiple_cause" in x and "pII" not in x
        ] + ['cause']
        return code_cols

    @staticmethod
    def _get_cause_num(mcod_col):
        """Get sort order for cause columns.

        Assumes you have an underlying cause (cause_x) column and chain columns (multiple_cause_x)
        and that the value to sort off of is after the second underscore.
        """
        if mcod_col.startswith('cause'):
            return '0'
        else:
            assert re.match(r"^multiple_cause_[a-z]*[0-9]*", mcod_col), \
                f"column {mcod_col} does not match expected format: multiple_cause_x"
            return mcod_col.split('_')[2]

    @staticmethod
    def prep_raw_mapped_cause_dictionary(raw_cols, mapped_cols):
        """Create dictionary of raw cause columns to mapped cause columns.

        Ensures, e.g., that "multiple_cause_2_mapped" is the value associated
        with the "multiple_cause_2" key.
        """
        raw_cols = sorted(raw_cols, key=MCoDMapper._get_cause_num)
        mapped_cols = sorted(mapped_cols, key=MCoDMapper._get_cause_num)
        return dict(list(zip(raw_cols, mapped_cols)))

    @staticmethod
    def fix_icd_codes(df, codes, code_system_id):
        """Adjustment to icd9/10 cause codes."""
        if code_system_id == 6:
            # according to Mohsen, codes between 800 and 900 need an E prefix if underlying;
            # assume 800-900 codes are N codes if in the chain, so don't add any prefix there
            df.loc[df['cause'].str.contains('^[89]'),
                   'cause'] = 'E' + df['cause']
        elif code_system_id == 1:
            # S + T codes are always intermediate causes of death
            # V + Y codes are always the underlying cause of death
            violations = df['cause'].str.contains('^[ST]')
            num_violations = len(df[violations])
            if num_violations > 0:
                print_log_message(
                    f"Found S or T code as underlying cause, dropping {num_violations} rows"
                )
                assert np.isclose(len(df[~violations]), len(df), rtol=.10)
                df = df.loc[~violations]

            # next check violations in chain causes
            # V and Y codes can only be UCOD
            for col in codes:
                if col != 'cause':
                    violations = df[col].str.contains('^[VY]')
                    num_violations = len(df[violations])
                    if num_violations > 0:
                        print_log_message(
                            f"Setting {num_violations} rows with V/Y in chain to 0000 for {col}"
                        )
                        df.loc[violations, col] = '0000'
        return df

    @staticmethod
    def prep_cause_package_map(cause_package_map):
        """Expects cause-package map.

        Build a dictionary of value -> map_id, since we only care about the
        package name or the cause_id, not the individual ICD-code-level code.
        """
        check_map = cause_package_map[['map_id', 'map_type']].drop_duplicates()
        report_duplicates(check_map, 'map_id')
        cause_package_map = cause_package_map.set_index(
            'value')['map_id'].to_dict()
        return cause_package_map

    @staticmethod
    def prep_cause_map(cause_map):
        """Clean up cause map."""
        cause_map['value'] = clean_icd_codes(cause_map['value'],
                                             remove_decimal=True)
        # duplicates are a result of weird _gc mappings; the duplicates dropped
        # all have the higher sort_order (999999)
        cause_map = cause_map.drop_duplicates(['code_system_id', 'value'])
        cause_map['code_id'] = cause_map['code_id'].astype(int)
        cause_map = cause_map.set_index('value')['code_id'].to_dict()
        return cause_map

    @staticmethod
    def map_cause_codes(df, coi_map, coi, cols_to_map=None):
        """Map cause codes to any given value (e.g. acause, category, etc.).

        Inputs
        df (pd dataframe): incoming, unmapped data with ICD codes
        coi_map (dict): special map designed just for one cause of interest
        coi (string): cause of interest
        cols_to_map (list, optional): code columns to map; defaults to all raw code columns
        Returns
        df (pd dataframe): mapped dataframe with an additional column for each mapped cause
        """
        df = df.copy()
        if not cols_to_map:
            cols_to_map = MCoDMapper.get_code_columns(df)
        # map chain causes using cause of interest map
        for col in cols_to_map:
            df[col] = df[col].fillna('0000')
            df[col] = df[col].astype(object)
            df[col + '_' + coi] = df[col].map(coi_map)
        return df

    @staticmethod
    def trim_and_remap(df, code_dict, cause_map, code_system_id):
        """Trim ICD codes to 4 digits, map again, then 3, and map again."""
        df = df.copy()
        # before trimming, map "null" chain causes to '0000'
        for code, mapped_code in list(code_dict.items()):
            df.loc[df[code] == '0000', mapped_code] = '0000'

        # trim and re-map null mappings
        for n in reversed(range(3, 6)):
            for code, mapped_code in list(code_dict.items()):
                temp_code = 'temp_' + code
                df[temp_code] = df[code].copy()
                try:
                    df.loc[df[mapped_code].isnull(),
                           temp_code] = df[temp_code].apply(lambda x: x[0:n])
                except TypeError:
                    # we were seeing a TypeError for some unicode issues
                    if mapped_code != 'cause_mapped':
                        df[mapped_code] = '0000'
                    else:
                        print(f"problem code here... {df[code]}")
                df.loc[df[mapped_code].isnull(),
                       mapped_code] = df[temp_code].map(cause_map)
                df = df.drop(temp_code, axis=1)
        return df

    def prep_int_cause_map(self):
        map_dir = self.conf.get_directory('process_inputs')
        code_system_name = {1: 'icd10', 6: 'icd9'}[self.code_system_id]
        df = pd.read_excel(f"{map_dir}/mcause_map.xlsx",
                           dtype={'icd_code': object})
        df = df[['icd_code', 'package_description',
                 'code_system']].drop_duplicates()

        # clean up strings: strip decimals from codes, lowercase descriptions
        df['icd_code'] = clean_icd_codes(df['icd_code'], remove_decimal=True)
        df[['package_description', 'code_system']] = \
            df[['package_description', 'code_system']].apply(
            lambda x: x.str.lower())

        # only keep the rows we need for this intermediate cause
        # keep n-code rows for injuries
        df = df.loc[(df['package_description'].isin(self.full_cause_name)) | (
            df['package_description'].str.contains('nn'))].drop_duplicates()

        # intermediate causes should be mutually exclusive
        report_duplicates(df, ['icd_code', 'code_system'])

        # subset to just the code system being run through
        df = df.query(f'code_system == "{code_system_name}"')

        assert len(df) > 0, \
            f"There are no mappings for {code_system_name}, {self.full_cause_name}"

        # convert to a dictionary
        mcod_map = dict(list(zip(df['icd_code'], df['package_description'])))

        return mcod_map

    def capture_int_cause(self, df, int_cause_cols):
        """Flag deaths related to the intermediate cause."""
        df[self.int_cause] = None

        for col in int_cause_cols:
            df[col] = df[col].fillna("other")
            df.loc[df[col].isin(self.full_cause_name), self.int_cause] = 1
        df[self.int_cause] = df[self.int_cause].fillna(0)

        assert df[self.int_cause].notnull().values.all()

        return df

    def set_part2_flag(self, df):
        """Mark whether or not the cause of interest is from part 2 of the death certificate."""
        p2_cols = [x for x in df.columns if 'pII' in x]
        int_cause_chains = [
            x for x in df.columns
            if (self.int_cause in x) and ('multiple' in x)
        ]
        p2_chain_dict = dict(list(zip(p2_cols, int_cause_chains)))
        df['pII_' + self.int_cause] = 0
        for p2_col, chain in sorted(p2_chain_dict.items()):
            df.loc[(df[chain].isin(self.full_cause_name)) & (df[p2_col] == 1),
                   'pII_' + self.int_cause] = 1
        return df

    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df,
                               'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df
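
A minimal sketch of driving MCoDMapper end to end; the input dataframe and the code_map_version_id are assumptions based on the class docstring, and df_formatted stands in for formatted mcod data with a 'cause' column plus 'multiple_cause_*' chain columns:

mapper = MCoDMapper(
    int_cause='x59',             # one of the two supported intermediate causes
    code_system_id=1,            # ICD10
    code_map_version_id=999,     # placeholder version
    drop_p2=False,               # keep the part-2 flag step
)
mapped_df = mapper.get_computed_dataframe(df_formatted)
# rows where the X59 package appears anywhere in the chain are flagged with 1
x59_deaths = mapped_df.loc[mapped_df['x59'] == 1]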
Example #4
    def __init__(self, df, loc_meta_df, cause_meta_df):
        self.df = df
        self.start_deaths = self.df['deaths'].sum()
        self.loc_meta_df = loc_meta_df
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")
Example #5
    def __init__(self, df, cause_meta_df):
        self.df = df
        self.start_deaths = self.df.deaths.sum()
        self.conf = Configurator('standard')
        self.cause_meta_df = cause_meta_df
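
The two constructors above follow the CodProcess pattern used throughout this section: capture the input dataframe, record the starting death total, and stash metadata. A minimal sketch of that pattern, with a hypothetical subclass and an illustrative deaths-conserved check:

class ExampleAdjuster(CodProcess):  # hypothetical subclass for illustration
    def __init__(self, df, cause_meta_df):
        self.df = df
        self.start_deaths = self.df['deaths'].sum()
        self.conf = Configurator('standard')
        self.cause_meta_df = cause_meta_df

    def get_computed_dataframe(self):
        df = self.df.copy()
        # ... adjust df here ...
        # start_deaths recorded in __init__ enables a conservation check
        assert abs(df['deaths'].sum() - self.start_deaths) < 1e-6
        return df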
Example #6
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000"""
    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        # initialize this to something crazy small, then adjust later when
        # nonzero floor file is read in
        self.min_possible_val = 1e-50

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.

        Make death rates and calculate the cf as if the rate were 2
        MADs below the "global" median. Every cause in the floor file
        is checked to ensure non-zero values in any non-restricted age-sex.
        So, just check and make sure there is something there for the cause,
        filling in zeroes where missing if the cause is present in the floor
        file (will break if there is a cause not present)
        """

        orig_cols = list(self.df.columns)
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]
        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)
        self.diag_df = self.df
        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))
        self.df = self.df[orig_cols]
        self.df = self.df.append(age_aggs)
        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        data_min_val = self.df[self.cf_cols][self.df[self.cf_cols] > 0].min().min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        # there are so many checks before this that it would be very surprising
        # if this line does anything, but it's another round of safety to make
        # sure that cause fractions aren't being replaced with null
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])
        self.min_possible_val = self.df.cf_replace.min()

    def replace_cf(self, check_cf_col):

        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def convert_nonzero_mad(self, df, cmdf):
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5,
            5: 6,
            10: 7,
            15: 8,
            20: 9,
            25: 10,
            30: 11,
            35: 12,
            40: 13,
            45: 14,
            50: 15,
            55: 16,
            60: 17,
            65: 18,
            70: 19,
            75: 20,
            80: 30,
            85: 31,
            90: 32,
            95: 235,
            91: 2,
            93: 3,
            94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)

        # make sure 2017-2018 are still missing
        missing_years = [2017, 2018]
        assert df.loc[df['year_id'].isin(
            missing_years)].floor.isnull().values.all()
        df = df.loc[~df['year_id'].isin(missing_years)]

        # We have determined that the floor is missing values for:
        # (1) certain cause/age/sexes in 2016 - we will use the 2015 floor to fill in these values
        # (2) certain cause/age/sexes across the entire time series - really nothing we
        # can do short of resetting the floor
        merge_cols = ['cause_id', 'age_group_id', 'sex_id']
        report_duplicates(df, merge_cols + ['year_id'])
        new_floor = pd.merge(df.loc[df.year_id == 2016].copy(),
                             df.loc[df.year_id == 2015].copy(),
                             how='outer',
                             on=merge_cols,
                             suffixes=('', '_2015'))
        new_floor = new_floor.fillna({'floor': new_floor['floor_2015']})\
            .loc[:, merge_cols + ['year_id', 'floor']]
        df = df.loc[df.year_id != 2016]\
            .append(new_floor, ignore_index=True, sort=True)
        # If anything else is still missing, make sure it's missing for the entire time
        # series - otherwise we should write something more sophisticated to fill it in
        assert df.assign(floor_null=df.floor.isnull())\
            .groupby(merge_cols + ['floor_null'])['year_id'].apply(
            lambda x: set(x) == set(range(1980, 2017))).all()

        # copy 2016 to 2017, 2018
        for year in missing_years:
            df = df.append(
                df.loc[df.year_id == 2016].copy().assign(year_id=year),
                ignore_index=True)

        # Due to age restriction changes since last round, we now have data in cause/age
        # groups where we had no floor in GBD 2017
        # Add in a nonzero floor created based on GBD 2019 data for these cause/age groups
        new_cause_ages = pd.read_csv(self.conf.get_resource('nonzero_floor_new_age_restrictions'))\
            .drop('borrow_age_group_id', axis='columns')
        assert new_cause_ages.notnull().values.all()
        df = df.append(new_cause_ages, sort=True)

        # drop remaining null floors, then confirm no duplicates remain
        df = df.loc[df.floor.notnull()]
        report_duplicates(df,
                          ['year_id', 'cause_id', 'age_group_id', 'sex_id'])

        return df

    def fill_na_floors(self, df):
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        nonzero_mad = pd.read_csv(self.conf.get_resource("nonzero_floor_mad"))
        nonzero_mad = self.convert_nonzero_mad(nonzero_mad, cmdf)
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
        if self.df.floor.isnull().any():
            self.df = self.df.groupby(['year_id', 'sex_id',
                                       'cause_id']).apply(self.fill_na_floors)
        if self.df.floor.isnull().any():
            trouble_causes = self.df[self.df.floor.isnull()].cause_id.unique()
            filler = np.median(self.df[~self.df.floor.isnull()].floor)
            print_log_message("using nonzero filler because"
                              " of these causes: {}".format(trouble_causes))
            self.df.floor = self.df.floor.fillna(filler)
        self.df.loc[self.df.cause_id == 975, 'floor'] = 1e-50

        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()
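
To make the replacement rule in make_replace_cf/replace_cf concrete, a small standalone sketch of the arithmetic; the values are invented for illustration:

import pandas as pd

demo = pd.DataFrame({
    'cf_final':   [1e-9, 0.0, 0.02],
    'floor':      [1e-7, 1e-7, 1e-7],   # floor mortality rate, deaths per person-year
    'population': [1e7, 1e7, 1e7],
    'mean_env':   [1e5, 1e5, 1e5],      # all-cause deaths envelope
})
demo['rate'] = demo['cf_final'] * demo['mean_env'] / demo['population']
demo['cf_replace'] = demo['floor'] * demo['population'] / demo['mean_env']
needs_floor = (demo['cf_final'] > 0) & (demo['rate'] < demo['floor'])
demo.loc[needs_floor, 'cf_final'] = demo['cf_replace']
# Row 0 (tiny but nonzero cf) is raised to cf_replace = 1e-7 * 1e7 / 1e5 = 1e-5;
# row 1 stays 0 (zero cfs are never floored); row 2 is above the floor and unchanged.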
Example #7
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000"""
    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        self.min_possible_val = 1e-50

    def _check_all_floors_exist(self, nzf_df):
        '''Check that all expected cancers, ages, sexes, and years are present
        and have nonzero floor values.
        '''
        def _remove_ages_less_than(a, b):
            '''Remove ages below b from a, keeping ids 2-4 (under-5 detail
            age groups) when b == 5.'''
            orig_list = a.copy()
            for val in orig_list:
                # keep under-5 detail groups when the start age group is 5
                if b == 5 and val in [2, 3, 4]:
                    continue
                if val < b:
                    a.remove(val)
            return a

        print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
        # create cause_list
        db_link = cdb.db_api(db_connection_name='cancer_db')
        gbd_id = utils.get_gbd_parameter('current_gbd_round')
        registry_entity = db_link.get_table('registry_input_entity')
        registry_entity = registry_entity.loc[
            registry_entity['gbd_round_id'].eq(gbd_id)
            & registry_entity['is_active'].eq(1), ]
        cancer_metadata = registry_entity[[
            'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
        ]]
        causes_checklist = registry_entity['acause'].unique().tolist()

        # exceptions for nonzero floors
        causes_checklist.remove('neo_nmsc_bcc')
        causes_checklist.remove('neo_ben_intest')
        causes_checklist.remove('neo_ben_utr')
        causes_checklist.remove('neo_ben_other')
        causes_checklist.remove('neo_ben_brain')
        causes_checklist.remove('_gc')

        # create year_list
        year_start = utils.get_gbd_parameter('min_year_cod')
        year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
        year_checklist = list(range(year_start, year_end))

        # sex &  age_id checklist
        age_id_checklist = [
            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
            32, 235, 2, 3, 4
        ]  #age_ids for 0-95 ages
        sex_checklist = [1, 2]

        # print any causes/years/sexes that are expected and missing
        for cancer in causes_checklist:
            print('working on...{}'.format(cancer))
            subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
            age_start = int(
                cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                    'yll_age_start'])
            age_start = (age_start /
                         5) + 5  # conversion from age to GBD age_group_id
            if len(subset) == 0:
                print('MISSING CAUSE... {} '.format(cancer))
            missing_ages = set(age_id_checklist) - set(
                subset['age_group_id'].unique().tolist())
            missing_ages = list(missing_ages)
            missing_ages = _remove_ages_less_than(missing_ages, age_start)
            if len(missing_ages) > 0:
                print('missing the following ages for {}: {}'.format(
                    cancer, missing_ages))
            missing_sexes = set(sex_checklist) - set(
                subset['sex_id'].unique().tolist())
            if len(missing_sexes) > 0:
                print('missing the following sexes for {}: {}'.format(
                    cancer, missing_sexes))
            missing_years = set(year_checklist) - set(
                subset['year_id'].unique().tolist())
            if len(missing_years) > 0:
                print('missing the following years for {}: {}'.format(
                    cancer, missing_years))
        return

    def format_nzf(self, nzf_df, cmdf):
        '''Merge the acause column onto the nonzero-floor dataframe via the cause map.'''
        # merge acause column
        nzf_df = pd.merge(nzf_df,
                          cmdf[['acause', 'cause_id']],
                          on='cause_id',
                          how='left')
        return nzf_df

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.
        """

        orig_cols = list(self.df.columns)
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]
        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)
        self.diag_df = self.df
        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))
        self.df = self.df[orig_cols + ['rate', 'floor']]
        self.df = self.df.append(age_aggs)
        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        data_min_val = self.df.loc[self.df['cf_final'] > 0, 'cf_final'].min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def convert_nonzero_mad(self, df, cmdf):
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5,
            5: 6,
            10: 7,
            15: 8,
            20: 9,
            25: 10,
            30: 11,
            35: 12,
            40: 13,
            45: 14,
            50: 15,
            55: 16,
            60: 17,
            65: 18,
            70: 19,
            75: 20,
            80: 30,
            85: 31,
            90: 32,
            95: 235,
            91: 2,
            93: 3,
            94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)

        return df

    def compile_nonzero_floor(self, cmdf):
        '''
        For GBD2019, new floor values were generated for cancer causes that had
        updated age restrictions or were newly modeled causes. This function takes
        the original nonzero floor values and appends all updated values.
        '''
        work_dir = utils.get_path(process='cod_mortality',
                                  key='nonzero_floor_workspace')
        orig_nzf = pd.read_csv(
            utils.get_path(process='cod_mortality', key='orig_nonzero_file'))

        # convert age_group_ids to comply with GBD's
        formatted_orig_nzf = self.convert_nonzero_mad(orig_nzf, cmdf)

        # load nonzero floor values with new age restrictions, and that were new causes
        # for this GBD cycle
        new_age_rstrct_df = pd.read_csv(
            '{}/nonzero_floor_new_age_restrictions.csv'.format(work_dir))
        new_causes_df = pd.read_csv(
            '{}/nonzero_new_causes.csv'.format(work_dir))

        # append all nonzero values together
        comp_nzf = formatted_orig_nzf.append(new_age_rstrct_df)
        comp_nzf = comp_nzf.append(new_causes_df)

        return comp_nzf

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])

    def replace_cf(self, check_cf_col):
        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def fill_na_floors(self, df):
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        # load nonzero floor values
        nonzero_mad = self.compile_nonzero_floor(cmdf)
        nonzero_mad = self.format_nzf(nonzero_mad, cmdf)
        self._check_all_floors_exist(
            nonzero_mad)  # checks that all age_groups/cancer/year/sex exist
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.min_possible_val = nonzero_mad['floor'].min()
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
        # ensure no floor values are missing
        assert not self.df.floor.isnull().any(), "null floor values exist"
        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics."""
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()
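
A hedged sketch of driving this NonZeroFloorer end to end; cf_df, pop_df, env_df, and cause_meta_df are placeholders for the dataframes produced by the downloaders used elsewhere in this section:

floorer = NonZeroFloorer(cf_df)  # cf_df needs cf_final plus year/sex/age/cause id columns
df = floorer.get_computed_dataframe(pop_df, env_df, cause_meta_df)
diag_df = floorer.get_diagnostic_dataframe()  # empty dataframe if requested too early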