get_country_level_location_id, get_cause_map, add_location_metadata, get_current_location_hierarchy ) from cod_prep.utils import ( print_log_message, report_duplicates, report_if_merge_fail, cod_timestamp ) from cod_prep.claude.claude_io import get_claude_data, makedirs_safely from cod_prep.claude.configurator import Configurator from save_proportions_for_tableau import SharedPackage CONF = Configurator() MODEL_DATA_CODE_SYSTEMS = [1, 6] RDP_REG_DIR = CONF.get_directory('rdp_regressions') def get_package_code_ids(regression_specification, code_system_id): """Returns code_ids for garbage codes in package for given code system""" package_description = regression_specification[ 'package_descriptions' ][code_system_id] packages = get_package_list(code_system_id) package_id = packages.loc[ packages['package_description'] == package_description, 'package_id'
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id, iso3):
    """Run the full pipeline, chaining together CodProcesses.

    Loads the metadata needed by the individual steps (cause and location
    hierarchies, envelopes with and without HIV, age groups, cause and
    package maps, and the outputs of the "disaggregation" and
    "misdiagnosiscorrection" phases), then passes ``df`` through each step
    in a fixed order. Several steps only run for particular data types or
    sources.

    Arguments:
        df (dataframe): the data to process; every step takes the current
            dataframe and returns the next one.
        cause_set_version_id: passed to get_current_cause_hierarchy.
        location_set_version_id: passed to get_current_location_hierarchy.
        data_type_id: selects which optional steps run (compared against 7,
            VA_DATA_TYPE, POLICE_DATA_TYPE and POLICE_SURVEY_DATA_TYPE).
        env_run_id: passed to get_env for both envelope pulls.
        source: name of the data source; switches on the source-specific
            steps ("Cancer_Registry", 'Various_RTI', MATERNAL_SQUARED).
        nid, extract_type_id: identify the dataset; used to look up the code
            system and the upstream phase outputs.
        remove_decimal: forwarded to ParentMappedAggregatedGarbageAdder and
            RedistributionVarianceEstimator.
        code_map_version_id: passed to get_cause_map and to
            RedistributionVarianceEstimator.
        iso3: country code; 'IND' together with data_type_id 7 triggers the
            'full' location aggregation.

    Returns:
        df (dataframe): the dataframe after all applicable steps have run.
    """
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    # shared keyword arguments for every cached metadata getter below
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **cache_options)

    # get location hierarchy
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **cache_options)

    # get envelope
    env_meta_df = get_env(env_run_id=env_run_id, **cache_options)

    # get env with HIV
    env_hiv_meta_df = get_env(env_run_id=env_run_id, with_hiv=True,
                              **cache_options)

    # get age groups
    age_meta_df = get_ages(**cache_options)

    # the code system is looked up from the nid / extract type rather than
    # passed in by the caller
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))

    cause_map = get_cause_map(code_map_version_id=code_map_version_id,
                              **cache_options)

    package_map = get_package_map(code_system_id=code_system_id,
                                  **cache_options)

    # outputs of earlier phases, needed by the parent-mapped garbage step
    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output("misdiagnosiscorrection", nid,
                                extract_type_id)

    cause_package_hierarchy = get_cause_package_hierarchy(
        code_system_id, **cache_options)

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # aggregate location
    # defaults to simple location -> national aggregation
    # running full aggregation for India Survey data
    print_log_message("Aggregating location to country level")
    location_aggregator = LocationAggregator(df, location_meta_df)
    if (data_type_id == 7) & (iso3 == 'IND'):
        df = location_aggregator.get_computed_dataframe('full')
    else:
        df = location_aggregator.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        maternal_hiv_remover = MaternalHIVRemover(df, env_meta_df,
                                                  env_hiv_meta_df, source,
                                                  nid)
        df = maternal_hiv_remover.get_computed_dataframe()

    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    print_log_message(log_statistic(df))

    print_log_message("Converting to cause fractions")
    # rows without a positive sample size are dropped before the conversion
    df = df.loc[df['sample_size'] > 0]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    print_log_message(log_statistic(df))

    if data_type_id == VA_DATA_TYPE:
        # run VA anemia adjustment
        print_log_message("Running VA Anemia adjustment")
        va_anemia_adjuster = AnemiaAdjuster()
        df = va_anemia_adjuster.get_computed_dataframe(df)

    if data_type_id == POLICE_DATA_TYPE:
        if source == 'Various_RTI':
            rti_adjuster = RTIAdjuster(df, cause_meta_df, age_meta_df,
                                       location_meta_df)
            df = rti_adjuster.get_computed_dataframe()

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # issue: rows with > 0 sample size are dropped
        # most common in maternal data, but relevant anywhere
        # we have only cc_code and one other cause and there
        # are 0 deaths for the other cause for a given age/sex
        cause_list = df.cause_id.unique()
        # square when the data hold exactly cc_code plus one other cause
        square_me = (len(cause_list) == 2) & (CC_CODE in cause_list)
        if (source in MATERNAL_SQUARED) or square_me:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    print_log_message(log_statistic(df))

    print_log_message("Splitting locations.")
    env_loc_splitter = EnvelopeLocationSplitter(df, env_meta_df, source)
    df = env_loc_splitter.get_computed_dataframe()
    print_log_message(log_statistic(df))

    # aggregate causes
    print_log_message("Aggregating causes")
    cause_aggregator = CauseAggregator(df, cause_meta_df, source)
    df = cause_aggregator.get_computed_dataframe()
    print_log_message(log_statistic(df))

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    parent_gbg_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy,
        cause_meta_df, package_map, cause_map, remove_decimal, disagg_df,
        misdc_df)
    df = parent_gbg_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    hmp = HIVMatPAFs()
    df = hmp.get_computed_dataframe(df, cause_meta_df, location_meta_df)
    print_log_message(log_statistic(df))

    # TO DO
    # ** In the recode step for BTL some cancer deaths were moved to the
    # cancer parent. The squaring step created 0's. Get rid of the 0's in
    # country-years the recode was previously applied to.

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    hiv_shock_remover = SampleSizeCauseRemover(cause_meta_df)
    df = hiv_shock_remover.get_computed_dataframe(df)
    print_log_message(log_statistic(df))

    # NOTE(review): the rationale for this step is not documented here
    # (original comment: "not sure why we do this, but could use a comment
    # of some kind"); judging by the helper name it presumably snaps cause
    # fractions that are approximately one to exactly one - TODO confirm.
    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    if dataset_has_redistribution_variance(data_type_id, source):
        # Determine the redistribution variance
        rdvar = RedistributionVarianceEstimator(
            nid, extract_type_id, cause_meta_df, remove_decimal,
            code_system_id, cause_map, package_map,
            code_map_version_id=code_map_version_id)
        df = rdvar.get_computed_dataframe(df, **cache_options)

    return df
class MCoDMapper():
    """Map ICD codes to code_ids, cause_ids.

    Constructor arguments:
        int_cause (str): the intermediate cause of interest; must be one of
            the keys of ``int_cause_name_dict``
        code_system_id (int): the ICD category, determines which map to use
            (1 is handled as ICD10 and 6 as ICD9 in this class)
        code_map_version_id (int): the version of the map to use
        drop_p2 (bool): when falsy, ``get_computed_dataframe`` also adds the
            part II flag column via ``set_part2_flag``

    ``get_computed_dataframe(df)`` takes a dataframe of formatted mcod data
    and returns it with the underlying cause mapped to code_id and cause_id
    and the causes in the chain flagged for containing the intermediate
    cause of interest.
    """

    cache_options = {'force_rerun': False, 'block_rerun': True}
    conf = Configurator()
    inj_causes = ['x59', 'y34']
    # intermediate cause -> package descriptions that identify it in the map
    int_cause_name_dict = {
        'x59': ['unspecified external factor x59'],
        'y34': ['external causes udi,type unspecified-y34']
    }
    possible_int_causes = list(int_cause_name_dict.keys())

    def __init__(self, int_cause, code_system_id, code_map_version_id, drop_p2):
        self.int_cause = int_cause
        self.code_system_id = code_system_id
        self.code_map_version_id = code_map_version_id
        self.drop_p2 = drop_p2
        assert self.int_cause in self.possible_int_causes, \
            f"{self.int_cause} is not a valid intermediate cause"
        # always hold the package descriptions as a list so .isin() works
        self.full_cause_name = self.int_cause_name_dict[self.int_cause]
        if not isinstance(self.full_cause_name, list):
            self.full_cause_name = [self.full_cause_name]

    @staticmethod
    def get_code_columns(df):
        """Get a list of raw cause columns with ICD codes as values.

        Returns every column whose name contains "multiple_cause" but not
        "pII", followed by the underlying 'cause' column.
        """
        col_names = list(df.columns)
        code_cols = [
            x for x in col_names if "multiple_cause" in x and "pII" not in x
        ] + ['cause']
        return code_cols

    @staticmethod
    def _get_cause_num(mcod_col):
        """Get sort order for cause columns.

        Assumes you have an underlying cause (cause_x) column and chain columns
        (multiple_cause_x) and that the value to sort off of is after the
        second underscore.

        The key is returned as a string, so ordering is lexicographic; that
        is fine for its use in pairing raw and mapped columns because both
        lists are sorted with this same key.
        """
        if mcod_col.startswith('cause'):
            return '0'
        else:
            assert re.match(r"^multiple_cause_[a-z]*[0-9]*", mcod_col), \
                f"column {mcod_col} does not match expected format: multiple_cause_x"
            return mcod_col.split('_')[2]

    @staticmethod
    def prep_raw_mapped_cause_dictionary(raw_cols, mapped_cols):
        """Create dictionary of raw cause columns to mapped cause columns.

        Ensures that "multiple_cause_2_mapped" is the value associated with
        "multiple_cause_2" key, e.g.
        """
        raw_cols = sorted(raw_cols, key=MCoDMapper._get_cause_num)
        mapped_cols = sorted(mapped_cols, key=MCoDMapper._get_cause_num)
        return dict(zip(raw_cols, mapped_cols))

    @staticmethod
    def fix_icd_codes(df, codes, code_system_id):
        """Adjustment to icd9/10 cause codes.

        Arguments:
            df (dataframe): data with a 'cause' column and the chain columns
            codes (list): names of all cause columns (underlying and chain)
            code_system_id (int): 6 for the ICD9 rule, 1 for the ICD10 rules

        Returns:
            df (dataframe): the adjusted data; for ICD10, rows with an S or
            T code as the underlying cause are dropped.

        Missing codes never count as a match (``na=False``); without it the
        boolean masks would contain NaN for null codes and pandas refuses to
        index with such a mask.
        """
        if code_system_id == 6:
            # according to Mohsen, codes between 800 to 900 need an E if underlying
            # assume 800, 900 codes are N codes if in the chain, don't add any prefix
            needs_e_prefix = df['cause'].str.contains('^[89]', na=False)
            df.loc[needs_e_prefix, 'cause'] = 'E' + df['cause']
        elif code_system_id == 1:
            # S + T codes are always intermediate causes of death
            # V + Y codes are always the underlying cause of death
            violations = df['cause'].str.contains('^[ST]', na=False)
            num_violations = int(violations.sum())
            if num_violations > 0:
                print_log_message(
                    f"Found S or T code as underlying cause, dropping {num_violations} rows"
                )
            # refuse to silently drop more than ~10% of the rows
            assert np.isclose(len(df[~violations]), len(df), rtol=.10)
            df = df.loc[~violations]

            # next check violations in chain causes
            # V and Y codes can only be UCOD
            for col in codes:
                if col != 'cause':
                    violations = df[col].str.contains('^[VY]', na=False)
                    num_violations = int(violations.sum())
                    if num_violations > 0:
                        print_log_message(
                            f"Setting {num_violations} rows with V/Y in chain to 0000 for {col}"
                        )
                        df.loc[violations, col] = '0000'
        return df

    @staticmethod
    def prep_cause_package_map(cause_package_map):
        """Expects cause-package map.

        Set dictionary of value: map_id since we only care about
        the package name or the cause_id, not the individual ICD code level code.
        """
        # a map_id must not appear with more than one map_type
        check_map = cause_package_map[['map_id', 'map_type']].drop_duplicates()
        report_duplicates(check_map, 'map_id')
        cause_package_map = cause_package_map.set_index(
            'value')['map_id'].to_dict()
        return cause_package_map

    @staticmethod
    def prep_cause_map(cause_map):
        """Clean up cause map and return it as a {value: code_id} dictionary.

        Works on a copy so the caller's dataframe is left untouched.
        """
        cause_map = cause_map.copy()
        cause_map['value'] = clean_icd_codes(cause_map['value'],
                                             remove_decimal=True)
        # duplicates are a result of weird _gc, the duplicates dropped all
        # have the higher sort_order (999999)
        cause_map = cause_map.drop_duplicates(['code_system_id', 'value'])
        cause_map['code_id'] = cause_map['code_id'].astype(int)
        cause_map = cause_map.set_index('value')['code_id'].to_dict()
        return cause_map

    @staticmethod
    def map_cause_codes(df, coi_map, coi, cols_to_map=None):
        """Map cause codes to any given value (e.g. acause, category, etc.).

        Inputs
        df (pd dataframe): incoming, unmapped data with ICD codes
        coi_map (dict or other ``Series.map`` argument): special map designed
            just for one cause of interest
        coi (string): cause of interest, used as the suffix of the new columns
        cols_to_map (list): columns to map; defaults to the result of
            ``get_code_columns(df)``
        Returns
        df (pd dataframe): copy of the data with null codes replaced by
            '0000' and an additional "<col>_<coi>" column for each mapped
            column
        """
        df = df.copy()
        if not cols_to_map:
            cols_to_map = MCoDMapper.get_code_columns(df)
        # map chain causes using cause of interest map
        for col in cols_to_map:
            df[col] = df[col].fillna('0000')
            df[col] = df[col].astype(object)
            df[col + '_' + coi] = df[col].map(coi_map)
        return df

    @staticmethod
    def trim_and_remap(df, code_dict, cause_map, code_system_id):
        """Trim unmapped ICD codes to 5, then 4, then 3 characters, remapping each time.

        Arguments:
            df (dataframe): data holding the raw and mapped columns
            code_dict (dict): raw column name -> mapped column name
            cause_map (dict): code -> mapped value
            code_system_id (int): accepted for interface compatibility; not
                used by this method

        Returns:
            df (dataframe): copy of the data with null mappings filled in
            where a trimmed code is present in ``cause_map``.
        """
        df = df.copy()
        # before trimming, map "null" chain causes to '0000'
        for code, mapped_code in list(code_dict.items()):
            df.loc[df[code] == '0000', mapped_code] = '0000'

        # trim and re map null mappings
        for n in reversed(range(3, 6)):
            for code, mapped_code in list(code_dict.items()):
                temp_code = 'temp_' + code
                df[temp_code] = df[code].copy()
                try:
                    df.loc[df[mapped_code].isnull(), temp_code] = \
                        df[temp_code].apply(lambda x: x[0:n])
                except TypeError:
                    # was getting a type error for some unicode issues?
                    # NOTE(review): for chain columns this overwrites the
                    # whole mapped column, not only the unmapped rows -
                    # confirm that is intended.
                    if mapped_code != 'cause_mapped':
                        df[mapped_code] = '0000'
                    else:
                        print("problem code here..." + df[code])
                df.loc[df[mapped_code].isnull(), mapped_code] = \
                    df[temp_code].map(cause_map)
                df = df.drop(temp_code, axis=1)
        return df

    def prep_int_cause_map(self):
        """Build the {icd_code: package_description} map for this intermediate cause.

        Reads ``mcause_map.xlsx`` from the 'process_inputs' directory, keeps
        the rows for this intermediate cause (plus rows whose package
        description contains 'nn') and subsets to the code system being run.
        """
        map_dir = self.conf.get_directory('process_inputs')
        code_system_name = {1: 'icd10', 6: 'icd9'}[self.code_system_id]
        df = pd.read_excel(f"{map_dir}/mcause_map.xlsx",
                           dtype={'icd_code': object})
        df = df[['icd_code', 'package_description',
                 'code_system']].drop_duplicates()

        # cleanup strings and things
        df['icd_code'] = clean_icd_codes(df['icd_code'], remove_decimal=True)
        df[['package_description', 'code_system']] = \
            df[['package_description', 'code_system']].apply(
                lambda x: x.str.lower())

        # only keep the rows we need for this intermediate cause
        # keep n-code rows for injuries
        df = df.loc[(df['package_description'].isin(self.full_cause_name)) | (
            df['package_description'].str.contains('nn'))].drop_duplicates()

        # intermediate causes should be mutually exclusive
        report_duplicates(df, ['icd_code', 'code_system'])

        # subset to just the code system being run through
        df = df.query(f'code_system == "{code_system_name}"')
        assert len(df) > 0, \
            f"There are no mappings for {code_system_name}, {self.full_cause_name}"

        # convert to a dictionary
        mcod_map = dict(zip(df['icd_code'], df['package_description']))
        return mcod_map

    def capture_int_cause(self, df, int_cause_cols):
        """Flag deaths related to the intermediate cause.

        Adds a column named after the intermediate cause that is 1 when any
        of ``int_cause_cols`` holds one of the cause's package descriptions
        and 0 otherwise. Nulls in ``int_cause_cols`` are filled with "other".
        """
        df[self.int_cause] = None
        for col in int_cause_cols:
            df[col] = df[col].fillna("other")
            df.loc[df[col].isin(self.full_cause_name), self.int_cause] = 1
        df[self.int_cause] = df[self.int_cause].fillna(0)
        assert df[self.int_cause].notnull().values.all()
        return df

    def set_part2_flag(self, df):
        """Mark whether or not the cause of interest is from part 2 of the death certificate.

        NOTE(review): the part II columns and the mapped chain columns are
        paired purely by their order in ``df.columns`` - confirm the two
        sets always line up.
        """
        p2_cols = [x for x in df.columns if 'pII' in x]
        int_cause_chains = [
            x for x in df.columns
            if (self.int_cause in x) and ('multiple' in x)
        ]
        p2_chain_dict = dict(zip(p2_cols, int_cause_chains))
        df['pII_' + self.int_cause] = 0
        for p2_col, chain in sorted(p2_chain_dict.items()):
            df.loc[(df[chain].isin(self.full_cause_name)) &
                   (df[p2_col] == 1), 'pII_' + self.int_cause] = 1
        return df

    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df, 'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df
def __init__(self, df, loc_meta_df, cause_meta_df):
    """Store the input data and metadata and record the starting death total.

    Arguments:
        df (dataframe): data with a 'deaths' column
        loc_meta_df: location metadata, stored as-is
        cause_meta_df: cause metadata, stored as-is
    """
    self.df = df
    # total deaths on the way in, kept as a reference value
    deaths_on_entry = df['deaths'].sum()
    self.start_deaths = deaths_on_entry
    self.loc_meta_df = loc_meta_df
    self.cause_meta_df = cause_meta_df
    self.conf = Configurator("standard")
def __init__(self, df, cause_meta_df):
    """Store the input data and cause metadata and record the starting death total.

    Arguments:
        df (dataframe): data with a ``deaths`` column
        cause_meta_df: cause metadata, stored as-is
    """
    self.df = df
    # total deaths on the way in, kept as a reference value
    deaths_on_entry = df.deaths.sum()
    self.start_deaths = deaths_on_entry
    self.conf = Configurator('standard')
    self.cause_meta_df = cause_meta_df
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000

    Cause fractions that are positive but imply a death rate below the
    cause/age/sex/year floor are replaced with the cause fraction implied by
    the floor (floor * population / mean_env).
    """

    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        # draws are only floored when the data actually carry draw columns
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        # initialize this to something crazy small, then adjust later when
        # nonzero floor file is read in
        self.min_possible_val = 1e-50

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.

        Make death rates and calculate the cf as if the rate were 2 MADs
        below the "global" median. Every cause in the floor file is checked
        to ensure non-zero values in any non-restricted age-sex. So, just
        check and make sure there is something there for the cause, filling
        in zeroes where missing if the cause is present in the floor file
        (will break if there is a cause not present)

        Arguments:
            pop_df: population data, used if 'population' is not already a
                column of the data
            env_df: envelope data, used if 'mean_env' is not already a
                column of the data
            cause_hierarchy (dataframe): must contain 'acause' and 'cause_id'

        Returns:
            dataframe with the original columns; rows for age groups 22 and
            27 are passed through untouched.

        Raises:
            AssertionError: if any cause fraction is null after the
                replacement, or if the smallest positive cause fraction is
                below the smallest replacement value.
        """
        orig_cols = list(self.df.columns)
        # aggregate age groups are set aside and re-attached unchanged
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]

        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)

        # keep the working columns (floor, rate, ...) for diagnostics
        self.diag_df = self.df

        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))

        self.df = self.df[orig_cols]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        self.df = pd.concat([self.df, age_aggs])

        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        # (only the cause fraction columns are compared with zero, so
        # non-numeric columns cannot break the comparison)
        cf_values = self.df[self.cf_cols]
        data_min_val = cf_values[cf_values > 0].min().min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        # there are so many checks before this that it would be very surprising
        # if this line does anything, but its another round of safety to make
        # sure that cause fractions arent being replaced with null
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])
        # smallest value any floored cause fraction can take
        self.min_possible_val = self.df.cf_replace.min()

    def replace_cf(self, check_cf_col):
        """Apply the floor to one cause fraction column in place."""
        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        # a positive cause fraction must always have some floor available
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        """Add 'population' and 'mean_env' columns if not already present."""
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def convert_nonzero_mad(self, df, cmdf):
        """Convert the raw floor file to id columns and fill known gaps.

        Arguments:
            df (dataframe): floor file with 'acause', 'year', 'sex', 'age'
                and 'floor' columns
            cmdf (dataframe): cause metadata with 'acause' and 'cause_id'

        Returns:
            dataframe of non-null floors keyed by year_id, cause_id,
            age_group_id and sex_id.
        """
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5, 5: 6, 10: 7, 15: 8, 20: 9, 25: 10, 30: 11, 35: 12,
            40: 13, 45: 14, 50: 15, 55: 16, 60: 17, 65: 18, 70: 19,
            75: 20, 80: 30, 85: 31, 90: 32, 95: 235, 91: 2, 93: 3, 94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)

        # make sure 2017-2018 are still missing
        missing_years = [2017, 2018]
        assert df.loc[df['year_id'].isin(
            missing_years)].floor.isnull().values.all()
        df = df.loc[~df['year_id'].isin(missing_years)]

        # We have determined that the floor is missing values for:
        # (1) certain cause/age/sexes in 2016 - we will use the 2015 floor to fill in these values
        # (2) certain cause/age/sexes across the entire time series - really nothing we
        # can do short of resetting the floor
        merge_cols = ['cause_id', 'age_group_id', 'sex_id']
        report_duplicates(df, merge_cols + ['year_id'])
        new_floor = pd.merge(df.loc[df.year_id == 2016].copy(),
                             df.loc[df.year_id == 2015].copy(),
                             how='outer',
                             on=merge_cols,
                             suffixes=('', '_2015'))
        new_floor = new_floor.fillna({'floor': new_floor['floor_2015']})\
            .loc[:, merge_cols + ['year_id', 'floor']]
        df = pd.concat([df.loc[df.year_id != 2016], new_floor],
                       ignore_index=True, sort=True)

        # If anything else is still missing, make sure it's missing for the entire time
        # series - otherwise we should write something more sophisticated to fill it in
        assert df.assign(floor_null=df.floor.isnull())\
            .groupby(merge_cols + ['floor_null'])['year_id'].apply(
                lambda x: set(x) == set(range(1980, 2017))).all()

        # copy 2016 to 2017, 2018
        for year in missing_years:
            copied_year = df.loc[df.year_id == 2016].copy().assign(
                year_id=year)
            df = pd.concat([df, copied_year], ignore_index=True)

        # Due to age restriction changes since last round, we now have data in cause/age
        # groups where we had no floor in GBD 2017
        # Add in a nonzero floor created based on GBD 2019 data for these cause/age groups
        new_cause_ages = pd.read_csv(
            self.conf.get_resource('nonzero_floor_new_age_restrictions'))\
            .drop('borrow_age_group_id', axis='columns')
        assert new_cause_ages.notnull().values.all()
        df = pd.concat([df, new_cause_ages], sort=True)

        # no duplicates
        df = df.loc[df.floor.notnull()]
        report_duplicates(df,
                          ['year_id', 'cause_id', 'age_group_id', 'sex_id'])
        return df

    def fill_na_floors(self, df):
        """Fill null floors in one group with the median of its non-null floors."""
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe."""
        nonzero_mad = pd.read_csv(self.conf.get_resource("nonzero_floor_mad"))
        nonzero_mad = self.convert_nonzero_mad(nonzero_mad, cmdf)
        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)

        # first fallback: median floor within year/sex/cause
        if self.df.floor.isnull().any():
            self.df = self.df.groupby(['year_id', 'sex_id', 'cause_id']).apply(
                self.fill_na_floors)
        # second fallback: median floor over the whole dataframe
        if self.df.floor.isnull().any():
            trouble_causes = self.df[self.df.floor.isnull()].cause_id.unique()
            filler = np.median(self.df[~self.df.floor.isnull()].floor)
            print_log_message("using nonzero filler because"
                              " of these causes: {}".format(trouble_causes))
            self.df.floor = self.df.floor.fillna(filler)

        # NOTE(review): cause_id 975 is given the smallest possible floor;
        # the reason is not stated in this file - confirm.
        self.df.loc[self.df.cause_id == 975, 'floor'] = 1e-50
        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics.

        Returns an empty dataframe if ``get_computed_dataframe`` has not
        been run yet.
        """
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()
class NonZeroFloorer(CodProcess):
    """APPLY NON-ZERO FLOOR OF 1 DEATH PER 10,000,000

    Cause fractions that are positive but imply a death rate below the
    cause/age/sex/year floor are replaced with the cause fraction implied by
    the floor (floor * population / mean_env).
    """

    conf = Configurator('standard')
    draws = range(0, conf.get_resource('uncertainty_draws'))
    cf_draw_cols = ['cf_draw_{}'.format(draw) for draw in draws]

    def __init__(self, df):
        self.df = df
        self.merge_cols = ['year_id', 'sex_id', 'age_group_id', 'cause_id']
        self.cf_col = 'cf_final'
        # draws are only floored when the data actually carry draw columns
        if 'cf_draw_0' in self.df:
            self.cf_cols = [self.cf_col] + self.cf_draw_cols
        else:
            self.cf_cols = [self.cf_col]
        # placeholder; replaced with the smallest floor once floors are loaded
        self.min_possible_val = 1e-50

    def _check_all_floors_exist(self, nzf_df):
        """Print the causes, ages, sexes and years that lack a nonzero floor.

        Check that all expected cancers, ages, and years, are present and
        have nonzero floor values. This is a reporting step only: findings
        are printed and nothing is returned or raised for missing entries.

        Arguments:
            nzf_df (dataframe): floors with 'acause', 'age_group_id',
                'sex_id' and 'year_id' columns
        """

        def _remove_ages_less_than(a, b):
            """Remove from list ``a``, in place, the age group ids below ``b``.

            When ``b`` is 5 the ids 2, 3 and 4 are kept. The original
            condition ``b == 5 & val in [2, 3, 4]`` parsed as the chained
            comparison ``b == (5 & val) in [2, 3, 4]``, which is never true,
            so that exemption never took effect.
            """
            keep_youngest_ids = (b == 5)
            for val in a.copy():
                if keep_youngest_ids and val in (2, 3, 4):
                    continue
                if val < b:
                    a.remove(val)
            return a

        print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")

        # create cause_list
        db_link = cdb.db_api(db_connection_name='cancer_db')
        gbd_id = utils.get_gbd_parameter('current_gbd_round')
        registry_entity = db_link.get_table('registry_input_entity')
        registry_entity = registry_entity.loc[
            registry_entity['gbd_round_id'].eq(gbd_id) &
            registry_entity['is_active'].eq(1), ]
        cancer_metadata = registry_entity[[
            'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
        ]]

        # exceptions for nonzero floors; filtered rather than list.remove()d
        # so that an exception cause absent from the registry is not an error
        floor_exceptions = (
            'neo_nmsc_bcc', 'neo_ben_intest', 'neo_ben_utr',
            'neo_ben_other', 'neo_ben_brain', '_gc',
        )
        causes_checklist = [
            acause for acause in registry_entity['acause'].unique().tolist()
            if acause not in floor_exceptions
        ]

        # create year_list
        year_start = utils.get_gbd_parameter('min_year_cod')
        year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
        year_checklist = list(range(year_start, year_end))

        # sex & age_id checklist
        age_id_checklist = [
            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
            32, 235, 2, 3, 4
        ]  # age_ids for 0-95 ages
        sex_checklist = [1, 2]

        # print any causes/years/sexes that are expected and missing
        for cancer in causes_checklist:
            print('working on...{}'.format(cancer))
            subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
            age_start = int(
                cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                    'yll_age_start'])
            # conversion from age to GBD age_group_id
            age_start = (age_start / 5) + 5
            if len(subset) == 0:
                print('MISSING CAUSE... {} '.format(cancer))

            missing_ages = set(age_id_checklist) - set(
                subset['age_group_id'].unique().tolist())
            missing_ages = list(missing_ages)
            # ages below the cause's starting age are not expected
            missing_ages = _remove_ages_less_than(missing_ages, age_start)
            if len(missing_ages) > 0:
                print('missing the following ages for {}: {}'.format(
                    cancer, missing_ages))

            missing_sexes = set(sex_checklist) - set(
                subset['sex_id'].unique().tolist())
            if len(missing_sexes) > 0:
                print('missing the following sexes for {}: {}'.format(
                    cancer, missing_sexes))

            missing_years = set(year_checklist) - set(
                subset['year_id'].unique().tolist())
            if len(missing_years) > 0:
                print('missing the following years for {}: {}'.format(
                    cancer, missing_years))
        return

    def format_nzf(self, nzf_df, cmdf):
        """Left-merge the 'acause' column onto the floors by 'cause_id'."""
        # merge acause column
        nzf_df = pd.merge(nzf_df,
                          cmdf[['acause', 'cause_id']],
                          on='cause_id',
                          how='left')
        return nzf_df

    def get_computed_dataframe(self, pop_df, env_df, cause_hierarchy):
        """Calculate mortality rates and replace cause fractions, as needed.

        Arguments:
            pop_df: population data, used if 'population' is not already a
                column of the data
            env_df: envelope data, used if 'mean_env' is not already a
                column of the data
            cause_hierarchy (dataframe): must contain 'acause' and 'cause_id'

        Returns:
            dataframe with the original columns plus 'rate' and 'floor';
            rows for age groups 22 and 27 are passed through untouched.

        Raises:
            AssertionError: if any cause fraction is null after the
                replacement, or if the smallest positive 'cf_final' is below
                the smallest floor value.
        """
        orig_cols = list(self.df.columns)
        # aggregate age groups are set aside and re-attached unchanged
        age_aggs = self.df[self.df.age_group_id.isin([22, 27])]
        self.df = self.df[~self.df.age_group_id.isin([22, 27])]

        self.merge_pop_env(pop_df, env_df)
        self.merge_nonzero_mad_info(cause_hierarchy)
        self.make_min_floor()
        self.make_replace_cf()
        for col in self.cf_cols:
            self.replace_cf(col)

        # keep the working columns for diagnostics
        self.diag_df = self.df

        null_cfs = self.df.loc[self.df[self.cf_cols].isnull().any(axis=1)]
        if len(null_cfs) > 0:
            raise AssertionError(
                "Found null rates in the data: \n{}".format(null_cfs))

        self.df = self.df[orig_cols + ['rate', 'floor']]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        self.df = pd.concat([self.df, age_aggs])

        # find lowest non-zero value that is in the dataframe and check that
        # it is not lower than lowest non-zero floor value
        data_min_val = self.df.loc[self.df['cf_final'] > 0, 'cf_final'].min()
        assert data_min_val >= self.min_possible_val, \
            "Data min value [{}] was lower than non-zero floor min " \
            "value [{}]".format(data_min_val, self.min_possible_val)
        return self.df

    def convert_nonzero_mad(self, df, cmdf):
        """Convert the raw floor file's acause/year/sex/age columns to ids.

        Arguments:
            df (dataframe): floor file with 'acause', 'year', 'sex' and
                'age' columns
            cmdf (dataframe): cause metadata with 'acause' and 'cause_id'
        """
        # add cause_id
        cmdf = cmdf[['acause', 'cause_id']]
        df = df.merge(cmdf, how='left', on='acause')
        # add id to cols
        df = df.rename(columns={
            'year': 'year_id',
            'sex': 'sex_id',
            'age': 'age_group_id'
        })
        # convert age
        age_to_id_map = {
            1: 5, 5: 6, 10: 7, 15: 8, 20: 9, 25: 10, 30: 11, 35: 12,
            40: 13, 45: 14, 50: 15, 55: 16, 60: 17, 65: 18, 70: 19,
            75: 20, 80: 30, 85: 31, 90: 32, 95: 235, 91: 2, 93: 3, 94: 4
        }
        df['age_group_id'] = df['age_group_id'].map(age_to_id_map)
        df = df.drop('acause', axis=1)
        return df

    def compile_nonzero_floor(self, cmdf):
        """Combine the original floor values with the updated ones.

        For GBD2019, new floor values were generated for cancer causes that
        had updated age restrictions, or was a new modeled cause. This
        function takes the original nonzero floor values, and appends all
        updated values
        """
        work_dir = utils.get_path(process='cod_mortality',
                                  key='nonzero_floor_workspace')
        orig_nzf = pd.read_csv(
            utils.get_path(process='cod_mortality', key='orig_nonzero_file'))

        # convert age_group_ids to comply with GBD's
        formatted_orig_nzf = self.convert_nonzero_mad(orig_nzf, cmdf)

        # load nonzero floor values with new age restrictions, and that were new causes
        # for this GBD cycle
        new_age_rstrct_df = pd.read_csv(
            '{}/nonzero_floor_new_age_restrictions.csv'.format(work_dir))
        new_causes_df = pd.read_csv(
            '{}/nonzero_new_causes.csv'.format(work_dir))

        # append all nonzero values together
        comp_nzf = pd.concat(
            [formatted_orig_nzf, new_age_rstrct_df, new_causes_df])
        return comp_nzf

    def make_replace_cf(self):
        """Replace cause fractions based on mortality rates.

        If the rate is over 0 and less than the floor, then the cause
        fractions are replaced with floor * pop / mean_env
        """
        self.df.loc[self.df['floor'].isnull(), 'floor'] = self.df['min_floor']
        # last-resort guard so the replacement value is never null
        self.df.loc[self.df['floor'].isnull(), 'floor'] = 0
        self.df['cf_replace'] = ((self.df['floor'] * self.df['population']) /
                                 self.df['mean_env'])

    def replace_cf(self, check_cf_col):
        """Apply the floor to one cause fraction column in place."""
        # Replace the CF with the rate-adjusted CF if the
        # rate is less than the floor and greater than zero
        self.df['rate'] = ((self.df[check_cf_col] * self.df['mean_env']) /
                           self.df['population'])
        cf_over_0 = self.df[check_cf_col] > 0
        rate_less_than_floor = self.df['rate'] < self.df['floor']
        self.df.loc[cf_over_0 & rate_less_than_floor,
                    check_cf_col] = self.df['cf_replace']

    def make_min_floor(self):
        """Set min floor to the minimum cf of any rows floor by cause."""
        self.df['min_floor'] = self.df.groupby(
            'cause_id', as_index=False)['floor'].transform('min')
        # a positive cause fraction must always have some floor available
        missing_floor = self.df['min_floor'].isnull()
        nonzero_cf = self.df[self.cf_col] > 0
        assert len(self.df[nonzero_cf & missing_floor]) == 0

    def merge_pop_env(self, pop_df, env_df):
        """Add 'population' and 'mean_env' columns if not already present."""
        if 'population' not in self.df.columns:
            self.df = add_population(self.df,
                                     add_cols=['population'],
                                     pop_df=pop_df)
        if 'mean_env' not in self.df.columns:
            self.df = add_envelope(self.df,
                                   add_cols=['mean_env'],
                                   env_df=env_df)

    def fill_na_floors(self, df):
        """Fill null floors in one group with the median of its non-null floors."""
        if df.floor.isnull().any():
            median = np.median(df[~df.floor.isnull()].floor)
            df.loc[df['floor'].isnull(), 'floor'] = median
        return df

    def merge_nonzero_mad_info(self, cmdf):
        """Read in the floor input and merge onto main dataframe.

        Raises:
            AssertionError: if any row is left without a floor after the
                merge.
        """
        # load nonzero floor values
        nonzero_mad = self.compile_nonzero_floor(cmdf)
        nonzero_mad = self.format_nzf(nonzero_mad, cmdf)
        # checks that all age_groups/cancer/year/sex exist
        self._check_all_floors_exist(nonzero_mad)

        nonzero_mad_cols = self.merge_cols + ['floor']
        nonzero_mad = nonzero_mad[nonzero_mad_cols]
        # smallest floor on file; used as the lower bound in the final check
        self.min_possible_val = nonzero_mad['floor'].min()
        self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)

        # ensure no floor values are missing
        assert not self.df.floor.isnull().any(), "null floor values exist"
        report_if_merge_fail(self.df, 'floor', self.merge_cols)

    def get_diagnostic_dataframe(self):
        """Return diagnostics.

        Returns an empty dataframe if ``get_computed_dataframe`` has not
        been run yet.
        """
        try:
            return self.diag_df
        except AttributeError:
            print("You requested the diag dataframe before it was ready,"
                  " returning an empty dataframe.")
            return pd.DataFrame()