def level_3_aggregate(self):
    """Aggregate level 4 causes to their level 3 parent.

    This custom cause aggregation is only used where we have one type
    of cause in a source and we only want to aggregate to their parent.
    Example: data with only road traffic injuries should only be
    aggregated to 'inj_trans_road', not all of 'inj_trans'.
    """
    df = add_cause_metadata(self.df, ['secret_cause', 'parent_id', 'level'],
                            merge_col='cause_id',
                            cause_meta_df=self.cause_meta_df)

    # quick check that there are no secret causes
    secret_causes = df.loc[df['secret_cause'] == 1]
    if len(secret_causes) > 0:
        raise AssertionError("The following secret causes are still "
                             "in the data: \n{}".format(
                                 secret_causes['cause_id'].unique()))

    for level in [5, 4, 3]:
        level_df = df[df['level'] == level].copy()
        if len(level_df) > 0:
            # replace the cause_id with the parent_id
            level_df['cause_id'] = level_df['parent_id']
            level_df['level'] = level_df['level'] - 1
            level_df.drop('parent_id', axis=1, inplace=True)
            # add parent_id back in for the newly changed cause_id
            # tried with mapping and this was faster
            level_df = add_cause_metadata(level_df, ['parent_id'],
                                          merge_col='cause_id',
                                          cause_meta_df=self.cause_meta_df)
            # add in deaths by each level
            df = pd.concat([level_df, df], ignore_index=True)
    return df
def simple_aggregate(self):
    """Aggregate causes up the hierarchy, one level at a time."""
    df = add_cause_metadata(self.df, ['secret_cause', 'parent_id', 'level'],
                            merge_col='cause_id',
                            cause_meta_df=self.cause_meta_df)

    # quick check that there are no secret causes
    secret_causes = df.loc[df['secret_cause'] == 1]
    if len(secret_causes) > 0:
        raise AssertionError("The following secret causes are still "
                             "in the data: \n{}".format(
                                 secret_causes['cause_id'].unique()))

    cause_levels = sorted(range(2, 6), reverse=True)
    # TODO: there's another bit in stata where we only aggregate causes
    # that belong to the same (source, source_label, subdiv, NID) groups;
    # maybe this is not needed now that we're going by NID?
    for level in cause_levels:
        level_df = df[df['level'] == level].copy()
        if len(level_df) > 0:
            # replace the cause_id with the parent_id
            level_df['cause_id'] = level_df['parent_id']
            level_df['level'] = level_df['level'] - 1
            level_df.drop('parent_id', axis=1, inplace=True)
            # add parent_id back in for the newly changed cause_id
            # tried with mapping and this was faster
            level_df = add_cause_metadata(level_df, ['parent_id'],
                                          merge_col='cause_id',
                                          cause_meta_df=self.cause_meta_df)
            # add in deaths by each level
            df = pd.concat([level_df, df], ignore_index=True)
    return df
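# A minimal, self-contained sketch (illustration only, not part of the
# pipeline) of the aggregation loop in simple_aggregate: each pass moves
# rows up one level by swapping cause_id for parent_id, then concatenates,
# so the same deaths appear at every level of the hierarchy. The
# `parent_of` dict and all IDs below are hypothetical stand-ins for the
# real cause metadata.
def _demo_simple_aggregate():
    import pandas as pd

    # hypothetical three-level hierarchy: cause 3 -> parent 2 -> parent 1
    parent_of = {3: 2, 2: 1}
    df = pd.DataFrame({'cause_id': [3], 'parent_id': [2],
                       'level': [3], 'deaths': [10.0]})
    for level in [3, 2]:
        level_df = df[df['level'] == level].copy()
        if len(level_df) > 0:
            level_df['cause_id'] = level_df['parent_id']
            level_df['level'] = level_df['level'] - 1
            # stand-in for the add_cause_metadata merge
            level_df['parent_id'] = level_df['cause_id'].map(parent_of)
            df = pd.concat([level_df, df], ignore_index=True)
    return df  # one row of 10 deaths at each of levels 3, 2, and 1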
def get_computed_dataframe(self, df):
    """Replace acauses with those in the bridge map."""
    # VA sources are the only ones where this may not work;
    # might need to split the dataframe by data_type_id for the bridge map
    df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
    has_verbal_autopsy = self.VA in df['data_type_id'].unique()
    df.drop(columns='data_type_id', inplace=True)

    if self.needs_bridging(has_verbal_autopsy):
        file_name = self.get_file_name(has_verbal_autopsy)
        map_df = pd.read_csv(self.bridge_map_path / file_name)
        map_df = map_df[['acause', 'bridge_code']]

        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df, ['acause'],
            merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow...
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

        # perform zz bridge code redistribution before other bridge mapping
        bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped,
                                                          map_df)
        bridge_mapped = bridge_mapped.merge(map_df, how='left', on='acause')
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)

        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped, ['cause_id'],
            merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')

        # output diagnostic dataframe
        self.diag_df = bridge_mapped
        # drop unnecessary columns
        bridge_mapped = self.clean_up(bridge_mapped)
        return bridge_mapped
    else:
        self.diag_df = df
        df = self.clean_up(df)
        return df
def get_computed_dataframe(self, df):
    """Replace acauses with those in the bridge map."""
    df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
    has_verbal_autopsy = self.VA in df['data_type_id'].unique()

    if self.needs_bridging(has_verbal_autopsy):
        sheet_name = self.get_sheet_name(has_verbal_autopsy)
        # the 'sheetname' keyword was removed from pandas; use 'sheet_name'
        map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
        map_df = map_df[['acause', 'bridge_code']]

        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df, ['acause'],
            merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow...
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

        bridge_mapped = bridge_mapped.merge(map_df, how='left', on='acause')
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)

        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped, ['cause_id'],
            merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')

        # output diagnostic dataframe
        self.diag_df = bridge_mapped
        # drop unnecessary columns
        bridge_mapped = self.clean_up(bridge_mapped)
        return bridge_mapped
    else:
        self.diag_df = df
        df = self.clean_up(df)
        return df
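# A minimal sketch (illustration only) of the bridge-map step in
# get_computed_dataframe above, under the assumption that
# acause_to_bridge_code replaces a row's acause with its bridge_code
# whenever one was merged on. The acauses and map contents below are
# hypothetical.
def _demo_bridge_merge():
    import pandas as pd

    deaths = pd.DataFrame({'acause': ['inj_war_execution', 'cvd_ihd'],
                           'deaths': [5.0, 7.0]})
    map_df = pd.DataFrame({'acause': ['inj_war_execution'],
                           'bridge_code': ['inj_war']})
    merged = deaths.merge(map_df, how='left', on='acause')
    has_bridge = merged['bridge_code'].notnull()
    merged.loc[has_bridge, 'acause'] = merged.loc[has_bridge, 'bridge_code']
    return merged.drop('bridge_code', axis=1)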
def get_rates_df(self, cause_meta_df):
    if self.correct_garbage:
        filepath_infix = 'PRE'
    else:
        filepath_infix = 'POST'
    rates = pd.read_stata("FILEPATH".format(fp=filepath_infix,
                                            iso=self.iso3))
    rates = add_cause_metadata(rates, ['cause_id'],
                               merge_col='acause',
                               cause_meta_df=cause_meta_df)
    rates.loc[rates['acause'] == "_sepsis_gc",
              'cause_id'] = self.sepsis_cause_id

    age_df = get_cod_ages()
    age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])]
    age_df['agecat'] = age_df['age_group_years_start']
    age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0
    age_df = age_df[['agecat', 'age_group_id']]

    # merge ages onto the rates data
    rates = rates.merge(age_df, on='agecat', how='left')
    assert not rates['age_group_id'].isnull().any()

    # clean up columns
    rates = rates.rename(columns={'sex': 'sex_id'})
    rates = rates.drop(['acause', 'agecat'], axis=1)
    return rates
def get_rates_df(self, cause_meta_df): """Write a nice description here.""" if self.correct_garbage: filepath_infix = 'PRE' else: filepath_infix = 'POST' rates_path = self.conf.get_resource( 'hivcorr_global_causespecific_relrates').format( pre_post=filepath_infix, iso=self.iso3) rates = pd.read_stata(rates_path) # convert acause to cause_id rates = add_cause_metadata(rates, ['cause_id'], merge_col='acause', cause_meta_df=cause_meta_df) rates.loc[rates['acause'] == "_sepsis_gc", 'cause_id'] = self.sepsis_cause_id # convert age to age_group_id # TODO THIS NEEDS TO BE CHANGED B/C RIGHT NOW IT QUERIES DB # change get_demographics to have caching option too? age_df = get_cod_ages() age_df = age_df.loc[~age_df['age_group_id'].isin([2, 3])] age_df['agecat'] = age_df['age_group_years_start'] age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0 age_df = age_df[['agecat', 'age_group_id']] # merge on ages to rates data rates = rates.merge(age_df, on='agecat', how='left') assert not rates['age_group_id'].isnull().any() # clean up columns rates = rates.rename(columns={'sex': 'sex_id'}) rates = rates.drop(['acause', 'agecat'], axis=1) return rates
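# A minimal sketch (illustration only) of the agecat -> age_group_id join
# in get_rates_df: the rates file keys age on the age-group start year
# ('agecat'), with the under-1 group collapsed to 0, so the same key is
# built from age metadata and merged on. The age rows below are
# hypothetical stand-ins for get_cod_ages().
def _demo_agecat_merge():
    import pandas as pd

    age_df = pd.DataFrame({'age_group_id': [4, 5, 6],
                           'age_group_years_start': [0.08, 1.0, 5.0]})
    age_df['agecat'] = age_df['age_group_years_start']
    age_df.loc[age_df['age_group_id'] == 4, 'agecat'] = 0  # under-1 -> 0
    rates = pd.DataFrame({'agecat': [0.0, 1.0, 5.0],
                          'rate': [0.1, 0.2, 0.3]})
    rates = rates.merge(age_df[['agecat', 'age_group_id']],
                        on='agecat', how='left')
    assert not rates['age_group_id'].isnull().any()
    return rates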
def assert_valid_mappings(self, df, code_system_id):
    """Test that the mapping worked.

    Runs a suite of assertions to make sure that mapping was successful.

    Args:
        df (DataFrame): with at least code_id and cause_id

    Returns:
        None

    Raises:
        AssertionError: if any check fails
    """
    # add code value from cached code map
    print("Adding value")
    df = add_code_metadata(df, ['value'], code_system_id,
                           force_rerun=False, block_rerun=True,
                           cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'value', 'code_id')

    # get acause from cached cause hierarchy
    print("Adding acause")
    df = add_cause_metadata(df, ['acause'],
                            cause_set_version_id=self.cause_set_version_id,
                            force_rerun=False, block_rerun=True,
                            cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'acause', 'cause_id')

    # Test that all causes starting with 'acause_' are mapped correctly.
    # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
    # 'acause__gc_X59' should be mapped to '_gc', etc.
    print("Checking implied acauses")
    check_df = df.loc[df['value'].str.startswith('acause_')].copy()
    check_df['implied_acause'] = \
        check_df['value'].str.replace('acause_', '', n=1)
    check_df.loc[check_df['value'].str.contains("acause__gc"),
                 'implied_acause'] = "_gc"
    bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
    if len(bad_df) > 0:
        bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
        raise AssertionError(
            "These code values do not match their acause: "
            "\n{}".format(bad_stuff))

    print("Checking for bad values")
    # assert incorrect acauses are gone
    bad_acauses = ['acause_digest_gastrititis',
                   'acause_hiv_tb', 'acause_tb_drug']
    bad_df = df.loc[df['value'].isin(bad_acauses)].value.unique()
    if len(bad_df) > 0:
        raise AssertionError(
            "Found these bad code values in the data: {}".format(bad_df))
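# A minimal sketch (illustration only) of the implied-acause check in
# assert_valid_mappings: a code value like 'acause_cvd' implies the acause
# 'cvd', while any 'acause__gc_*' value implies '_gc'. The rows below are
# hypothetical.
def _demo_implied_acause_check():
    import pandas as pd

    df = pd.DataFrame({'value': ['acause_cvd', 'acause__gc_X59'],
                       'acause': ['cvd', '_gc']})
    check_df = df.loc[df['value'].str.startswith('acause_')].copy()
    check_df['implied_acause'] = check_df['value'].str.replace(
        'acause_', '', n=1)
    check_df.loc[check_df['value'].str.contains('acause__gc'),
                 'implied_acause'] = '_gc'
    bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
    assert len(bad_df) == 0
    return check_df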
def get_diagnostic_dataframe(self):
    """Return diagnostics."""
    # important to run this full method first
    df = self.get_computed_dataframe()
    # cause metadata was changed in the process, so merge it on again
    # to see parent_ids and cause levels
    df = add_cause_metadata(df, ['parent_id', 'level'],
                            merge_col='cause_id',
                            cause_meta_df=self.cause_meta_df)
    return df
def assign_code_to_created_target_deaths(df, code_system_id, cause_meta_df):
    created = df[df['_merge'] == 'right_only']
    original = df[df['_merge'] != 'right_only']

    created = add_cause_metadata(created, 'acause',
                                 cause_meta_df=cause_meta_df)
    created['value'] = created['acause'].apply(lambda x: 'acause_' + x)
    created.drop(['code_id', 'acause'], axis=1, inplace=True)
    created = add_code_metadata(created, 'code_id',
                                code_system_id=code_system_id,
                                merge_col='value',
                                cache_dir=CONF.get_directory('db_cache'),
                                force_rerun=False, block_rerun=True)
    report_if_merge_fail(created, 'code_id', ['value'])

    # DataFrame.append was removed from pandas; concat the pieces instead
    df = pd.concat([original, created])
    df.drop(['_merge', 'value'], axis=1, inplace=True)
    return df
def drop_age_restricted_cols(df):
    start = len(df)
    age_meta_df = get_ages(force_rerun=False, block_rerun=True)
    # secret causes are in the restrictions
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_id=4, block_rerun=True, force_rerun=False)
    restrict_df = pd.read_csv(
        "/homes/agesak/thesis/maps/injuries_overrides.csv")
    restrict_df = add_cause_metadata(restrict_df, add_cols='cause_id',
                                     merge_col='acause',
                                     cause_meta_df=cause_meta_df)
    restrict_df["age_start_group"] = restrict_df["age_start_group"].fillna(0)

    orig_cols = df.columns
    df = add_age_metadata(
        df, add_cols=['age_group_years_start', 'age_group_years_end'],
        age_meta_df=age_meta_df)
    df = df.merge(restrict_df, on='cause_id', how='left')

    # age_group_years_end is exclusive: the 0-14 group has
    # age_group_years_end 15
    too_young = df["age_group_years_end"] <= df["age_start_group"]
    too_old = df["age_group_years_start"] > df["age_end_group"]
    df = df[~(too_young | too_old)]
    df = df[orig_cols]

    end = len(df)
    print_log_message(
        f"dropping {start - end} rows that violate age restrictions")
    return df
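# A minimal sketch (illustration only) of the filter in
# drop_age_restricted_cols: a row is dropped when its age group ends at or
# below the cause's start age, or starts above the cause's end age. The
# restriction bounds below are hypothetical.
def _demo_age_restriction_filter():
    import pandas as pd

    df = pd.DataFrame({
        'age_group_years_start': [10.0, 15.0, 80.0],
        'age_group_years_end': [15.0, 20.0, 85.0],
        'age_start_group': [15.0] * 3,  # cause restricted to ages 15+
        'age_end_group': [60.0] * 3,    # ...through 60
    })
    too_young = df['age_group_years_end'] <= df['age_start_group']
    too_old = df['age_group_years_start'] > df['age_end_group']
    return df[~(too_young | too_old)]  # keeps only the 15-20 row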
def drop_source_data(df, model_group, location_hierarchy, cause_meta_df):
    """Drop source-specific data from the model dataframe.

    Note: technically only the DHS survey types in Other_Maternal should
    have a location aggregation going into nr; however, that messes up
    raking later.
    """
    # for India VA
    if model_group == "VA-IND":
        srs = df['source'].str.startswith("India_SRS")
        scd = df['source'].str.startswith("India_SCD")
        df = df[~(srs | scd)]

    # for the South Asia VA model group, Nepal_Burden_VA needs to be dropped
    if model_group == 'VA-158':
        df = df[df.source != "Nepal_Burden_VA"]

    # special conditions for maternal sources
    if model_group.startswith("MATERNAL"):
        # grab countries for which we produce subnational estimates
        ihme_loc_dict = location_hierarchy.set_index(
            'location_id')['ihme_loc_id'].to_dict()
        df['iso3'] = df['location_id'].map(ihme_loc_dict)
        subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
        agg_locs = df['iso3'].isin(subnational_modeled_iso3s)
        # sources allowed to have location aggregates in model_df
        agg_sources = ["Other_Maternal", "Mexico_BIRMM"]
        no_loc_agg_source = ~(df['source'].isin(agg_sources))
        # drop rows where the source shouldn't have a location aggregate
        df = df[~(no_loc_agg_source & agg_locs)]
        # clean up extra columns
        df.drop('iso3', axis=1, inplace=True)

    # these causes were likely introduced when adding in rd variance
    df = add_cause_metadata(df, 'yld_only', cause_meta_df=cause_meta_df)
    df = df.loc[df['yld_only'] != 1]
    df = df.drop('yld_only', axis=1)

    # VR data for Bolivia is dropped due to low completeness; keep only Chagas
    if model_group == 'VR-BOL':
        chagas = df['cause_id'] == 346
        df = df.loc[chagas]

    # for the malaria model groups, only keep malaria
    malaria = df['cause_id'] == 345
    if model_group.startswith('malaria'):
        df = df.loc[malaria]

    # there are two model groups for India, but national India shouldn't be
    # duplicated: drop national-level India data from this malaria model
    # group so it is only included in the malaria_IND_mesoendem group
    if model_group in ["malaria_IND_hypoendem", "malaria_IND_SRS_hypoendem"]:
        df = df.query('location_id != 163')

    # similar fix for Telangana: drop from mesoendem so it's in hypoendem
    if model_group in ["malaria_IND_mesoendem", "malaria_IND_SRS_mesoendem"]:
        df = df.loc[~(df['location_id'].isin([43902, 43938]))]

    return df