def aggregate_locations(self):
    """Aggregate sub national location_ids up to the country level.

    Starting from the most detailed location level present in the data,
    rows are reassigned to their parent location and appended, repeating
    until everything has been rolled up to national (level 3) locations.

    Returns:
        pd.DataFrame: self.df plus appended rows for each aggregation
        step up to the country level.
    """
    # parent_id is what rows roll up to; level drives the loop order
    df = add_location_metadata(
        self.df, ['parent_id', 'level'],
        merge_col='location_id',
        location_meta_df=self.location_meta_df)
    # don't need larger than country level aggregation, so only levels
    # strictly below national (level > 3) are rolled up.
    # NOTE: the original tested the whole Series (`if df['level'] >= 3`),
    # which raises ValueError; the element `x` must be tested instead.
    loc_levels = sorted((x for x in df['level'].unique() if x > 3),
                        reverse=True)
    for level in loc_levels:
        level_df = df[df['level'] == level].copy()
        if len(level_df) > 0:
            # replace the location with the parent_id
            level_df['location_id'] = level_df['parent_id']
            level_df['level'] = level - 1
            # drop and re-add parent_id for the next round of aggregation
            level_df.drop('parent_id', axis=1, inplace=True)
            level_df = add_location_metadata(
                level_df, ['parent_id'], merge_col=['location_id'],
                location_meta_df=self.location_meta_df)
            # add in deaths by each level (pd.concat: df.append is
            # removed in pandas >= 2.0)
            df = pd.concat([df, level_df], ignore_index=True)
    return df
def get_completeness(self):
    """Retrieve VR completeness.

    VR completeness is not calculated here so that that script can be
    shared with the demographics team.
    """
    comp_df = pd.read_csv(self.completeness)
    comp_df = add_location_metadata(
        comp_df, 'ihme_loc_id', location_meta_df=self.location_meta_df)
    # the completeness csv has duplicates for some location/years within
    # mainland China for "sample" and "envelope" denominators;
    # we want to use "sample" aka DDM for all of mainland China
    mainland_china = comp_df['ihme_loc_id'].str.startswith("CHN") & \
        ~comp_df['ihme_loc_id'].isin(["CHN_354", "CHN_361"])
    sample_denom = comp_df['denominator'] == "sample"
    keep = (mainland_china & sample_denom) | \
        (~mainland_china & ~sample_denom)
    comp_df = comp_df.loc[keep]
    assert not comp_df[['ihme_loc_id', 'year_id']].duplicated().any()
    # cap completeness at 1 (NaN passes through unchanged, as before)
    comp_df['comp'] = comp_df['comp'].clip(upper=1)
    return comp_df[['location_id', 'year_id', 'comp']]
def check_subnational_locations(df):
    """Deactivate national observations for subnationally modeled countries.

    Any nid/extract_type_id whose data sits at the national level of a
    country we model subnationally gets is_active set to 0.

    Args:
        df (pd.DataFrame): must have nid, extract_type_id, location_id,
            and is_active columns.

    Returns:
        pd.DataFrame: df with ihme_loc_id merged on and is_active updated.
    """
    subnational_iso3s = CONF.get_id('subnational_modeled_iso3s')
    # Ukraine is exempt: its national data is location-split later in prep
    if 'UKR' in subnational_iso3s:
        subnational_iso3s.remove('UKR')
    df = add_location_metadata(
        df, 'ihme_loc_id',
        location_set_version_id=CONF.get_id('location_set_version'))
    nid_etid_pairs = [
        tuple(pair) for pair in
        df[['nid', 'extract_type_id']].drop_duplicates()
        .to_records(index=False)
    ]
    for nid, etid in nid_etid_pairs:
        check_loc = list(
            df.loc[(df['nid'] == nid) & (df['extract_type_id'] == etid),
                   'ihme_loc_id'].unique())[0]
        # explicit check instead of try/assert/bare-except: the old form
        # disappeared under `python -O` and swallowed unrelated errors
        if check_loc in subnational_iso3s:
            print("Found {} where we model subnationally "
                  "for NID {} extract_type_id {}. Setting is_active"
                  " to False".format(check_loc, nid, etid))
            df.loc[(df['nid'] == nid) &
                   (df['extract_type_id'] == etid), 'is_active'] = 0
    return df
def prep_stars(self, df):
    """Calculate stars related indicators.

    Runs self.calculate_stars, decorates the result with location
    metadata, appends a global ("world") row, and swaps parent_id for
    the parent's ihme_loc_id.
    """
    print_log_message("Calculating stars")
    stars_df = self.calculate_stars(df)
    # formatting pieces, conforming to previous versions
    stars_df = add_location_metadata(
        stars_df, [
            "ihme_loc_id", "level", "location_ascii_name", "sort_order",
            "parent_id"
        ],
        location_meta_df=self.location_meta_df)
    stars_df = stars_df.rename(
        columns={
            "sort_order": "location_sort_order",
            "level": "location_level",
            "location_ascii_name": "location_name"
        })
    print_log_message("Calculating global stars, because why not")
    world_stars = self.measure_the_world(stars_df)
    stars_df = pd.concat([stars_df, world_stars], ignore_index=True)
    # add parent ihme_loc_id instead of parent_id; parents missing from
    # the hierarchy map to NaN
    ihme_loc_id_dict = self.location_meta_df.set_index(
        "location_id")["ihme_loc_id"].to_dict()
    stars_df["parent_ihme_loc_id"] = \
        stars_df["parent_id"].map(ihme_loc_id_dict)
    stars_df.drop("parent_id", axis=1, inplace=True)
    # NOTE(review): .any() only asserts that *some* cell is non-null,
    # which is almost always true. If the intent is "no nulls at all"
    # this should be .all() — but .all() may fail legitimately on
    # parent_ihme_loc_id for the global row; confirm before changing.
    assert stars_df.notnull().values.any()
    return stars_df
def adjust_representative_id(df):
    """Change representative_id to 1 for non-VR subnational data.

    Applies only to subnational rows (ihme_loc_id like 'PAK_534') of the
    override countries, and only to non-VR data types.

    Args:
        df (pd.DataFrame): must have data_type_id and representative_id;
            ihme_loc_id is merged on if absent.

    Returns:
        pd.DataFrame: df with an added iso3 column and adjusted
        representative_id.
    """
    if 'ihme_loc_id' not in df.columns:
        df = add_location_metadata(
            df, 'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
    df['iso3'] = df['ihme_loc_id'].str[0:3]
    override_locs = ['PAK', 'NGA', 'PHL']
    vr_data_types = [9, 10]
    # TO DO
    # this function should be flexible enough to:
    # 1) check if there are VR data for this specific location/year
    # 2) change represenative id to 0 for non-VR if there is VR
    # 3) change representative id to 1 ONLY if there is no VR present
    # 4) adding a print statement/documentation of loc/years changed would be nice
    # BUG FIX: `&` binds tighter than `>`, so the original
    # `(... .str.len()) > 4 & (...)` never actually tested the string
    # length and reduced the data-type test to a constant. Each condition
    # is now fully parenthesized.
    df.loc[(df['iso3'].isin(override_locs)) &
           (df['ihme_loc_id'].str.len() > 4) &
           (~df['data_type_id'].isin(vr_data_types)),
           'representative_id'] = 1
    return df
def format_for_nr(df, location_hierarchy):
    """Merge on needed location metadata.

    Adds country_id, subnat_id, and iso3 (suffixed "_subnat" for
    subnational locations), flags countries modeled subnationally with
    is_loc_agg, and drops rows with zero sample size.
    """
    locs = df[['location_id']].drop_duplicates()
    locs = add_location_metadata(
        locs, add_cols=["ihme_loc_id", "path_to_top_parent"],
        merge_col="location_id", location_meta_df=location_hierarchy
    )
    report_if_merge_fail(locs, 'path_to_top_parent', 'location_id')
    # path_to_top_parent is "global,super_region,region,country[,...]",
    # so element 3 is the country location_id
    locs['country_id'] = locs['path_to_top_parent'].str.split(",").apply(
        lambda x: int(x[3]))
    locs['subnat_id'] = locs['ihme_loc_id'].apply(
        lambda x: int(x.split("_")[1]) if "_" in x else 0)
    locs['iso3'] = locs['ihme_loc_id'].str.slice(0, 3)
    different_locations = locs['country_id'] != locs['location_id']
    locs.loc[different_locations, 'iso3'] = \
        locs['iso3'].apply(lambda x: x + "_subnat")
    locs = locs[['location_id', 'country_id', 'subnat_id', 'iso3']]
    df = df.merge(locs, on='location_id', how='left')
    # BUG FIX: validate the merge that was just performed into `df`; the
    # original re-checked `locs`, which could never fail here
    report_if_merge_fail(df, 'country_id', 'location_id')
    # create indicator column for running a separate nr model
    # for national and subnational locations
    subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
    df['is_loc_agg'] = 0
    df.loc[df['iso3'].isin(subnational_modeled_iso3s), 'is_loc_agg'] = 1
    # remove 0 sample size rows
    df = df.loc[df['sample_size'] > 0]
    return df
def calculate_positive_excess(df, value_cols, pop_col, loc_meta_df):
    """Calculate the positive excess for flagged observations."""
    df = add_location_metadata(df, 'ihme_loc_id',
                               location_meta_df=loc_meta_df,
                               cache_dir=CONF.get_directory('db_cache'))
    df['iso3'] = df['ihme_loc_id'].apply(lambda loc: loc[:3])
    df = df.drop('ihme_loc_id', axis=1)
    # expected deaths once excess is removed, for flagged rows only
    flagged = df['flagged'] == 1
    df.loc[flagged, 'deaths_post'] = (
        df['baserate_deaths'] * df[pop_col] * df['rrateWLD'])
    # positive excess = observed minus expected
    df.loc[flagged, 'positive_excess'] = df['deaths'] - df['deaths_post']
    # negative excess, or an unflagged row, means nothing is removed
    no_pe = (df['positive_excess'] < 0) | (df['positive_excess'].isnull())
    df.loc[no_pe, 'deaths_post'] = df['deaths']
    df.loc[no_pe, 'positive_excess'] = 0
    df = apply_ussr_tb_proportions(df)
    # Leave tuberculosis over age 60 untouched
    # (cause 297 is `tb`, 934 is `tb_other`)
    old_tb = df['cause_id'].isin([297, 934]) & (df['age_group_id'] > 17)
    df.loc[old_tb, 'positive_excess'] = 0
    df.loc[old_tb, 'deaths_post'] = df['deaths']
    return df
def get_env_by_time_window(self):
    """Get an envelope by time window."""
    # keep 1980 onwards
    windowed = self.env_meta_df.query('year_id >= 1980')
    # label each year with its time window
    windowed['time_window'] = windowed['year_id'].apply(
        self.assign_time_group)
    windowed = windowed.groupby(
        ['location_id', 'time_window'], as_index=False)['mean_env'].sum()
    # restrict to national (level 3) locations
    windowed = add_location_metadata(
        windowed, "level", location_meta_df=self.location_meta_df)
    windowed = windowed.query('level == 3')
    windowed = windowed[['location_id', 'time_window', 'mean_env']]
    # prepend a full 1980-onwards envelope per location
    full_series = windowed.groupby(
        ['location_id'], as_index=False)['mean_env'].sum()
    full_series['time_window'] = self.full_time_series
    return pd.concat([full_series, windowed], ignore_index=True)
def aggregate_locations(self):
    """Aggregate sub national location_ids up to the country level,
    and create any intermediate aggregates in between.

    Returns:
        pd.DataFrame: self.df with aggregate rows appended for each
        location level between the most detailed level present and
        national (level 3); aggregate rows get site_id 2.
    """
    df = add_location_metadata(self.df, ['parent_id', 'level'],
                               merge_col='location_id',
                               location_meta_df=self.location_meta_df)
    # get the max location level in the data
    max_level = df['level'].max()
    # want to aggregate all levels between national and max level
    # if the entire dataframe is already national, no need to aggregate
    if max_level > 3:
        # BUG FIX: range objects have no .reverse() in Python 3; build
        # the descending list of levels directly
        loc_levels = list(range(max_level, 3, -1))
        for level in loc_levels:
            # basic strategy: start with largest level and aggregate to
            # parent location; repeat with the next highest level until
            # reaching national
            level_df = df.loc[df['level'] == level].copy()
            if len(level_df) > 0:
                # replace the location with the parent_id
                level_df['location_id'] = level_df['parent_id']
                # explicit level - 1 (the original relied on index
                # alignment against df['level'])
                level_df['level'] = level - 1
                # need to drop and re-add parent_id for next level of
                # aggregation
                level_df.drop('parent_id', axis=1, inplace=True)
                level_df = add_location_metadata(
                    level_df, ['parent_id'], merge_col=['location_id'],
                    location_meta_df=self.location_meta_df)
                # collapse the level_df down; aggregates get site_id 2
                group_cols = [
                    col for col in level_df.columns
                    if col not in self.val_cols
                ]
                group_cols.remove('site_id')
                level_df = level_df.groupby(
                    group_cols, as_index=False)[self.val_cols].sum()
                level_df['site_id'] = 2
                # append aggregated deaths to original df (pd.concat:
                # df.append is removed in pandas >= 2.0)
                df = pd.concat([df, level_df], ignore_index=True)
        df.drop(['parent_id', 'level'], axis=1, inplace=True)
    else:
        print("No locations more detailed than national to aggregate")
    return df
def add_reg_location_metadata(df, location_hierarchy):
    """Attach region/super region ids and rename to regression columns."""
    df = add_location_metadata(df, ['region_id', 'super_region_id'],
                               location_meta_df=location_hierarchy)
    # validate each merge, then cast to int (merge produces floats)
    for id_col in ['region_id', 'super_region_id']:
        report_if_merge_fail(df, id_col, 'location_id')
        df[id_col] = df[id_col].astype(int)
    df = df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country'
    })
    # constant column for the global level of the hierarchy
    df['global'] = 1
    return df
def prep_vr_indicators(self, df):
    """Prep outputs for VR by location/year.

    This "view" is used for data drops. ALso passed to SDGs.
    """
    # VR data types only (9 = VR, 10 presumably VR-adjacent — confirm)
    is_vr = df['data_type_id'].isin([9, 10])
    keep_cols = [
        'location_id', 'year_id', 'data_type_id', 'nid', 'source',
        'nationally_representative', 'pctgarbage', 'pctagr', 'comp'
    ]
    vr_df = df.loc[is_vr, keep_cols]
    # one row per location/year/data type expected
    assert not vr_df[['location_id', 'year_id', 'data_type_id'
                      ]].duplicated().any()
    vr_df = add_location_metadata(vr_df, 'ihme_loc_id',
                                  location_meta_df=self.location_meta_df)
    # added this column for the SDGs team
    vr_df['pct_well_cert'] = vr_df['comp'] * (1 - vr_df['pctgarbage'] -
                                              vr_df['pctagr'])
    # set variable logic for clarity
    sub_rep = vr_df['nationally_representative'] == 0
    nigeria = vr_df['source'] == "Nigeria_VR"
    mccd = vr_df['source'].str.startswith("India_MCCD")
    garbage_over_50 = vr_df['pctgarbage'] > .5
    comp_under_50 = vr_df['comp'] < .5
    comp_over_20 = vr_df['comp'] > .2
    comp_under_70 = (vr_df['comp'] < .7)
    # set drop_status variable based on completeness and garbage.
    # NOTE: the following assignments are order-dependent — each one
    # overrides the previous for the rows it matches.
    vr_df['drop_status'] = 0
    # base rule: drop if garbage-heavy or very incomplete
    vr_df.loc[garbage_over_50 | comp_under_50, 'drop_status'] = 1
    # USERNAME made the decision to stop making an exception for this source (12/19/2018)
    # vr_df.loc[nigeria, 'drop_status'] = 0
    # exception: keep India MCCD with some completeness and low garbage
    vr_df.loc[mccd & comp_over_20 & ~garbage_over_50, 'drop_status'] = 0
    # exception: keep subnationally-representative data with low garbage
    vr_df.loc[sub_rep & ~garbage_over_50, 'drop_status'] = 0
    # This source has been manually dropped since GBD 2019 decomp step 1 - now
    # adding this drop to the code (7/23/2019)
    vr_df.loc[nigeria, 'drop_status'] = 1
    # binary indicator for completeness under 70%
    vr_df['comp_under_70'] = 0
    vr_df.loc[comp_under_70, 'comp_under_70'] = 1
    return vr_df
def make_india_state_aggregates(pafs, locations):
    """
    Make state level aggregates for India.

    Will append these in addition to the national aggregate
    """
    # restrict pafs data to just india urban/rural subnats (level 5)
    is_detail_india = (locations.ihme_loc_id.str.startswith('IND')) & \
        (locations.level == 5)
    detail_ids = locations.loc[
        is_detail_india]['location_id'].unique().tolist()
    pafs = pafs.loc[pafs.location_id.isin(detail_ids)]
    # roll each urban/rural unit up to its parent state
    pafs = add_location_metadata(pafs, 'parent_id',
                                 location_meta_df=locations)
    pafs['location_id'] = pafs['parent_id']
    pafs.drop('parent_id', axis=1, inplace=True)
    # collapse, averaging each draw across the units of a state
    draw_cols = [c for c in pafs.columns if 'draw' in c]
    state_aggs = pafs.groupby(
        ['age_group_id', 'location_id', 'year']
    )[draw_cols].mean().reset_index()
    return state_aggs
def prep_star_level_location_hierarchy(self, df):
    """
    Output a file used by central comp to populate star based location
    hierarchy. This file contains only most detailed locations. Each
    location has a parent_id (either 1 or 0) indicating data rich or
    data sparse
    """
    # add information for most detailed
    df = add_location_metadata(df, 'most_detailed')
    # limit to most detailed locs and the full time series
    keep = (df['time_window'] == self.full_time_series) & \
        (df['most_detailed'] == 1)
    df = df.loc[keep]
    # 4-5 stars -> data rich (parent 1); 0-3 stars -> data sparse (parent 0)
    df.loc[df['stars'].isin([4, 5]), 'parent_id'] = 1
    df.loc[df['stars'].isin([0, 1, 2, 3]), 'parent_id'] = 0
    assert df['parent_id'].notnull().all()
    return df[['ihme_loc_id', 'location_id', 'parent_id',
               'time_window', 'stars']]
def decomp_one_remap_location_id(self, loc_ids):
    """For decomp step one, there is no dismod data for new locations
    or new subnationals. For new subnationals, we will use dismod data
    for the subnational's parent. For new locations we will use dismod
    data from a neighboring region country. This function/step should
    be removed in step two (12/17/2018)"""
    df = pd.DataFrame({'location_id': loc_ids})
    df = add_location_metadata(
        df, 'parent_id',
        location_set_version_id=self.conf.get_id('location_set_version'),
        **self.block_rerun)
    report_if_merge_fail(df, 'parent_id', 'location_id')
    # new subnationals: use the parent location instead
    has_new_subnat_parent = df['parent_id'].isin([16, 51, 86, 165, 214])
    df.loc[has_new_subnat_parent, 'location_id'] = df['parent_id']
    # new locations: borrow a neighboring country in the region
    # Saint Kitts -> Dominican Republic
    df.loc[df['location_id'] == 393, 'location_id'] = 111
    # Monaco and San Marino -> Italy
    df.loc[df['location_id'].isin([367, 396]), 'location_id'] = 86
    # Palau and Cook Islands -> Papua New Guinea
    df.loc[df['location_id'].isin([320, 380]), 'location_id'] = 26
    return df['location_id'].unique().tolist()
def check_subnational_locations(df):
    """Check that there are no national observations where we model
    subnationally.

    Use the list of subnationally modeled countries to ensure that there
    are no national observations. This will raise an assertion error, and
    the user should either re extract subnational information from the
    original data or accept that it will remain inactive.
    """
    subnational_iso3s = CONF.get_id('subnational_modeled_iso3s')
    # Ukraine is a special case: we're splitting the national Ukraine
    # data to Ukraine without crimea/sevastopol later in the prep
    # process (see count_adjustments.py), so leave it active here
    if "UKR" in subnational_iso3s:
        subnational_iso3s.remove('UKR')
    df = add_location_metadata(
        df, 'ihme_loc_id',
        location_set_version_id=CONF.get_id('location_set_version'))
    pairs = df[['nid', 'extract_type_id']].drop_duplicates()
    for nid, etid in pairs.itertuples(index=False):
        in_pair = (df['nid'] == nid) & (df['extract_type_id'] == etid)
        check_loc = df.loc[in_pair, 'ihme_loc_id'].unique()[0]
        if check_loc in subnational_iso3s:
            print("Found {} where we model subnationally "
                  "for NID {} extract_type_id {}. Setting is_active"
                  " to False".format(check_loc, nid, etid))
            df.loc[in_pair, 'is_active'] = 0
    return df
def calculate_positive_excess(df, value_cols, pop_col, loc_meta_df):
    """Calculate the positive excess for flagged observations.

    For rows where flagged == 1, computes the expected deaths after
    removing excess (deaths_post) and the excess itself
    (positive_excess); unflagged rows and negative excess are left
    unchanged (excess 0).
    """
    df = add_location_metadata(df, 'ihme_loc_id',
                               location_meta_df=loc_meta_df,
                               cache_dir=CONF.get_directory('db_cache'))
    df['iso3'] = df['ihme_loc_id'].apply(lambda x: x[:3])
    df = df.drop('ihme_loc_id', axis=1)
    # expected deaths for flagged rows: base rate * population * world
    # relative rate
    df.loc[
        df['flagged'] == 1,
        'deaths_post'] = df['baserate_deaths'] * df[pop_col] * df['rrateWLD']
    # calculate the positive excess as observed minus expected
    df.loc[df['flagged'] == 1,
           'positive_excess'] = df['deaths'] - df['deaths_post']
    # where positive excess is negative, or flagged is 0 (null excess),
    # keep the observed deaths and record zero excess
    no_pe = (df['positive_excess'] < 0) | (df['positive_excess'].isnull())
    df.loc[no_pe, 'deaths_post'] = df['deaths']
    df.loc[no_pe, 'positive_excess'] = 0
    df = apply_ussr_tb_proportions(df)
    # Leave tuberculosis over age 60 untouched:
    # cause 297 is `tb` and 934 is `tb_other`
    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'positive_excess'] = 0
    df.loc[(df['cause_id'].isin([297, 934])) & (df['age_group_id'] > 17),
           'deaths_post'] = df['deaths']
    return df
def map_extract_type_id(df, source, extract_type, conn_def='ADDRESS'):
    """Assign an extract_type_id to every row of df.

    Resolves an extract type per nid (explicit, induced, or a special
    case for China DSP), maps it to an id via the database, splits
    multi-country nids (SPLIT_ISO_NIDS) into per-iso3 extract types,
    and finally applies a series of hard-coded source/nid overrides.

    Returns:
        pd.DataFrame: df with an extract_type_id column added.
    """
    nid_extract_types = {}
    start_extract_type = extract_type
    for nid in df.nid.unique():
        if source == "China_DSP_prov_ICD10":
            extract_type = "admin1: DSP sites only"
        elif start_extract_type is None:
            # no extract type given: induce one from the nid's data
            extract_type = None
            extract_type = induce_extraction_type(
                df.loc[df['nid'] == nid], conn_def=conn_def)
        else:
            extract_type = extract_type
        extract_type = str(extract_type).strip()
        # Determine (or create) the extract_type_id
        extract_type_id = pull_or_create_extract_type(extract_type,
                                                      conn_def=conn_def)
        assert extract_type_id is not None, "Weird fail"
        nid_extract_types[nid] = extract_type_id
    df['extract_type_id'] = df['nid'].map(nid_extract_types)
    report_if_merge_fail(df, 'extract_type_id', 'nid')
    # nids spanning multiple countries get one extract type per iso3
    if (df['nid'].isin(SPLIT_ISO_NIDS).any()):
        nid_loc_df = df.loc[df['nid'].isin(SPLIT_ISO_NIDS),
                            ['nid', 'location_id']].drop_duplicates()
        nid_loc_df = add_location_metadata(
            nid_loc_df, 'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
        nid_loc_df['iso3'] = nid_loc_df['ihme_loc_id'].str.slice(0, 3)
        nid_loc_df['extract_type'] = nid_loc_df.apply(
            lambda x: "{nid}: {iso3} data".format(nid=x['nid'],
                                                  iso3=x['iso3']),
            axis=1)
        extract_types = nid_loc_df[['extract_type']].drop_duplicates()
        extract_types['extract_type_id_new'] = \
            extract_types['extract_type'].apply(
                lambda x: pull_or_create_extract_type(x, conn_def=conn_def))
        nid_loc_df = nid_loc_df.merge(extract_types, on='extract_type',
                                      how='left')
        report_if_merge_fail(nid_loc_df, 'extract_type_id_new',
                             'extract_type')
        df = df.merge(nid_loc_df, on=['nid', 'location_id'], how='left')
        report_if_merge_fail(df.loc[df['nid'].isin(SPLIT_ISO_NIDS)],
                             'extract_type_id_new', ['nid', 'location_id'])
        # overwrite only the split nids with their per-iso3 ids
        df.loc[df['nid'].isin(SPLIT_ISO_NIDS),
               'extract_type_id'] = df['extract_type_id_new']
        df = df.drop(
            ['extract_type_id_new', 'iso3', 'ihme_loc_id', 'extract_type'],
            axis=1)
    # hard-coded per-source overrides; these ids already exist in the
    # extract type table
    if source == "VA_lit_GBD_2010_2013":
        code_system_to_extract_type_id = {
            244: 350,
            273: 347,
            294: 353,
            296: 356,
        }
        df.loc[df['nid'].isin([93570, 93664]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)
    # police data fixes
    elif source == "Various_RTI":
        code_system_to_extract_type_id = {
            237: 719,
            241: 722,
        }
        df.loc[df['nid'].isin([93599]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)
    # the Matlab sources all share NID 935, so distinguish by source
    elif source == 'Matlab_1963_1981':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 380
    elif source == 'Matlab_1982_1986':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 383
    elif source == 'Matlab_1987_2002':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 371
    elif source == 'Matlab_2003_2006':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 374
    elif source == 'Matlab_2007_2012':
        df.loc[df['nid'] == 935, 'extract_type_id'] = 377
    elif source == "Combined_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 479
    elif source == "Maternal_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 482
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 485
    elif source == "Other_Maternal":
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 488
    # the two Pakistan DHS sources share a NID
    elif source == "Pakistan_maternal_DHS_2006":
        df["extract_type_id"] = 539
    elif source == "Pakistan_child_DHS_2006":
        df["extract_type_id"] = 536
    # overrides for specific data types, keyed by code system
    if (df['data_type_id'].isin([5, 6, 7]).any()):
        csetid = {
            49: 440, 50: 443, 52: 386, 162: 446, 163: 449, 155: 455,
            148: 458, 149: 461, 150: 464, 156: 467, 577: 470
        }
        df.loc[df['nid'].isin([24134, 108611, 125702, 132803, 154012]),
               'extract_type_id'] = df['code_system_id'].map(csetid)
        data_type_dict = {5: 473, 7: 476}
        df.loc[df['code_system_id'] == 154,
               'extract_type_id'] = df['data_type_id'].map(data_type_dict)
    return df
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS',
                        is_active=True):
    """Finalize the formatting phase for a source and optionally persist it.

    Validates incoming columns, maps code/site/extract-type ids, applies
    location fixes, builds the nid metadata and nid location-year tables,
    and — when write=True — writes the per-nid phase output and metadata
    tables, then refreshes the nid caches.

    Returns:
        dict: locals() of this function (callers pick out what they need).
    """
    # columns for the claude_nid_metadata table
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    # columns for the claude_nid_location_year table
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    # id columns of the final formatted output
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    # either code_id is already present, or a raw 'cause' must be mapped
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            """These columns are needed for formatting but not found in df: {} """.format(missing_cols))
    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))
    # ADD SOURCE
    df['source'] = source
    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        # map raw cause values to code_id, one code system at a time
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            # surface any unmapped causes before the merge check fails
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE
        # CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        # warn only; does not raise
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))
    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)
    # hard-coded nid/year drops
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]
    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    # full-row duplicates are an error ...
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))
    # GROUP IF NECESSARY
    # ... but duplicated id columns with differing values get summed
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()
    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan
    # coerce the is_active flag to 0/1
    df['is_active'] = 1 * is_active
    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)
    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp
    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)
    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        insert_source_id(source)
        # write the formatted phase output, one nid/extract_type at a time
        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                              launch_set_id)
        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)
    return locals()
def prepare_square_df(df, location_hierarchy, reg_spec):
    """Build a square (all locations x years x ages x sexes x causes)
    frame of garbage proportions, plus the covariates the given
    regression spec needs.

    Args:
        df (pd.DataFrame): observed data with age, sex_id, cause_id,
            year_id, and prop_garbage_bigage columns.
        location_hierarchy (pd.DataFrame): location metadata.
        reg_spec (dict): regression spec; 'name_short' selects which
            covariates to add.

    Returns:
        pd.DataFrame: the square frame, sorted by location hierarchy.
    """
    # cross join all level-3 locations with the ages/sexes/causes/years
    # present in the data, via a constant join key
    locations = location_hierarchy.query(
        'level == 3')[['location_id']].drop_duplicates()
    ages = df[['age']].drop_duplicates()
    sexes = df[['sex_id']].drop_duplicates()
    targets = df[['cause_id']].drop_duplicates()
    years = df[['year_id']].drop_duplicates()
    join_dfs = [locations, years, ages, sexes, targets]
    square_df = join_dfs[0]
    square_df['join_key'] = 1
    for join_df in join_dfs[1:]:
        join_df['join_key'] = 1
        square_df = square_df.merge(join_df, on='join_key', how='left')
    square_df = square_df.drop('join_key', axis=1)
    square_df = square_df.rename(columns={'location_id': 'country'})
    # will predict based on prop unspecified, but within 15-year age groups
    unspec_props_id_cols = ['country', 'year_id', 'age', 'sex_id']
    unspec_props = df[
        unspec_props_id_cols + ['prop_garbage_bigage']
    ].drop_duplicates()
    report_duplicates(unspec_props, unspec_props_id_cols)
    # fill with age-sex median garbage unspecified where we don't have the
    # location-year
    median_unspec_age_sex = unspec_props.groupby(
        ['age', 'sex_id'], as_index=False
    )['prop_garbage_bigage'].median()
    median_unspec_age_sex = median_unspec_age_sex.rename(
        columns={'prop_garbage_bigage': 'agesex_median_prop_garbage'}
    )
    # add proportion unspecified by location-year where possible
    square_df = square_df.merge(
        unspec_props, on=unspec_props_id_cols, how='left'
    )
    # use age-sex where necessary
    square_df = square_df.merge(
        median_unspec_age_sex, on=['age', 'sex_id'], how='left'
    )
    # this should never have missings
    report_if_merge_fail(square_df, 'agesex_median_prop_garbage',
                         ['age', 'sex_id'])
    square_df['prop_garbage'] = square_df['prop_garbage_bigage'].fillna(
        square_df['agesex_median_prop_garbage']
    )
    # create all-year proportion garbage column
    square_df['prop_garbage_all_years'] = square_df.groupby(
        ['country', 'age', 'sex_id', 'cause_id']
    )['prop_garbage'].transform(np.mean)
    # create year window proportion garbage col
    square_df = add_year_window(square_df)
    square_df['prop_garbage_year_window'] = square_df.groupby(
        ['country', 'age', 'sex_id', 'cause_id', 'year_window']
    )['prop_garbage'].transform(np.mean)
    # decorate with region/super region ids for the regression hierarchy
    square_df = square_df.rename(columns={'country': 'location_id'})
    square_df = add_location_metadata(
        square_df, ['region_id', 'super_region_id'],
        location_meta_df=location_hierarchy
    )
    square_df = square_df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country',
    })
    square_df['super_region'] = square_df['super_region'].astype(int)
    square_df['region'] = square_df['region'].astype(int)
    square_df['global'] = 1
    square_df = square_df[
        ['global', 'super_region', 'region', 'country', 'year_id',
         'year_window', 'age', 'sex_id', 'cause_id', 'prop_garbage',
         'prop_garbage_all_years', 'prop_garbage_year_window']
    ]
    square_df = add_model_group(square_df)
    square_df = square_df.sort_values(
        by=['global', 'super_region', 'region', 'country', 'year_id',
            'age', 'sex_id', 'cause_id']
    )
    # covariates are specific to the regression being prepared; negative
    # covariate ids are custom (non-central) covariates
    if reg_spec['name_short'] == "unspec_diab":
        # haqi
        print_log_message("Adding haqi")
        square_df = add_covariate(square_df, 1099, "haqi", by_sex=False)
        # age-standardized diabetes prevalence
        print_log_message("Adding age-standardized diabetes prevalence")
        square_df = add_covariate(square_df, 29, "diabetes_prev")
        # under-15 age-standardized death rate due to diabetes
        print_log_message("Adding asdr due to diabetes under 15")
        square_df = add_covariate(square_df, 1249, "asdr_dm_015")
        # age-specific obesity
        print_log_message("Adding age-standardized prevalence of obesity")
        square_df = add_covariate(square_df, 455, "prev_obesity_agestd")
        # age-specific type-1 diabetes prevalence
        print_log_message("Adding type 1 diabetes prevalence")
        square_df = add_covariate(square_df, -4, "prev_type1")
    if reg_spec['name_short'] == "unspec_stroke":
        # cholesterol
        print_log_message("Adding cholesterol")
        square_df = add_covariate(square_df, 69, "cholesterol")
        # blood pressure
        print_log_message("Adding blood pressure")
        square_df = add_covariate(square_df, 70, "blood_pressure")
        # age-specific LDL
        print_log_message("Adding age-specific LDL")
        square_df = add_covariate(square_df, -1, "ldl_age_specific")
        # age-specific blood pressure
        print_log_message("Adding age-specific blood pressure")
        square_df = add_covariate(square_df, -2,
                                  "blood_pressure_age_specific")
        # stroke 28 day survivorship
        print_log_message("Adding percent survivor")
        square_df = add_covariate(square_df, -3, "percent_survivor")
        # stroke percent incidence by target group
        print_log_message("Adding percent incidence")
        square_df = add_covariate(square_df, -5, "percent_incidence")
    if "cancer" in reg_spec["name_short"]:
        # haqi
        print_log_message('Adding HAQI')
        square_df = add_covariate(square_df, 1099, "haqi", by_sex=False)
        # tobacco (cigarettes per capita)
        print_log_message('Adding cigarettes per capita')
        square_df = add_covariate(square_df, 19, "cigarettes_pc",
                                  by_sex=False)
        print_log_message('Adding SDI')
        square_df = add_covariate(square_df, 881, "sdi", by_sex=False)
    return square_df
def map_extract_type_id(df, source, extract_type, conn_def='ADDRESS'):
    """Attach a non-null 'extract_type_id' column to df, one value per NID.

    For each NID an extract-type string is chosen (forced for China DSP,
    induced from the data when ``extract_type`` is None, otherwise the passed
    value), then resolved to an id with pull_or_create_extract_type. After the
    base mapping, a series of source/NID-specific overrides corrects known
    special cases (split-iso3 NIDs, Matlab, maternal sources, etc.).

    Arguments:
        df, pandas.DataFrame: needs 'nid', 'location_id', 'code_system_id'
            and 'data_type_id' columns
        source, str: source name; drives the special-case overrides below
        extract_type, str or None: extract type to use for every NID; if
            None it is induced per-NID from the data
        conn_def, str: database connection definition passed to helpers

    Returns:
        df with 'extract_type_id' filled for every row (asserted non-null).
    """
    nid_extract_types = {}
    start_extract_type = extract_type
    for nid in df.nid.unique():
        if source == "China_DSP_prov_ICD10":
            # DSP uses the same locations under two extraction types; force
            # the sites-only type for this source
            extract_type = "admin1: DSP sites only"
        elif start_extract_type is None:
            # no extract type supplied - induce it from this NID's rows
            extract_type = induce_extraction_type(df.loc[df['nid'] == nid],
                                                  conn_def=conn_def)
        else:
            extract_type = start_extract_type
        extract_type = str(extract_type).strip()
        # Determine (or create) the extract_type_id
        extract_type_id = pull_or_create_extract_type(extract_type,
                                                      conn_def=conn_def)
        assert extract_type_id is not None, "Weird fail"
        nid_extract_types[nid] = extract_type_id
    df['extract_type_id'] = df['nid'].map(nid_extract_types)
    report_if_merge_fail(df, 'extract_type_id', 'nid')

    # NIDs whose data spans multiple iso3s get one extract type per iso3
    split_iso_nids = list(
        pd.read_csv(CONF.get_resource('split_iso_nids'))['nid'].unique())
    if (df['nid'].isin(split_iso_nids).any()):
        nid_loc_df = df.loc[df['nid'].isin(split_iso_nids),
                            ['nid', 'location_id']].drop_duplicates()
        nid_loc_df = add_location_metadata(
            nid_loc_df, 'ihme_loc_id',
            location_set_version_id=CONF.get_id('location_set_version'))
        nid_loc_df['iso3'] = nid_loc_df['ihme_loc_id'].str.slice(0, 3)
        nid_loc_df['extract_type'] = nid_loc_df.apply(
            lambda x: "{nid}: {iso3} data".format(nid=x['nid'],
                                                  iso3=x['iso3']),
            axis=1)
        extract_types = nid_loc_df[['extract_type']].drop_duplicates()
        extract_types['extract_type_id_new'] = \
            extract_types['extract_type'].apply(
                lambda x: pull_or_create_extract_type(x, conn_def=conn_def))
        nid_loc_df = nid_loc_df.merge(extract_types, on='extract_type',
                                      how='left')
        report_if_merge_fail(nid_loc_df, 'extract_type_id_new',
                             'extract_type')
        df = df.merge(nid_loc_df, on=['nid', 'location_id'], how='left')
        report_if_merge_fail(df.loc[df['nid'].isin(split_iso_nids)],
                             'extract_type_id_new', ['nid', 'location_id'])
        df.loc[df['nid'].isin(split_iso_nids),
               'extract_type_id'] = df['extract_type_id_new']
        df = df.drop(
            ['extract_type_id_new', 'iso3', 'ihme_loc_id', 'extract_type'],
            axis=1)

    # distinguish between Matlab sources since all are NID 935
    # these etids have already been inserted into the claude_extract_type
    matlab_source_to_etid = {
        'Matlab_1963_1981': 380,
        'Matlab_1982_1986': 383,
        'Matlab_1987_2002': 371,
        'Matlab_2003_2006': 374,
        'Matlab_2007_2012': 377,
    }
    if source == "VA_lit_GBD_2010_2013":
        # nids need extract types by code system
        # these have already been inserted to extract type table
        code_system_to_extract_type_id = {
            244: 350,  # DRC nationwide 2004
            273: 347,  # DRC EastWest 2002
            294: 353,  # Halder (30)
            296: 356,  # Moran (26)
        }
        df.loc[df['nid'].isin([93570, 93664]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)
    # police data fixes
    elif source == "Various_RTI":
        code_system_to_extract_type_id = {
            237: 719,  # Economic Commission for Europe
            241: 722,  # Turkey Traffic Accidents
        }
        df.loc[df['nid'].isin([93599]),
               'extract_type_id'] = df['code_system_id'].map(
                   code_system_to_extract_type_id)
    elif source in matlab_source_to_etid:
        df.loc[df['nid'] == 935,
               'extract_type_id'] = matlab_source_to_etid[source]
    # fixes for "maternal" data
    elif source == "Combined_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 479
    elif source == "Maternal_Census":
        df.loc[df['nid'] == 7942, 'extract_type_id'] = 482
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 485
    elif source == "Other_Maternal":
        df.loc[df['nid'] == 10319, 'extract_type_id'] = 488
    # these two Pakistan_DHS sources have the same NID
    elif source == "Pakistan_maternal_DHS_2006":
        df["extract_type_id"] = 539
    elif source == "Pakistan_child_DHS_2006":
        df["extract_type_id"] = 536
    # age-specific maps in nid 413394 - need different extracts
    # for different code systems
    elif source == 'VA_lit_GBD_2019':
        code_system_to_extract_type_id = {711: 1745, 712: 1748}
        df.loc[df['nid'] == 413394, 'extract_type_id'] = \
            df['code_system_id'].map(code_system_to_extract_type_id)

    assert df.extract_type_id.notnull().all()

    if (df['data_type_id'].isin([5, 6, 7]).any()):
        # these have already been inserted to extract type table
        csetid = {
            49: 440, 50: 443, 52: 386, 162: 446, 163: 449, 155: 455,
            148: 458, 149: 461, 150: 464, 156: 467, 577: 470
        }
        df.loc[df['nid'].isin([24134, 108611, 125702, 132803, 154012]),
               'extract_type_id'] = df['code_system_id'].map(csetid)
        data_type_dict = {5: 473, 7: 476}
        df.loc[df['code_system_id'] == 154,
               'extract_type_id'] = df['data_type_id'].map(data_type_dict)
    return df
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS',
                        is_active=False, refresh_cache=True, check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
    column in the dataset.

    Needs the following information from either the df values or from the
    nid_meta_vals dict:
        data_type_id
        representative_id

    All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload any
    sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df is (should be the whole source and
            nothing but the source). Will break if there is no source in
            FILEPATH with this name, and you should pass the source without a
            leading underscore even if it is that way in J
        write, bool: whether to write the outputs
        extract_type, str: The manner in which the nid was extracted. If left
            as None, will be induced by the location_type_id of the
            location_id with the maximum level in the dataset. This should be
            over-ridden in cases like China DSP, where the same locations are
            used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and "admin1: DSP sites
            only" (in the particular instance of DSP, extract type is built
            into this code. Feel free to add other source-extract type
            mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because
            sometimes raw data reports overlapping age groups (e.g. Palestine
            data has Gaza Strip and West Bank data with different age
            groupings).

    Returns:
        Every local value to the function
        Why? There are multiple df outputs, and formatting is a very engaged
        process so its helpful to just see everything sometimes
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    # if code_id is already present we only verify it; otherwise the raw
    # 'cause' strings must be mapped to code ids per code system below
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(
            missing_cols)
    # SET FORMATTING TIMESTAMP
    # one timestamp stamps the nid metadata, location-year table, and the
    # phase output launch_set_id so the run is traceable as a unit
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))
    # ADD SOURCE
    df['source'] = source
    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        # map 'cause' strings to code_id, one code system at a time
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value', force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE
        # SYSTEM
        all_codes_q = """
            SELECT code_id FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, \
            "Found code ids in data that can't exist in code "\
            "systems {}: {}".format(code_system_ids, bad_codes)
    check_vr_raw_causes(df)
    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)
    # sorry for putting this here
    # drop these loc/years b/c env < deaths creating negative cc_code
    # maybe re run w/ another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]
    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)
    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################
    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))
    # GROUP IF NECESSARY
    # collapse rows that share all id columns so the id set is unique
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()
    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)
    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?
    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW
    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan
    if is_active is True:
        warnings.warn(
            """is_active is deprecated: use the update_nid_metadata_status
            function to change the status of finalized datasets""")
    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map, on=[
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id'
    ], how='left')
    # rows with no existing metadata (left-merge miss) are new nid/extracts
    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()
    if df_na.shape[0] > 0:
        print("""New rows for the following NID/extract_type_id will be added
            with is_active and is_mort_active = 0:\n {}""".format(df_na))
    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)
    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)
    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)
    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp
    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)
    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        # write to cod.source for new sources
        insert_source_id(source)
        # write one formatted-phase output file per nid/extract_type_id
        nid_extracts = input_df[['nid', 'extract_type_id'
                                 ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            # np.record values - cast to plain ints for downstream use
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                               launch_set_id)
        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()
    return locals()