def _prep_for_pe_correction(df, value_cols, pop_col, pop_df, pop_id_cols,
                            correct_garbage):
    if isinstance(value_cols, str):
        value_cols = [value_cols]

    # make sure the 'flagged' column is set
    assert_flagged_col(df)

    # id cols will be the original columns minus the value columns
    id_cols = list(
        set(df.columns) - set(value_cols) - set(['flagged']) - set([pop_col]))
    if correct_garbage:
        dup_cols = id_cols + ['flagged']
    else:
        dup_cols = id_cols

    # these should not be duplicated
    dups = df[df[dup_cols].duplicated()]
    if len(dups) > 0:
        raise AssertionError("Found duplicates in non-value columns "
                             "{c}: \n{df}".format(c=dup_cols, df=dups))

    # add population to the data
    if pop_col not in df.columns:
        if pop_df is None:
            raise AssertionError(
                "If pop_col ('{}') is not already in the dataframe, "
                "pop_df must not be None".format(pop_col))
        df = df.merge(pop_df, on=pop_id_cols, how='left')
        # every row should have a population
        report_if_merge_fail(df, pop_col, pop_id_cols)
    return df
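# Every function in this section calls report_if_merge_fail(df, check_col,
# merge_cols). The helper itself is not shown in this file; the sketch below
# is an assumption about what it plausibly does, inferred only from the call
# sites: after a left merge, check_col should be non-null on every row, and
# the failing merge-key values should surface in the error.
def report_if_merge_fail_sketch(df, check_col, merge_cols):
    """Raise if any row failed to pick up check_col in a left merge."""
    if isinstance(merge_cols, str):
        merge_cols = [merge_cols]
    merge_fail = df[df[check_col].isnull()]
    if len(merge_fail) > 0:
        raise AssertionError(
            "Merge failure: {n} rows have null '{c}' for these "
            "{m} values:\n{vals}".format(
                n=len(merge_fail), c=check_col, m=merge_cols,
                vals=merge_fail[merge_cols].drop_duplicates()))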
def map_code_id(df, code_map, remove_decimal=True, value_col='value'):
    if remove_decimal:
        # use regex=False so '.' is treated as a literal decimal point,
        # not the regex "match any character"
        df[value_col] = df[value_col].str.replace(".", "", regex=False)
        code_map[value_col] = code_map[value_col].str.replace(
            ".", "", regex=False)
    df[value_col] = df[value_col].str.strip()
    code_map[value_col] = code_map[value_col].str.strip()

    code_ids = code_map[['code_system_id', value_col,
                         'code_id']].drop_duplicates()
    # fix a known typo in the incoming data
    df.loc[df[value_col] == 'acause_digest_gastrititis',
           value_col] = 'acause_digest_gastritis'
    assert not code_ids[['code_system_id', value_col,
                         'code_id']].duplicated().values.any()

    df = pd.merge(df, code_ids, on=['code_system_id', value_col], how='left')
    for num_dig_retry in [4, 3]:
        if df.code_id.isnull().any():
            print("Trying mapping again at {} digits...".format(num_dig_retry))
            # retry the unmapped rows at fewer digits
            filled_mappings = remap_code_id(df, code_ids, num_dig_retry)
            df = df.loc[df['code_id'].notnull()]
            df = pd.concat([df, filled_mappings], ignore_index=True)
    report_if_merge_fail(df, 'code_id', ['code_system_id', value_col])
    return df
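# Why regex=False matters above: a minimal, standalone illustration of the
# pandas str.replace pitfall that map_code_id has to avoid. With regex=True,
# the pattern '.' matches every character and wipes the whole code.
import pandas as pd

codes = pd.Series(["A01.1", "B20.9"])
print(codes.str.replace(".", "", regex=False).tolist())  # ['A011', 'B209']
print(codes.str.replace(".", "", regex=True).tolist())   # ['', '']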
def check_age_groups(df):
    """Raise a series of assertion errors if standard checks for age groups
    are violated.

    Input: pandas DataFrame
    Returns: None, raises assertion errors

    Checks for:
        - age_group_ids not in shared.ages
        - overlapping age groups
        - lack of terminal age group (e.g. 95+) in VR
    """
    for nid_etid, nid_etid_df in df.groupby(['nid', 'extract_type_id']):
        nid_etid_df = add_age_metadata(
            nid_etid_df, ['age_group_years_start', 'age_group_years_end'])
        # age metadata is merged on age_group_id, so check that every row
        # picked up its age start/end
        report_if_merge_fail(nid_etid_df, 'age_group_years_start',
                             'age_group_id')
        report_if_merge_fail(nid_etid_df, 'age_group_years_end',
                             'age_group_id')
        nid_etid_df = nid_etid_df[[
            'age_group_id', 'age_group_years_start', 'age_group_years_end',
            'data_type_id'
        ]].drop_duplicates().sort_values(
            by=['age_group_years_start', 'age_group_years_end'])
        # NOTE: report_gaps_and_overlaps MUST come before
        # report_too_much_age_detail, which cannot handle age groups with
        # gaps or overlaps
        report_gaps_and_overlaps(nid_etid, nid_etid_df)
        report_no_terminal_ages(nid_etid, nid_etid_df)
        report_too_much_age_detail(nid_etid, nid_etid_df)
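# A minimal sketch of the gap/overlap check that report_gaps_and_overlaps
# presumably performs (the real helper is not shown here; this is an assumed
# implementation): once the groups are sorted by start, each group's start
# should equal the previous group's end.
import pandas as pd

def report_gaps_and_overlaps_sketch(key, age_df):
    age_df = age_df.sort_values(
        by=['age_group_years_start', 'age_group_years_end'])
    prev_end = age_df['age_group_years_end'].shift(1)
    start = age_df['age_group_years_start']
    bad = age_df[prev_end.notnull() & (start != prev_end)]
    if len(bad) > 0:
        raise AssertionError(
            "Gaps or overlaps in age groups for {}: \n{}".format(key, bad))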
def collapse_sites(df, **cache_options):
    """Collapse sites together and reassign site_id.

    This function exists to collapse sites in historical VA data. When new
    VA is added to our database, sites should be collapsed in formatting,
    not here.
    """
    # Get some properties up front so we can verify nothing changed
    start_cols = df.columns
    start_deaths = df.deaths.sum()

    # Get the existing site names
    df = add_site_metadata(df, 'site_name', **cache_options)
    report_if_merge_fail(df, check_col='site_name', merge_cols='site_id')
    old_sites = set(df.site_name.unique())

    # Assign new aggregated sites. We assume that within the same study,
    # location, and year, each site was surveyed for the same ages, sexes,
    # and causes, and that any missingness across age, sex, or cause comes
    # from a lack of deaths rather than a lack of observation.
    site_dem_cols = ['nid', 'extract_type_id', 'location_id', 'year_id']
    df['agg_site'] = df.groupby(site_dem_cols)['site_name'].transform(
        lambda x: ', '.join(x.drop_duplicates().sort_values()))

    # Collapse the old sites and replace with the aggregate sites
    df = df.groupby(
        df.columns.drop(['site_id', 'site_name', 'deaths']).tolist(),
        as_index=False)['deaths'].sum()
    df.rename({'agg_site': 'site_name'}, axis='columns', inplace=True)

    # This is pretty bad, should think of something better:
    # handle aggregated sites that are too long for the db (>250 characters)
    too_long = (df.site_name.str.len() > 250)
    bangladesh_study = ((df.nid == 243436) & (df.extract_type_id == 1))
    mozambique_study = ((df.nid == 93710) & (df.extract_type_id == 1))
    df.loc[bangladesh_study & too_long,
           'site_name'] = '51 upazilas in Bangladesh'
    df.loc[mozambique_study & too_long,
           'site_name'] = '8 areas in Sofala province'

    # Add new site_ids
    df = add_site_metadata(df, add_cols='site_id', merge_col='site_name',
                           **cache_options)
    report_if_merge_fail(df, check_col='site_id', merge_cols='site_name')

    # Report what changed (if anything)
    new_sites = set(df.site_name.unique())
    if new_sites != old_sites:
        print("Collapsed sites: \n{} \ninto sites: \n{}".format(
            old_sites, new_sites))
    else:
        print("Did not collapse any sites")

    df.drop("site_name", axis='columns', inplace=True)
    assert set(df.columns) == set(start_cols)
    assert np.isclose(df.deaths.sum(), start_deaths)
    return df
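# A small standalone demonstration (hypothetical data) of the
# groupby/transform pattern collapse_sites uses to build an aggregate site
# name shared by every row in a study-location-year group.
import pandas as pd

demo = pd.DataFrame({
    'nid': [1, 1, 1],
    'year_id': [2000, 2000, 2000],
    'site_name': ['Matlab', 'Abhoynagar', 'Matlab'],
    'deaths': [10, 5, 3],
})
demo['agg_site'] = demo.groupby(['nid', 'year_id'])['site_name'].transform(
    lambda x: ', '.join(x.drop_duplicates().sort_values()))
# every row in the group gets the same combined name
print(demo['agg_site'].unique())  # ['Abhoynagar, Matlab']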
def add_covariate(df, covariate_id, covariate_column_name, by_sex=True):
    assert 'location_id' not in df.columns, \
        "Unexpected df structure: has location_id"
    assert 'country' in df.columns, "Unexpected df structure: lacks country"

    merge_cols = ['country', 'year_id', 'sex_id']
    if not by_sex:
        merge_cols.remove('sex_id')

    # For covariates passed to me via flat files (i.e. .csvs), I manually
    # assign them negative covariate ids. Handle them as a separate case.
    if covariate_id < 0:
        cov_df, merge_cols = get_flat_covariate_estimates(covariate_id)
    else:
        cov_df = db_queries.get_covariate_estimates(covariate_id,
                                                    decomp_step="step1")
    cov_df = cov_df.rename(columns={
        'location_id': 'country',
        'mean_value': covariate_column_name
    })
    cov_df = cov_df[merge_cols + [covariate_column_name]]
    report_duplicates(cov_df, merge_cols)

    df = df.merge(cov_df, on=merge_cols, how='left')
    # As of 2/7/19, flat files go from 1990 to 2017, so missing 1980s years
    # would throw an error here. I deal with this in regression setup.
    if covariate_id > 0:
        report_if_merge_fail(df, covariate_column_name, merge_cols)
    return df
def calculate_env_coverage(self, df):
    """Calculate the percentage of the envelope covered."""
    # demographic variables
    dem_cols = self.geo_cols + ['age_group_id', 'sex_id']

    # prep envelope
    env_df = self.env_meta_df.loc[
        (self.env_meta_df['age_group_id'].isin(self.cod_ages)) &
        (self.env_meta_df['sex_id'].isin([1, 2]))]
    env_df = env_df[dem_cols + ['mean_env']]
    env_df['total_env'] = env_df.groupby(
        self.geo_cols)['mean_env'].transform('sum')

    # only keep demographics represented in the data
    df = df[self.source_cols + dem_cols]
    df = df.drop_duplicates()

    # merge on envelope df
    df = df.merge(env_df, on=dem_cols, how='left')
    report_if_merge_fail(df, 'mean_env', dem_cols)

    df['env_covered'] = df.groupby(
        self.source_cols + self.geo_cols)['mean_env'].transform('sum')
    assert not ((df['env_covered'] - df['total_env']) > .0001).any()
    df['pct_env_coverage'] = df['env_covered'] / df['total_env']

    # all VR should be 1
    df.loc[df['data_type_id'].isin([9, 10]), 'pct_env_coverage'] = 1

    df = df.drop(['env_covered', 'total_env'], axis=1)
    df = df[self.source_cols + self.geo_cols +
            ['pct_env_coverage']].drop_duplicates()
    assert not df.duplicated(self.source_cols + self.geo_cols).values.any()
    return df
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate subnationals to country level."""
    df = orig_df.copy()

    # merge on country level location_ids
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = get_country_level_location_id(
        df.location_id.unique(), location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate subnational locations to national level
    df = df[df['location_id'] != df['country_location_id']]
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to the original dataframe
    orig_df['loc_agg'] = 0
    df = pd.concat([df, orig_df], ignore_index=True)
    return df
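# A toy, self-contained illustration (hypothetical location ids) of the
# aggregate-then-append pattern above: subnational rows are re-labeled with
# their country id, summed, flagged with loc_agg=1, and stacked on top of
# the untouched originals.
import pandas as pd

orig = pd.DataFrame({'location_id': [101, 102],
                     'country_location_id': [100, 100],
                     'deaths': [3, 4]})
agg = orig.copy()
agg['location_id'] = agg['country_location_id']
agg = agg.groupby('location_id', as_index=False)['deaths'].sum()
agg['loc_agg'] = 1
orig = orig.drop('country_location_id', axis=1)
orig['loc_agg'] = 0
combined = pd.concat([agg, orig], ignore_index=True)
print(combined)  # one country row with deaths=7 plus the two subnational rows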
def format_for_nr(df, location_hierarchy):
    """Merge on needed location metadata."""
    locs = df[['location_id']].drop_duplicates()
    locs = add_location_metadata(
        locs, add_cols=["ihme_loc_id", "path_to_top_parent"],
        merge_col="location_id", location_meta_df=location_hierarchy
    )
    report_if_merge_fail(locs, 'path_to_top_parent', 'location_id')
    # path_to_top_parent is "global,super region,region,country,...";
    # the fourth element is the country
    locs['country_id'] = locs['path_to_top_parent'].str.split(",").apply(
        lambda x: int(x[3]))
    locs['subnat_id'] = locs['ihme_loc_id'].apply(
        lambda x: int(x.split("_")[1]) if "_" in x else 0)
    locs['iso3'] = locs['ihme_loc_id'].str.slice(0, 3)
    different_locations = locs['country_id'] != locs['location_id']
    locs.loc[different_locations, 'iso3'] = \
        locs['iso3'].apply(lambda x: x + "_subnat")
    locs = locs[['location_id', 'country_id', 'subnat_id', 'iso3']]
    df = df.merge(locs, on='location_id', how='left')
    report_if_merge_fail(df, 'country_id', 'location_id')

    # create indicator column for running a separate nr model
    # for national and subnational locations
    subnational_modeled_iso3s = CONF.get_id('subnational_modeled_iso3s')
    df['is_loc_agg'] = 0
    df.loc[df['iso3'].isin(subnational_modeled_iso3s), 'is_loc_agg'] = 1

    # remove 0 sample size rows
    df = df.loc[df['sample_size'] > 0]
    return df
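# A quick standalone check (invented ids) of the path_to_top_parent parsing
# used above: element 3 of the comma-separated path is the country, and the
# ihme_loc_id suffix after '_' is the subnational id.
path = "1,64,100,102,555"  # global, super region, region, country, subnational
country_id = int(path.split(",")[3])
print(country_id)  # 102

ihme_loc_id = "USA_555"
subnat_id = int(ihme_loc_id.split("_")[1]) if "_" in ihme_loc_id else 0
iso3 = ihme_loc_id[:3]
print(subnat_id, iso3)  # 555 USA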
def adjust_ukr(self, env_df, split_type):
    """Adjust sample size for Ukraine without Crimea and Sevastopol."""
    orig_id = self.split_ids[split_type][0]
    no_cs = self.split_ids[split_type][1]
    crimea = self.split_ids[split_type][2]
    sev = self.split_ids[split_type][3]

    env_df['prop_no_cs'] = env_df[no_cs] / env_df[orig_id]
    env_df['prop_crimea'] = env_df[crimea] / env_df[orig_id]
    env_df['prop_sev'] = env_df[sev] / env_df[orig_id]

    df = self.df.merge(env_df, on=['age_group_id', 'year_id', 'sex_id'],
                       how='left')
    report_if_merge_fail(df, 'prop_no_cs',
                         ['age_group_id', 'year_id', 'sex_id'])

    # resulting dataset will have Ukraine without Crimea and Sevastopol,
    # Crimea, and Sevastopol location_ids
    no_cs_df = df.copy()
    no_cs_df['sample_size'] = \
        no_cs_df['prop_no_cs'] * no_cs_df['sample_size']
    no_cs_df['location_id'] = no_cs

    crimea_df = df.copy()
    crimea_df['sample_size'] = \
        crimea_df['prop_crimea'] * crimea_df['sample_size']
    crimea_df['location_id'] = crimea

    sev_df = df.copy()
    sev_df['sample_size'] = sev_df['prop_sev'] * sev_df['sample_size']
    sev_df['location_id'] = sev

    df = pd.concat([no_cs_df, crimea_df, sev_df], ignore_index=True)
    return df
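# A minimal check (made-up numbers) of the proportional split above: when
# the three proportions are computed against the same parent envelope and
# the parts sum to the parent, the split sample sizes add back to the
# original.
parent_env = 100.0
parts = {'no_cs': 90.0, 'crimea': 8.0, 'sev': 2.0}
sample_size = 50.0
split = {k: sample_size * v / parent_env for k, v in parts.items()}
assert abs(sum(split.values()) - sample_size) < 1e-9
print(split)  # {'no_cs': 45.0, 'crimea': 4.0, 'sev': 1.0}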
def merge_acause_and_collapse(df, cause_map):
    """Add acause column and collapse before appending split groups."""
    cause_map = cause_map[['cause_id', 'value']].copy()
    cause_map = cause_map.rename(columns={'value': 'cause'})
    df = df.merge(cause_map, how='left', on='cause')
    report_if_merge_fail(df, 'cause_id', 'cause')

    # there is some confusion during this cause set version between our
    # db server and the engine room as to which is the right cause set
    # version; resolve that here
    if CONF.get_id('cause_set_version') == 229:
        # replace any urinary/gyne with genitourinary
        print_log_message("Fixing urinary/gyne causes for csvid 229.")
        urinary_id = 594
        gyne_id = 603
        genitourinary_id = 982
        df.loc[df['cause_id'].isin([gyne_id, urinary_id]),
               'cause_id'] = genitourinary_id

    df = df.drop(['cause', 'split_group'], axis=1)
    df = df.groupby([col for col in df.columns if col != 'freq'],
                    as_index=False).sum()
    return df
def merge_with_detail_map(df, detail_map):
    """Merge incoming data with the detail map."""
    assert 'detail_level' not in df.columns
    detail_map = detail_map[["code_id", "detail_level_id"]]
    df = df.merge(detail_map, on=['code_id'], how='left')
    report_if_merge_fail(df, 'detail_level_id', 'code_id')
    return df
def calculate_cc_code(df, env_meta_df, code_map):
    """Calculate total deaths denominator.

    Note: This step is usually done in formatting. Moving this calculation
    after age/sex splitting should return more accurate results for data
    that has a mix of known, detailed age groups and unknown ages.
    """
    df_cc = df.copy()

    # group by everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()

    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'], merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    # no cell should be null after the envelope merge
    assert df_cc.notnull().values.all()

    # append together
    df = pd.concat([df, df_cc], ignore_index=True)
    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())

    df = df.drop(['mean_env', 'value'], axis=1)
    return df
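# A tiny illustration (invented numbers) of the cc_code construction: the
# residual between the all-cause envelope and the sum of mapped deaths
# becomes its own row, so deaths across rows add back up to the envelope.
import pandas as pd

demo = pd.DataFrame({'location_id': [1, 1],
                     'cause_id': [500, 501],
                     'deaths': [30.0, 50.0]})
env = 100.0  # all-cause envelope for this demographic
cc = demo.groupby('location_id', as_index=False)['deaths'].sum()
cc['cause_id'] = 919  # cc_code
cc['deaths'] = env - cc['deaths']
out = pd.concat([demo, cc], ignore_index=True)
assert out['deaths'].sum() == env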
def get_computed_dataframe(self, df):
    orig_cols = list(df.columns)
    print_log_message("Starting")
    print_log_message("Reading data")
    agg_cause_ids = list(df['cause_id'].unique())

    print_log_message("Adding garbage envelope")
    garbage_deaths = self.get_redistribution_envelope(df, agg_cause_ids)
    if len(garbage_deaths) > 0:
        gbg_merge_cols = [
            'location_id', 'year_id', 'age_group_id', 'sex_id', 'site_id',
            'cause_id'
        ]
        df = df.merge(garbage_deaths, how='left', on=gbg_merge_cols)
        report_if_merge_fail(df, 'garbage_targeting_cause', gbg_merge_cols)
        report_duplicates(df, gbg_merge_cols)
    else:
        df['garbage_targeting_cause'] = 0

    print_log_message("Adding residual variance")
    residual_variance = self.get_residual_variance()
    resid_merge_cols = ['cause_id', 'age_group_id', 'sex_id']
    df = df.merge(residual_variance, how='left', on=resid_merge_cols)
    df[RD_VAR_COL] = df[RD_VAR_COL] ** 2
    assert df[RD_VAR_COL].notnull().any()

    if self.has_misdc:
        print_log_message(
            "Getting dismod variance for misdiagnosis-corrected causes")
        misdc_variance = self.get_misdiagnosiscorrection_variance()
        df_misdc = df[df['cause_id'].isin(MISDC_CAUSES)]
        df_misdc = df_misdc.merge(misdc_variance, on=MISDC_MERGE_COLS,
                                  how='left')
        df_misdc[RD_VAR_COL] = df_misdc[MISDC_VAR_COL]
        df_misdc = df_misdc.drop(MISDC_VAR_COL, axis=1)
        df = df.loc[~df['cause_id'].isin(MISDC_CAUSES)]
        df = pd.concat([df, df_misdc], ignore_index=True)

    df[RD_VAR_COL] = df[RD_VAR_COL].fillna(0)

    print_log_message("Measuring redistribution variance")
    df = df.apply(self.calculate_redistribution_variance_wrapper, axis=1)
    print_log_message("Done")

    self.diag_df = df.copy()
    draw_cols = ['draw_{}'.format(i) for i in range(N_DRAWS)]
    keep_cols = list(orig_cols) + list(draw_cols)
    df = df[keep_cols]
    return df
def map_site_id(df, site_col='site', conn_def='ADDRESS', upload=True):
    """Map site_id to the data given site_col.

    Will upload sites to cod.site if necessary.
    """
    # set cache options
    force_cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': "standard",
        'cache_results': True,
        'verbose': True
    }
    # named site_name in db
    df = df.rename(columns={site_col: 'site_name'})
    df['site_name'] = df['site_name'].fillna("")

    # get site names in db
    unique_sites = df[['site_name']].drop_duplicates()
    db_sites = get_sites(**force_cache_options)

    # lower-case site_name on both sides for the merge, since the mysql
    # site name is case-insensitive and this is how outliers are stored
    unique_sites['site_name_orig'] = unique_sites['site_name']
    unique_sites['site_name'] = \
        unique_sites['site_name'].str.strip().str.lower()
    db_sites['site_name'] = db_sites['site_name'].str.strip().str.lower()

    # merge onto ones in df
    unique_sites = unique_sites.merge(db_sites, how='left')
    unique_sites['site_name'] = unique_sites['site_name_orig']
    unique_sites = unique_sites.drop('site_name_orig', axis=1)

    # find missings
    upload_sites = unique_sites[unique_sites['site_id'].isnull()]
    upload_sites = upload_sites[['site_name']].drop_duplicates()
    if len(upload_sites) > 0:
        print("No site_id for sites {}".format(
            upload_sites.site_name.unique()))
        if upload:
            print("Uploading new sites...")
            # if any, upload them
            insert_names('site', upload_sites, conn_def=conn_def)
            # refresh db_sites
            db_sites = get_sites(**force_cache_options)
            unique_sites = unique_sites.drop('site_id', axis=1)
            unique_sites = unique_sites.merge(db_sites, how='left')
            report_if_merge_fail(unique_sites, 'site_id', 'site_name')
        else:
            print("Not uploading new sites, allowing merge to fail...")

    df = df.merge(unique_sites, on='site_name', how='left')
    if upload:
        report_if_merge_fail(unique_sites, 'site_id', 'site_name')
    return df
def assert_valid_mappings(self, df, code_system_id):
    """Test that the mapping worked.

    Runs a suite of assertions to make sure that mapping was successful.

    Args:
        df (DataFrame): with at least code_id and cause_id
    Returns:
        None
    Raises:
        AssertionError: Any condition fails
    """
    # add code value from cached code map
    print("Adding value")
    df = add_code_metadata(df, ['value'], code_system_id,
                           force_rerun=False, block_rerun=True,
                           cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'value', 'code_id')

    # get acause from cached cause hierarchy
    print("Adding acause")
    df = add_cause_metadata(df, ['acause'],
                            cause_set_version_id=self.cause_set_version_id,
                            force_rerun=False, block_rerun=True,
                            cache_dir=self.cache_dir)
    report_if_merge_fail(df, 'acause', 'cause_id')

    # Test that all causes starting with 'acause_' are mapped correctly.
    # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
    # 'acause__gc_X59' should be mapped to '_gc', etc.
    print("Checking implied acauses")
    check_df = df.loc[df['value'].str.startswith('acause_')].copy()
    check_df['implied_acause'] = \
        check_df['value'].str.replace('acause_', '', 1)
    check_df.loc[check_df['value'].str.contains("acause__gc"),
                 'implied_acause'] = "_gc"
    bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
    if len(bad_df) > 0:
        bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
        raise AssertionError(
            "These code values do not match their acause: "
            "\n{}".format(bad_stuff))

    print("Checking for bad values")
    # assert incorrect acauses are gone
    bad_acauses = [
        'acause_digest_gastrititis', 'acause_hiv_tb', 'acause_tb_drug'
    ]
    bad_values = df.loc[df['value'].isin(bad_acauses)].value.unique()
    if len(bad_values) > 0:
        raise AssertionError(
            "Found these bad code values in the data: {}".format(bad_values))
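# A standalone spot-check (plain Python) of the implied-acause rule used
# above: strip one leading 'acause_' prefix, except for garbage codes,
# which all imply '_gc'.
values = ['acause_cvd', 'acause_digest_gastritis', 'acause__gc_X59']
implied = ['_gc' if v.startswith('acause__gc') else v.replace('acause_', '', 1)
           for v in values]
print(implied)  # ['cvd', 'digest_gastritis', '_gc']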
def get_computed_dataframe(self, df):
    """Replace acauses with those in the bridge map."""
    # VA sources are the only ones where this may not work;
    # might need to split dataframe by data_type_id for bridge map
    df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
    has_verbal_autopsy = self.VA in df['data_type_id'].unique()
    df.drop(columns='data_type_id', inplace=True)

    if self.needs_bridging(has_verbal_autopsy):
        file_name = self.get_file_name(has_verbal_autopsy)
        map_df = pd.read_csv(self.bridge_map_path / file_name)
        map_df = map_df[['acause', 'bridge_code']]

        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df, ['acause'], merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow...
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

        # perform zz bridge code redistribution before other bridge mapping
        bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped,
                                                          map_df)
        bridge_mapped = bridge_mapped.merge(map_df, how='left', on='acause')
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)

        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped, ['cause_id'], merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')

        # output diagnostic dataframe
        self.diag_df = bridge_mapped

        # drop unnecessary columns
        bridge_mapped = self.clean_up(bridge_mapped)
        return bridge_mapped
    else:
        self.diag_df = df
        df = self.clean_up(df)
        return df
def extract_positive_excess(df, id_cols, cause_to_targets_map):
    """Extract positive excess and assign target causes."""
    df = df[id_cols + ['positive_excess']]
    df['target_cause_id'] = df['cause_id'].map(cause_to_targets_map)
    df.loc[df['cause_id'] == 606, 'target_cause_id'] = 294

    # every cause should have a target, even if it is not corrected
    report_if_merge_fail(df, 'target_cause_id', 'cause_id')
    df['cause_id'] = df['target_cause_id']
    df = df.groupby(id_cols, as_index=False)['positive_excess'].sum()
    return df
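# A compact illustration (made-up ids) of the move-to-target pattern above:
# positive excess is re-labeled with each cause's target and then collapsed,
# so the excess lands on the target cause.
import pandas as pd

demo = pd.DataFrame({'cause_id': [10, 11, 20],
                     'positive_excess': [1.0, 2.0, 3.0]})
cause_to_targets_map = {10: 99, 11: 99, 20: 20}  # cause 20 maps to itself
demo['cause_id'] = demo['cause_id'].map(cause_to_targets_map)
demo = demo.groupby('cause_id', as_index=False)['positive_excess'].sum()
print(demo)  # cause 99 gets 3.0, cause 20 keeps 3.0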
def add_geo_group_col(self, df):
    self.geo_groups = df[self.geo_cols].drop_duplicates().reset_index()
    self.geo_groups = self.geo_groups.rename(columns={'index': 'geo_group'})
    # ensure that index is unique
    assert len(set(self.geo_groups.index)) == len(self.geo_groups)
    df = df.merge(self.geo_groups, on=self.geo_cols, how='left')
    report_if_merge_fail(df, 'geo_group', self.geo_cols)
    df.drop(self.geo_cols, axis=1, inplace=True)
    return df
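# A quick standalone demonstration (toy columns) of the grouping trick in
# add_geo_group_col: drop_duplicates().reset_index() turns each distinct
# geography combination into a numeric group id that can be merged back on.
import pandas as pd

demo = pd.DataFrame({'iso3': ['USA', 'USA', 'KEN'], 'deaths': [1, 2, 3]})
groups = demo[['iso3']].drop_duplicates().reset_index()
groups = groups.rename(columns={'index': 'geo_group'})
demo = demo.merge(groups, on='iso3', how='left')
print(demo['geo_group'].tolist())  # [0, 0, 2]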
def add_rd_locations(df, lsvid):
    """Merge on location hierarchy specific to redistribution."""
    lhh = get_current_location_hierarchy(location_set_version_id=lsvid,
                                         force_rerun=False,
                                         block_rerun=True,
                                         cache_dir=CACHE_DIR)
    rd_lhh = get_redistribution_locations(lhh)
    df = pd.merge(df, rd_lhh, on='location_id', how='left')
    report_if_merge_fail(df, 'global', 'location_id')
    report_if_merge_fail(df, 'dev_status', 'location_id')
    return df
def add_reg_location_metadata(df, location_hierarchy):
    df = add_location_metadata(df, ['region_id', 'super_region_id'],
                               location_meta_df=location_hierarchy)
    report_if_merge_fail(df, 'region_id', 'location_id')
    df['region_id'] = df['region_id'].astype(int)
    report_if_merge_fail(df, 'super_region_id', 'location_id')
    df['super_region_id'] = df['super_region_id'].astype(int)
    df = df.rename(columns={
        'super_region_id': 'super_region',
        'region_id': 'region',
        'location_id': 'country'
    })
    df['global'] = 1
    return df
def make_codem_codviz_metrics(df, pop_df):
    """Use draws to calculate inputs for CODEm and CoDViz."""
    add_cols = [LOWER_RD_COL, UPPER_RD_COL, LOGIT_CF_VAR_COL,
                LOG_DEATHRATE_VAR_COL]
    for col in add_cols:
        df[col] = np.nan

    if N_DRAWS > 0:
        cf_draw_cols = RedistributionVarianceEstimator.cf_draw_cols
        df = add_population(df, pop_df=pop_df)
        report_if_merge_fail(
            df.query('age_group_id != 27'), 'population',
            ['age_group_id', 'location_id', 'year_id', 'sex_id']
        )
        # get variance for CODEm
        df = df.apply(
            RedistributionVarianceEstimator.calculate_codem_variances,
            cf_draw_cols=cf_draw_cols, axis=1
        )
        # get the upper and lower bounds for CoDViz
        df = df.apply(
            RedistributionVarianceEstimator.calculate_codviz_bounds, axis=1
        )
        # drop draw/diagnostic/pop columns
        df = df.drop(cf_draw_cols + ['population'], axis=1)
    else:
        df[LOWER_RD_COL], df[UPPER_RD_COL] = df['cf_final'], df['cf_final']
        df[LOGIT_CF_VAR_COL], df[LOG_DEATHRATE_VAR_COL] = 0, 0

    # make sure there aren't any null values in the added columns
    check_no_nulls = [
        LOWER_RD_COL, UPPER_RD_COL, MEAN_RD_COL, LOGIT_CF_VAR_COL,
        LOG_DEATHRATE_VAR_COL
    ]
    null_vals = df.loc[
        df[check_no_nulls].isnull().any(axis=1),
        MISDC_MERGE_COLS + check_no_nulls
    ]
    if len(null_vals) > 0:
        raise AssertionError(
            'there are null values in redistribution uncertainty '
            'columns: \n{}'.format(null_vals)
        )
    return df
def merge_nonzero_mad_info(self, cmdf):
    """Read in the floor input and merge onto main dataframe."""
    # load nonzero floor values
    nonzero_mad = self.compile_nonzero_floor(cmdf)
    nonzero_mad = self.format_nzf(nonzero_mad, cmdf)
    # check that all age_group/cancer/year/sex combinations exist
    self._check_all_floors_exist(nonzero_mad)
    nonzero_mad_cols = self.merge_cols + ['floor']
    nonzero_mad = nonzero_mad[nonzero_mad_cols]
    self.min_possible_val = nonzero_mad['floor'].min()
    self.df = self.df.merge(nonzero_mad, how='left', on=self.merge_cols)
    # ensure no floor values are missing
    assert not self.df.floor.isnull().any(), "null floor values exist"
    report_if_merge_fail(self.df, 'floor', self.merge_cols)
def prune_cancer_registry_data(df, location_meta_df):
    ukraine_nid_extract = (df['nid'] == 284465) & (df['extract_type_id'] == 53)
    assert (df[ukraine_nid_extract]['location_id'] == 63).all(), \
        "Ukraine data now has more than just Ukraine national, and this " \
        "code should be changed"
    df.loc[ukraine_nid_extract, 'location_id'] = 50559
    df = add_location_metadata(df, ['most_detailed'],
                               location_meta_df=location_meta_df)
    report_if_merge_fail(df, 'most_detailed', 'location_id')
    df = df.query('most_detailed == 1')
    df = df.drop('most_detailed', axis=1)
    return df
def add_in_out_hospital_proportions(self, df, props_df):
    df = df.merge(props_df, how='left')
    # in years where data is not disaggregated by in/out of hospital, no
    # hospital weighting will be done
    df.loc[df['hospdead'] == self.missing_hospdead_id,
           self.in_out_hosp_prop_name] = 1
    report_if_merge_fail(
        df, self.in_out_hosp_prop_name,
        ['location_id', 'age_group_id', 'sex_id', 'strata', 'hospdead']
    )
    return df
def get_computed_dataframe(self, df):
    """Replace acauses with those in the bridge map."""
    df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
    has_verbal_autopsy = self.VA in df['data_type_id'].unique()

    if self.needs_bridging(has_verbal_autopsy):
        sheet_name = self.get_sheet_name(has_verbal_autopsy)
        map_df = pd.read_excel(self.bridge_map_path, sheet_name=sheet_name)
        map_df = map_df[['acause', 'bridge_code']]

        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df, ['acause'], merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow...
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

        bridge_mapped = bridge_mapped.merge(map_df, how='left', on='acause')
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)

        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped, ['cause_id'], merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')

        # output diagnostic dataframe
        self.diag_df = bridge_mapped

        # drop unnecessary columns
        bridge_mapped = self.clean_up(bridge_mapped)
        return bridge_mapped
    else:
        self.diag_df = df
        df = self.clean_up(df)
        return df
def get_age_group_ids(df):
    # clean the age column
    df['who_age'] = df['who_age'].str.replace('Deaths', '')
    df['who_age'] = df['who_age'].astype(int)

    # can drop Deaths1 and Deaths2, they're just subtotals
    df = df.loc[~(df.who_age.isin([1, 2]))]
    start_len = len(df)

    # load codebooks that are used to map age_group_id to WHO ages
    adult_cb = get_adult_age_codebook()
    infant_cb = get_infant_age_codebook()
    adult_cb.rename(columns={'frmat': 'Frmat', 'cod_age': 'who_age'},
                    inplace=True)
    infant_cb.rename(columns={'im_frmat': 'IM_Frmat', 'cod_age': 'who_age'},
                     inplace=True)
    adult_cb = adult_cb[['Frmat', 'who_age', 'age_group_id']]
    infant_cb = infant_cb[['IM_Frmat', 'who_age', 'age_group_id']]

    # subset df to infants and adults
    infant_df = df.loc[df.who_age.isin([91, 92, 93, 94])]
    adult_df = df.loc[df.who_age.isin(range(3, 27))]

    # merge age_group_ids on with the respective codebooks
    infant_df = infant_df.merge(infant_cb, on=['IM_Frmat', 'who_age'],
                                how='left')
    adult_df = adult_df.merge(adult_cb, on=['Frmat', 'who_age'], how='left')

    # handle the unknown ages
    adult_df.loc[(adult_df.Frmat == 9) & (adult_df.who_age == 26),
                 'age_group_id'] = 283
    df = pd.concat([infant_df, adult_df], ignore_index=True)

    # make sure we didn't add/drop any rows in this process
    assert len(df) == start_len, "You added/dropped rows in age mapping"

    # adjust rows with zero deaths before checking the merge
    df = fix_rows_with_zero_deaths(df)
    report_if_merge_fail(df, 'age_group_id', ['Frmat', 'IM_Frmat', 'who_age'])
    assert df.age_group_id.notnull().all()
    return df
def get_computed_dataframe(self, df):
    """Return mapped dataframe."""
    # list of all cause columns
    raw_cause_cols = MCoDMapper.get_code_columns(df)
    df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

    print_log_message("Mapping underlying cause/primary diagnosis")
    cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                              **self.cache_options)
    code_map = MCoDMapper.prep_cause_map(cause_map)
    df['cause_mapped'] = df['cause'].map(code_map)

    print_log_message(
        "Trimming ICD codes and remapping underlying cause/primary diagnosis")
    df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                   self.code_system_id)
    report_if_merge_fail(df, 'cause_mapped', 'cause')

    # merge on the cause_id for the underlying cause
    df = df.rename(columns={'cause_mapped': 'code_id'})
    df['code_id'] = df['code_id'].astype(int)
    df = add_code_metadata(df, 'cause_id',
                           code_map_version_id=self.code_map_version_id,
                           **self.cache_options)
    report_if_merge_fail(df, 'cause_id', 'code_id')

    print_log_message("Mapping chain causes")
    # get the special intermediate cause map
    int_cause_map = self.prep_int_cause_map()
    df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

    print_log_message("Trimming ICD codes and remapping chain causes")
    int_cause_cols = [x for x in df.columns if self.int_cause in x]
    int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
        raw_cause_cols, int_cause_cols)
    df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                   self.code_system_id)

    print_log_message("Identifying rows with intermediate cause of interest")
    df = self.capture_int_cause(df, int_cause_cols)
    if not self.drop_p2:
        df = self.set_part2_flag(df)
    return df
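# A minimal sketch of the trim-and-remap idea used by map_code_id and
# MCoDMapper.trim_and_remap (assumed behavior, not the real implementation):
# codes that fail to map at full detail are retried at 4, then 3, characters.
def trim_and_remap_sketch(code, code_map):
    mapped = code_map.get(code)
    for n_digits in [4, 3]:
        if mapped is None:
            mapped = code_map.get(code[:n_digits])
    return mapped

code_map = {'A01': 1, 'A011': 2}
print(trim_and_remap_sketch('A0119', code_map))  # falls back to 'A011' -> 2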