def get_korean_war_locations():
    """Build the list of location_ids involved in the Korean War."""
    locs = get_current_location_hierarchy()

    # side A: subnationals of China and the Russian Federation
    # (note: columns must be a list, not a set, so column order is defined)
    side_a = pd.DataFrame(data=["China", "Russian Federation"],
                          columns=["location_name"])
    side_a = pd.merge(side_a, locs[['location_name', 'location_id']],
                      how='left')
    side_a = list(locs[locs['parent_id'].isin(
        side_a['location_id'])]['location_id'])

    # side B: subnationals of the United States
    side_b_us = pd.DataFrame(data=['United States'],
                             columns=["location_name"])
    side_b_us = pd.merge(side_b_us, locs[['location_name', 'location_id']],
                         how='left')
    side_b_us = list(locs[locs['parent_id'].isin(
        side_b_us['location_id'])]['location_id'])

    # side B: United Kingdom, descending three levels of the hierarchy
    side_b_uk = pd.DataFrame(data=['United Kingdom'],
                             columns=["location_name"])
    side_b_uk = pd.merge(side_b_uk, locs[['location_name', 'location_id']],
                         how='left')
    side_b_uk = list(locs[locs['parent_id'].isin(
        side_b_uk['location_id'])]['location_id'])
    side_b_uk = list(locs[locs['parent_id'].isin(side_b_uk)]['location_id'])
    side_b_uk = list(locs[locs['parent_id'].isin(side_b_uk)]['location_id'])

    location_id = [
        16, 76, 18, 179, 125, 82, 80, 89, 101, 71, 155, 44850, 44851
    ] + side_a + side_b_uk + side_b_us
    return location_id
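
# Illustrative sketch (not part of the pipeline, toy data only): how the
# parent_id lookup above expands a country into its subnational children.
# Real location_ids come from get_current_location_hierarchy().
import pandas as pd

toy_locs = pd.DataFrame({
    'location_id':   [6, 491, 492, 102, 523],
    'location_name': ['China', 'Anhui', 'Beijing', 'United States',
                      'Alabama'],
    'parent_id':     [1, 6, 6, 1, 102],
})
# one level of descent: children whose parent_id is China's location_id
china_id = toy_locs.loc[toy_locs['location_name'] == 'China', 'location_id']
children = list(toy_locs[toy_locs['parent_id'].isin(china_id)]['location_id'])
print(children)  # [491, 492]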
def aggregate_to_country_level(orig_df, location_set_version_id):
    """Aggregate subnationals to country level."""
    df = orig_df.copy()

    # merge on country level location_ids
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id)
    country_location_ids = get_country_level_location_id(
        df.location_id.unique(), location_meta_df)
    df = df.merge(country_location_ids, how='left', on='location_id')
    report_if_merge_fail(df, 'country_location_id', ['location_id'])

    # aggregate subnational locations to national level
    df = df[df['location_id'] != df['country_location_id']]
    df['location_id'] = df['country_location_id']
    df = df.drop(['country_location_id'], axis=1)
    group_cols = [col for col in df.columns if col not in VAL_COLS]
    df = df.groupby(group_cols, as_index=False)[VAL_COLS].sum()
    df['loc_agg'] = 1

    # append aggregates to the original dataframe
    # (DataFrame.append was removed in pandas 2.0; pd.concat is equivalent)
    orig_df['loc_agg'] = 0
    df = pd.concat([df, orig_df])
    return df
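
# Illustrative sketch (hypothetical data): the core of the aggregation above
# is relabel-then-groupby. Subnational rows get their country's location_id,
# then value columns are summed within the remaining identifier columns.
import pandas as pd

VAL_COLS_DEMO = ['deaths']
toy = pd.DataFrame({
    'location_id':         [491, 492],   # two subnational units
    'country_location_id': [6, 6],       # both roll up to one country
    'year_id':             [2010, 2010],
    'deaths':              [10.0, 5.0],
})
toy['location_id'] = toy['country_location_id']
toy = toy.drop('country_location_id', axis=1)
group_cols = [c for c in toy.columns if c not in VAL_COLS_DEMO]
print(toy.groupby(group_cols, as_index=False)[VAL_COLS_DEMO].sum())
#    location_id  year_id  deaths
# 0            6     2010    15.0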
def format_source(release_date):
    # read the raw data and the WHO provided country/year map
    df = read_data(release_date)
    country_map = get_country_map(release_date)

    # subset to just the new loc/years
    # also apply location/year restrictions
    df = subset_location_years(df, country_map)

    # map location information
    loc_meta = get_current_location_hierarchy(
        location_set_id=CONF.get_id('location_set'),
        location_set_version_id=CONF.get_id('location_set_version'),
        force_rerun=False, block_rerun=True)
    df = get_gbd_locations(df, country_map, loc_meta)

    # replicate age adjustments for WHO data
    df = adjust_WHO_ages(df)

    # limit the dataframe to the columns needed and melt ages wide to long
    df = melt_df(df)

    # assign age group ids
    df = get_age_group_ids(df)

    # map code ids and apply special remaps
    cause_map = get_cause_map(1, force_rerun=False, block_rerun=True)
    df = map_code_id(df, cause_map)

    # add manual cols and clean up
    df = cleanup(df)

    # apply nids
    df = map_nids(df, release_date)

    # apply any final special adjustments
    df = apply_special_adjustments(df)

    # final grouping and finalize formatting
    df = df[FINAL_FORMATTED_COLS]
    assert df.notnull().values.all()
    df = df.groupby(ID_COLS, as_index=False)[VALUE_COL].sum()

    # run finalize formatting
    locals_present = finalize_formatting(df, SYSTEM_SOURCE, write=WRITE)
    nid_meta_df = locals_present['nid_meta_df']

    # update nid metadata status
    if WRITE:
        nid_extracts = nid_meta_df[[
            'nid', 'extract_type_id'
        ]].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            update_nid_metadata_status(nid, extract_type_id,
                                       is_active=IS_ACTIVE,
                                       is_mort_active=IS_MORT_ACTIVE)
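
# Illustrative sketch (hypothetical columns): melt_df above reshapes age
# columns from wide to long. A minimal equivalent with pd.melt:
import pandas as pd

wide = pd.DataFrame({
    'location_id': [6], 'year_id': [2010], 'cause': ['A00'],
    'deaths_0_4': [3.0], 'deaths_5_9': [1.0],
})
long = pd.melt(wide,
               id_vars=['location_id', 'year_id', 'cause'],
               var_name='age_group', value_name='deaths')
print(long[['age_group', 'deaths']])
#     age_group  deaths
# 0  deaths_0_4     3.0
# 1  deaths_5_9     1.0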
def main(model_group, location_set_version_id, cause_set_version_id,
         launch_set_id):
    print_log_message(
        "Beginning NR modeling for model_group {}".format(model_group))
    cache_dir = CONF.get_directory('db_cache')
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    print_log_message("Preparing location hierarchy")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **read_file_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **read_file_cache_options)
    age_meta_df = get_ages(**read_file_cache_options)

    print_log_message("Preparing model data")
    model_df = get_model_data(model_group, location_hierarchy,
                              location_set_version_id, cause_meta_df)
    print_log_message("Got {} rows of model data".format(len(model_df)))
    if len(model_df) == 0:
        print_log_message("Exiting...")
        return
    model_df = model_df.drop('deaths', axis=1)
    code_system_cause_dict = get_code_system_cause_ids(model_df)

    if model_group.startswith("VR") or model_group.startswith("Cancer"):
        print_log_message("Bringing back zeros (squaring) so noise reduction "
                          "knows to depress time series")
        squarer = Squarer(cause_meta_df, age_meta_df)
        model_df = squarer.get_computed_dataframe(model_df)
    elif "HH_SURVEYS" in model_group:
        model_df = square_dhs_data(model_df, cause_meta_df, age_meta_df,
                                   location_hierarchy)
    print_log_message(log_statistic(model_df))

    print_log_message("Restricting model data to only existing cause_ids")
    model_df = restrict_to_cause_ids(code_system_cause_dict, model_df)

    print_log_message("Adding NR location info")
    model_df = format_for_nr(model_df, location_hierarchy)

    if model_group_is_run_by_cause(model_group):
        run_phase_by_cause(model_df, model_group, launch_set_id)
    else:
        run_phase_by_model_group(model_df, model_group, launch_set_id)
    print_log_message("Job complete. Exiting...")
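
# Illustrative sketch (hypothetical stand-in for Squarer): "squaring" means
# expanding the data onto the full grid of identifiers so that missing
# observations become explicit zeros, which noise reduction can then use to
# depress a time series. A minimal one-dimensional version of the idea:
import pandas as pd

obs = pd.DataFrame({'year_id': [2000, 2002], 'deaths': [4.0, 1.0]})
full_grid = pd.DataFrame({'year_id': [2000, 2001, 2002]})
square = full_grid.merge(obs, how='left').fillna({'deaths': 0})
print(square)
#    year_id  deaths
# 0     2000     4.0
# 1     2001     0.0
# 2     2002     1.0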
def add_rd_locations(df, lsvid):
    """Merge on location hierarchy specific to redistribution."""
    lhh = get_current_location_hierarchy(location_set_version_id=lsvid,
                                         force_rerun=False,
                                         block_rerun=True,
                                         cache_dir=CACHE_DIR)
    rd_lhh = get_redistribution_locations(lhh)
    df = pd.merge(df, rd_lhh, on='location_id', how='left')
    report_if_merge_fail(df, 'global', 'location_id')
    report_if_merge_fail(df, 'dev_status', 'location_id')
    return df
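
# Illustrative sketch (hypothetical helper, assuming the semantics implied
# by its call sites): report_if_merge_fail raises when a left merge left
# rows without a match, i.e. the check column is null. A minimal version:
import pandas as pd

def report_if_merge_fail_demo(df, check_col, merge_cols):
    """Raise if any row failed to pick up check_col during a merge."""
    failed = df[df[check_col].isnull()]
    if len(failed) > 0:
        raise AssertionError(
            "Merge failed for {} rows on {}:\n{}".format(
                len(failed), merge_cols,
                failed[merge_cols].drop_duplicates()))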
def __init__(self):
    self.cg = Configurator('standard')
    self.cache_dir = self.cg.get_directory('db_cache')
    # set test to True to skip writing any output files
    self.test = False
    self.cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': self.cache_dir
    }
    self.dataset_filters = {
        'data_type_id': [8, 9, 10, 12],
        'location_set_id': 35,
        'is_active': True,
        'year_id': range(1980, 2050)
    }
    self.national_nids = self.cg.get_resource("nid_replacements")

    # resources
    self.completeness = self.cg.get_resource("completeness")
    self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                               **self.cache_options)
    self.location_meta_df = get_current_location_hierarchy(
        location_set_version_id=self.cg.get_id('location_set_version'),
        **self.cache_options)
    self.cod_ages = list(
        get_cod_ages(**self.cache_options)['age_group_id'].unique())

    # identifiers
    self.source_cols = ["source", "nid", "data_type_id"]
    self.geo_cols = ["location_id", "year_id"]
    self.meta_cols = ["nationally_representative", "detail_level_id"]
    self.value_cols = ['deaths']
    self.year_end = self.cg.get_id('year_end')
    self.full_time_series = "full_time_series"

    # directories
    self.current_best_version = "2018_04_03_151739"
    self.out_dir = "FILEPATH"
    self.arch_dir = "{}/_archive".format(self.out_dir)
    self.timestamp = cod_timestamp()
def run_proportions_prep(shared_package_id, outdir, vr_pull_timestamp,
                         data_id, test=False):
    location_set_version = CONF.get_id('location_set_version')
    location_hierarchy = get_current_location_hierarchy(
        # location_set_version_id=location_set_version,
        gbd_round_id=5  # FIXME: change this when covars have all loc values!
    )
    reg_spec = get_regression_specification(shared_package_id)
    input_data_path = "FILEPATH".format(outdir, data_id)

    print_log_message("Running input data prep")
    df = pull_vr_data_for_rdp_reg(
        reg_spec, location_hierarchy,
        vr_pull_timestamp=vr_pull_timestamp,
        data_id=data_id,
        small_test=test
    )

    print_log_message("Formatting regression input")
    df = format_reg_data_for_modeling(df, reg_spec, location_hierarchy)
    # df = df.rename(columns={
    #     'prop_pkgtarg_target': 'cf_target',
    #     'prop_pkgtarg_garbage': 'cf_garbage'
    # })

    print_log_message("Writing regression input")
    df.to_csv(input_data_path, index=False)

    # make square df
    print_log_message("Writing square dataset")
    square_df = prepare_square_df(df, location_hierarchy, reg_spec)
    square_df = add_model_group(square_df, with_age=False)
    square_df.to_csv("FILEPATH".format(outdir, data_id), index=False)
    print_log_message("Done")
def run_pipeline(nid, extract_type_id, launch_set_id, df, code_system_id,
                 cause_set_version_id, location_set_version_id, pop_run_id,
                 env_run_id, distribution_set_version_id, diagnostic=False):
    """Run the full pipeline."""
    cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_results': False,
        'cache_dir': CONF.get_directory('FILEPATH'),
        'verbose': False
    }

    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_options)
    code_map = get_cause_map(code_system_id=code_system_id, **cache_options)
    source = get_value_from_nid(nid, "source", extract_type_id)

    print("Overriding causes when necessary")
    df = overrides(df, location_meta_df)

    print("Dropping data out of scope")
    df = drop_data_out_of_scope(df, location_meta_df, source)
    if len(df) > 0:
        # make sure six minor territories are grouped correctly
        assert_no_six_minor_territories(df)

        # run mapping
        print("\nDeaths before MAPPING: {}".format(df.deaths.sum()))
        Mapper = GBDCauseMapper(cause_set_version_id, code_map)
        df = Mapper.get_computed_dataframe(df, code_system_id)
        if diagnostic:
            write_phase_output(df, 'mapping', nid, extract_type_id,
                               launch_set_id, sub_dirs='diagnostic')

        print("\nDeaths before AGESEXSPLIT: {}".format(df.deaths.sum()))
        # run age sex splitting
        MySplitter = AgeSexSplitter(cause_set_version_id, pop_run_id,
                                    distribution_set_version_id,
                                    verbose=True, collect_diagnostics=False)
        df = MySplitter.get_computed_dataframe(df, location_meta_df)
        if diagnostic:
            diag_df = MySplitter.get_diagnostic_dataframe()
            write_phase_output(diag_df, 'agesexsplit', nid, extract_type_id,
                               launch_set_id, sub_dirs='diagnostic')

        print("\nDeaths before CORRECTIONS: {}".format(df.deaths.sum()))
        # run restrictions corrections
        Corrector = RestrictionsCorrector(code_system_id,
                                          cause_set_version_id,
                                          collect_diagnostics=False,
                                          verbose=True)
        df = Corrector.get_computed_dataframe(df)

        # calculate cc_code for some sources
        if source in ['Iran_maternal_surveillance', 'Iran_forensic']:
            env_meta_df = get_env(env_run_id=env_run_id, **cache_options)
            df = calculate_cc_code(df, env_meta_df, code_map)
            print("\nDeaths after adding cc_code: {}".format(
                df.deaths.sum()))

        # adjust deaths for New Zealand by maori/non-maori ethnicities
        if source in ["NZL_MOH_ICD9", "NZL_MOH_ICD10"]:
            df = correct_maori_non_maori_deaths(df)
            print("\nDeaths after Maori/non-Maori adjustment: {}".format(
                df.deaths.sum()))

        print("\nDeaths at END: {}".format(df.deaths.sum()))
    return df
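
# Illustrative sketch (hypothetical helper, not part of the pipeline): the
# function above prints the death total by hand before each phase. The same
# bookkeeping could be wrapped once, keeping the phase code free of logging:
def log_deaths_then_apply(step_name, func, df, *args, **kwargs):
    """Print the current deaths total, then apply one pipeline step."""
    print("\nDeaths before {}: {}".format(step_name, df.deaths.sum()))
    return func(df, *args, **kwargs)

# usage sketch:
# df = log_deaths_then_apply('MAPPING', Mapper.get_computed_dataframe,
#                            df, code_system_id)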
def get_locations():
    """Fetch the best location hierarchy in the location set."""
    locations = get_current_location_hierarchy(
        location_set_version_id=CONF.get_id("location_set_version"))
    return locations
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df, age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum,
                   gca=after_gc_sum, ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map, **read_file_cache_options)
    assert (df['code_system_id'] == code_system_id).all(), \
        "Variable code system id {} did not agree with all values of df " \
        "code system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")

    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df
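
# Illustrative note: the sanity check above tolerates a drift of 2% of the
# original total or 5 deaths, whichever is greater, so tiny datasets are not
# failed over rounding noise. Worked values:
for orig, after in [(100, 104), (100, 110), (10000, 9850)]:
    threshold = max(.02 * orig, 5)
    print(orig, after, threshold, abs(orig - after) < threshold)
# 100 104 5.0 True      (4 < 5: within the absolute floor)
# 100 110 5.0 False     (10 >= 5)
# 10000 9850 200.0 True (150 < 2% of 10000)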
def run_phase(df, csvid, nid, extract_type_id, lsvid, pop_run_id, cmvid,
              launch_set_id, remove_decimal, write_diagnostics=True):
    """String together processes for redistribution."""
    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }
    # the iso3 of this data
    iso3 = get_value_from_nid(nid, 'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)
    # the code system id
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))
    # the data type
    data_type_id = get_value_from_nid(nid, 'data_type_id',
                                      extract_type_id=extract_type_id)
    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        cause_meta_df = get_current_cause_hierarchy(
            cause_set_version_id=csvid, **read_file_cache_options)
        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **read_file_cache_options)
        # move garbage to hiv first
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df, age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum, to=orig_deaths_sum,
                   gca=after_gc_sum, ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map, **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two
    # places, make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), \
        "Variable code system id {} did not agree with all values of df " \
        "code system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups
    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)
    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1
    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # this seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")

    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)
    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if change is 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)
    return df
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    cache_dir = CONF.get_directory('FILEPATH')
    orig_deaths = df['deaths'].sum()
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    code_system_id = get_value_from_nid(nid, 'code_system_id',
                                        extract_type_id=extract_type_id)
    # this queries the database, maybe should be passed in directly
    code_system = get_code_system_from_id(code_system_id)
    source = get_value_from_nid(nid, 'source',
                                extract_type_id=extract_type_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    if not skip_hiv_correction(source) and is_vr:
        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)
        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **standard_cache_options)
        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)
        # get the country
        iso3 = get_value_from_nid(
            nid, 'iso3', extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df, age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    df = combine_with_rd_raw(df, nid, extract_type_id,
                             location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()
    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))
    return df
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    iso3 = get_value_from_nid(
        nid, 'iso3', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    # rake if appropriate based on this logic
    if ((data_type_id in [8, 9, 10] and (source != 'Other_Maternal'))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(
            nid, extract_type_id, block_rerun=True,
            force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            if model_group == 'MATERNAL-HH_SURVEYS-IND':
                print_log_message("Raking sub national estimates, applying "
                                  "double raking for India Maternal")
                raker = Raker(df, source, double=True)
                df = raker.get_computed_dataframe(location_hierarchy)
            else:
                print_log_message("Raking sub national estimates")
                raker = Raker(df, source)
                df = raker.get_computed_dataframe(location_hierarchy)

    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    # data with zero sample size is almost certainly some anomalous result
    # of a program generating data it shouldn't have, and it shouldn't be
    # included in codem models. Was probably already dropped, anyway, before
    # running noise reduction.
    df = df.query('sample_size != 0')
    # uploading data before 1980 is a waste of space because neither codem
    # nor codviz use it
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    # this actually drops data from the dataframe if it violates age/sex
    # restrictions (e.g. male maternity disorders)
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    # use draws to make metrics for uncertainty to be used by CODEm and
    # CoDViz; also creates cf_final from the mean of the draws
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    # we did this in the old code -- no cfs over 1 nor below 0
    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]
    return df
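
# Illustrative note: the loop above clamps each cause-fraction column to
# [0, 1]. pandas' Series.clip expresses the same operation in one call:
#     df[cf_col] = df[cf_col].clip(lower=0, upper=1)
# A quick check of the equivalence on toy values:
import pandas as pd

s = pd.Series([-0.1, 0.5, 1.3])
print(list(s.clip(lower=0, upper=1)))  # [0.0, 0.5, 1.0]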
def run_phase(df, nid, extract_type_id, env_run_id, pop_run_id,
              location_set_version_id, cause_set_version_id):
    cache_dir = CONF.get_directory('db_cache')
    source = get_value_from_nid(
        nid, 'source', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }

    # ************************************************************
    # Get cached metadata
    # ************************************************************
    print_log_message("Getting cached db resources")
    location_hierarchy = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **standard_cache_options)
    pop_df = get_pop(pop_run_id=pop_run_id, **standard_cache_options)
    env_df = get_env(env_run_id=env_run_id, **standard_cache_options)
    age_weight_df = get_age_weights(**standard_cache_options)
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)
    age_meta_df = get_ages(**standard_cache_options)

    # ************************************************************
    # RAKING
    # ************************************************************
    if ((data_type_id in [8, 9, 10] and (source != "Other_Maternal"))
            or source in MATERNAL_NR_SOURCES):
        if source not in NOT_RAKED_SOURCES:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)
    # for the Other_Maternal source we only rake household surveys
    elif source == "Other_Maternal":
        model_groups = get_datasets(
            nid, extract_type_id, block_rerun=True,
            force_rerun=False).model_group.unique()
        assert len(model_groups) == 1
        model_group = model_groups[0]
        if "HH_SURVEYS" in model_group:
            print_log_message("Raking sub national estimates")
            raker = Raker(df, source)
            df = raker.get_computed_dataframe(location_hierarchy)

    # ************************************************************
    # DROP ZERO SAMPLE SIZE AND RESTRICTED AGE/SEX DATA
    # ************************************************************
    df = df.query('sample_size != 0')
    df = df.loc[df['year_id'] >= 1980]
    print_log_message("Enforcing age sex restrictions")
    df = enforce_asr(df, cause_meta_df, age_meta_df)

    # ************************************************************
    # FIT EACH DRAW TO NON-ZERO FLOOR
    # ************************************************************
    print_log_message("Fitting to non-zero floor...")
    nonzero_floorer = NonZeroFloorer(df)
    df = nonzero_floorer.get_computed_dataframe(pop_df, env_df,
                                                cause_meta_df)

    # ************************************************************
    # AGE AGGREGATION
    # ************************************************************
    print_log_message("Creating age standardized and all ages groups")
    age_aggregator = AgeAggregator(df, pop_df, env_df, age_weight_df)
    df = age_aggregator.get_computed_dataframe()

    # ************************************************************
    # Make CODEm and CoDViz metrics for uncertainty
    # ************************************************************
    # columns that should be present in the phase output
    final_cols = [
        'age_group_id', 'cause_id', 'cf_corr', 'cf_final', 'cf_raw',
        'cf_rd', 'extract_type_id', 'location_id', 'nid', 'sample_size',
        'sex_id', 'site_id', 'year_id'
    ]
    print_log_message("Making metrics for CODEm and CoDViz")
    if dataset_has_redistribution_variance(data_type_id, source):
        df = RedistributionVarianceEstimator.make_codem_codviz_metrics(
            df, pop_df)
        final_cols += [
            'cf_final_high_rd', 'cf_final_low_rd', 'variance_rd_log_dr',
            'variance_rd_logit_cf'
        ]

    for cf_col in ['cf_final', 'cf_rd', 'cf_raw', 'cf_corr']:
        df.loc[df[cf_col] > 1, cf_col] = 1
        df.loc[df[cf_col] < 0, cf_col] = 0

    df = df[final_cols]
    return df
def run_phase(df, nid, extract_type_id, pop_run_id, cause_set_version_id,
              location_set_version_id):
    """Run the full phase, chaining together computational elements."""
    # get filepaths
    cache_dir = CONF.get_directory('db_cache')
    orig_deaths = df['deaths'].sum()
    standard_cache_options = {
        'force_rerun': False,
        'block_rerun': True,
        'cache_dir': cache_dir,
        'cache_results': False
    }
    code_system_id = get_value_from_nid(nid, 'code_system_id',
                                        extract_type_id=extract_type_id)
    code_system = get_code_system_from_id(code_system_id,
                                          **standard_cache_options)
    source = get_value_from_nid(nid, 'source',
                                extract_type_id=extract_type_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id,
        location_set_version_id=location_set_version_id)

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **standard_cache_options)

    is_vr = data_type_id in [9, 10]

    # run hiv correction on VR, but not Other_Maternal;
    # countries to correct will be further pruned by the master cause
    # selections csv in the hiv corrector class
    if not skip_hiv_correction(source) and is_vr:
        # get location hierarchy
        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=location_set_version_id,
            **standard_cache_options)
        # get population
        pop_meta_df = get_pop(pop_run_id=pop_run_id,
                              **standard_cache_options)
        # get age metadata
        age_meta_df = get_ages(**standard_cache_options)
        # get the country
        iso3 = get_value_from_nid(
            nid, 'iso3', extract_type_id=extract_type_id,
            location_set_version_id=location_set_version_id)
        assert pd.notnull(iso3), "Could not find iso3 for nid {}, " \
            "extract_type_id {}".format(nid, extract_type_id)
        hiv_corrector = HIVCorrector(df, iso3, code_system_id, pop_meta_df,
                                     cause_meta_df, loc_meta_df, age_meta_df,
                                     correct_garbage=False)
        print_log_message("Running hiv correction for iso3 {}".format(iso3))
        df = hiv_corrector.get_computed_dataframe()

    if needs_injury_redistribution(source):
        print_log_message("Correcting injuries")
        if 'loc_meta_df' not in vars():
            # get location hierarchy
            loc_meta_df = get_current_location_hierarchy(
                location_set_version_id=location_set_version_id,
                **standard_cache_options)
        injury_redistributor = InjuryRedistributor(df, loc_meta_df,
                                                   cause_meta_df)
        df = injury_redistributor.get_computed_dataframe()

    # apply redistribution of LRI to tb in under 15, non-neonatal ages
    # based on location/year specific proportions
    print_log_message(
        "Applying special redistribution of LRI to TB in under 15")
    lri_tb_redistributor = LRIRedistributor(df, cause_meta_df)
    df = lri_tb_redistributor.get_computed_dataframe()

    # merge in raw and rd here because recodes and bridge mapping should
    # also apply to the causes that are in previous phases (raw deaths for
    # secret codes need to be moved up to their parent cause, for example)
    df = combine_with_rd_raw(df, nid, extract_type_id,
                             location_set_version_id)

    val_cols = ['deaths', 'deaths_raw', 'deaths_corr', 'deaths_rd']

    # run china VR rescaling
    if needs_subnational_rescale(source):
        china_rescaler = ChinaHospitalUrbanicityRescaler()
        df = china_rescaler.get_computed_dataframe(df)

    if needs_strata_collapse(source):
        # set site id to blank site id and collapse
        df['site_id'] = 2
        group_cols = list(set(df.columns) - set(val_cols))
        df = df.groupby(group_cols, as_index=False)[val_cols].sum()

    if is_vr:
        # drop if deaths are 0 across all current deaths columns
        df = df.loc[df[val_cols].sum(axis=1) != 0]

    # restrict causes based on code system
    print_log_message("Running bridge mapper")
    bridge_mapper = BridgeMapper(source, cause_meta_df, code_system)
    df = bridge_mapper.get_computed_dataframe(df)

    # run recodes based on expert opinion
    print_log_message("Enforcing some very hard priors (expert opinion)")
    expert_opinion_recoder = Recoder(cause_meta_df, source, code_system_id,
                                     data_type_id)
    df = expert_opinion_recoder.get_computed_dataframe(df)

    end_deaths = df['deaths'].sum()
    print_log_message("Checking no large loss or gain of deaths")
    if abs(orig_deaths - end_deaths) >= (.1 * end_deaths):
        diff = round(abs(orig_deaths - end_deaths), 2)
        old = round(abs(orig_deaths))
        new = round(abs(end_deaths))
        raise AssertionError("Change of {} deaths [{}] to [{}]".format(
            diff, old, new))
    return df