def get_all_cause_vr(location_set_version_id):
    """Pull age/sex split VR with all cause mortality.

    Fetches formatted-phase data restricted to the all-cause extract
    type, then collapses it down to the standard group columns.
    """
    print_log_message("Pulling all cause VR")
    pulled = get_claude_data(
        "formatted",
        location_set_version_id=location_set_version_id,
        extract_type_id=[167],
        location_set_id=35,
    )
    return _collapse_to_group_cols(pulled)
def run_phase(nid, extract_type_id, launch_set_id, data_type_id, source,
              model_group, malaria_model_group):
    """Run (or skip) noise reduction for one nid/extract_type pair.

    Reads noise-reduction model results (per cause or cause-appended,
    depending on the model group), runs the bayesian noise reduction
    algorithm on them, and returns the reduced dataframe. Model groups
    tagged "NO_NR" skip reduction and return the aggregation-phase data.
    """
    run_noise_reduction = True
    run_by_cause = False
    # determine above values using the source and model group
    if model_group == "NO_NR":
        run_noise_reduction = False
    if model_group_is_run_by_cause(model_group):
        run_by_cause = True
    if run_noise_reduction:
        if run_by_cause:
            filepath = "FILEPATH".format(nr=NR_DIR, model_group=model_group,
                                         lsid=launch_set_id)
            causes = sorted(list(pd.read_csv(filepath)['cause_id'].unique()))
            # BUG FIX: original format string had no placeholder, so the
            # cause count passed to .format() was silently dropped.
            print_log_message(
                "Reading {} cause-specific files".format(len(causes)))
        else:
            # cause_id = None pulls all causes in a single file
            causes = [None]
            print_log_message("Reducing cause-appended file")
        dfs = []
        for cause_id in causes:
            df = get_noise_reduction_model_result(nid, extract_type_id,
                                                  launch_set_id, model_group,
                                                  malaria_model_group,
                                                  cause_id=cause_id)
            # drop a stray index column left behind by a previous to_csv
            if 'Unnamed: 0' in df.columns:
                df = df.drop('Unnamed: 0', axis=1)
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        print_log_message(
            "Running bayesian noise reduction algorithm using fitted priors")
        noise_reducer = NoiseReducer(data_type_id, source)
        df = noise_reducer.get_computed_dataframe(df)
    else:
        # simply get the aggregated result
        print_log_message("Skipping noise reduction for source {} and model "
                          "group {}".format(source, model_group))
        df = get_claude_data("aggregation", nid=nid,
                             extract_type_id=extract_type_id)
    return df
def get_cod_vr(location_set_version_id, vr_filter=None):
    """Pull age/sex split VR with cause of death data."""
    filters = {
        'data_type_id': [9, 10],
        'location_set_id': 35,
        'is_active': True,
    }
    # optionally, only get VR data for specific sources/nids/iso3s/etc.
    if vr_filter is not None:
        filters.update(vr_filter)
    print_log_message("Pulling CoD VR")
    pulled = get_claude_data(
        "disaggregation",
        location_set_version_id=location_set_version_id,
        **filters
    )
    return _collapse_to_group_cols(pulled)
def get_va_vr_sourcemetadata(self):
    """Pull VA + VR (and sample VR) + CHAMPS source metadata."""
    df = get_claude_data(phase="sourcemetadata", **self.dataset_filters)
    # Drop Northern Ireland and Wales in 1980 — nothing matches there, and
    # GBR 1980 should really be dropped from the database upstream.
    df = df.query(
        '~((location_id == 433 | location_id == 434) & year_id == 1980)')
    # Drop other maternal VR.
    df = df.query('~(source == "Other_Maternal" & location_id == 38)')
    # Collapse away extract_type_id by summing values over the group columns.
    group_cols = (
        self.geo_cols + self.source_cols
        + ['age_group_id', 'sex_id'] + self.meta_cols
    )
    return df.groupby(group_cols, as_index=False)[self.value_cols].sum()
def pull_vr_old(cause_ids, start_year, end_year):
    """Pull ICD7A VR for the given cause ids over an inclusive year range."""
    assert type(start_year) is int, "Starting year must be an integer"
    assert type(end_year) is int, "Ending year must be an integer"
    assert end_year >= start_year, "End year must be greater than or equal to starting year"
    # Accept a bare int as shorthand for a single-cause list.
    if type(cause_ids) is int:
        cause_ids = [cause_ids]
    assert type(
        cause_ids) is list, "Cause IDs must be passed as a list of ints"
    return get_claude_data(
        "disaggregation",
        year_id=list(range(start_year, end_year + 1)),
        data_type_id=9,
        verbose=True,
        source="ICD7A",
        location_set_id=35,
        exec_function=subset_causes,
        exec_function_args=[cause_ids],
    )
def main(nid, extract_type_id, code_system_id, launch_set_id, remove_decimal):
    """Main method"""
    started_at = time.time()
    df = get_claude_data(
        "disaggregation", nid=nid, extract_type_id=extract_type_id)
    data_type_id = get_value_from_nid(
        nid, 'data_type_id', extract_type_id=extract_type_id)
    iso3 = get_value_from_nid(nid, 'iso3', extract_type_id=extract_type_id)
    df = run_pipeline(df, nid, extract_type_id, code_system_id,
                      remove_decimal, data_type_id, iso3)
    elapsed = time.time() - started_at
    print_log_message("Finished in {} seconds".format(elapsed))
    write_phase_output(
        df, "misdiagnosiscorrection", nid, extract_type_id, launch_set_id)
    return df
def pull_vr_data_for_rdp_reg(reg_spec, location_hierarchy, data_id,
                             small_test=False, vr_pull_timestamp=None):
    """Pull vr used to make redistribution proportions.

    If vr_pull_timestamp is passed, and it does in fact exist, then this
    will just read that. Otherwise, it runs a custom get_claude_data
    based on the passed regression specification.
    """
    shared_package_id = reg_spec['shared_package_id']
    if vr_pull_timestamp is not None:
        timestamp = vr_pull_timestamp
    else:
        timestamp = cod_timestamp()
    outdir = "FILEPATH".format(RDP_REG_DIR, shared_package_id)
    outpath = "FILEPATH".format(outdir, data_id, timestamp)
    if vr_pull_timestamp is not None:
        print_log_message(
            "Reading VR data pulled on {}".format(vr_pull_timestamp))
        if not os.path.exists(outpath):
            raise ValueError(
                "Passed [vr_pull_timestamp={}], but {} does not exist. "
                "Need to either pass a different version that does exist, or"
                " run a new vr pull by passing vr_pull_timestamp=None.".format(
                    vr_pull_timestamp, outpath)
            )
        df = pd.read_csv(outpath)
    else:
        print_log_message(
            "Pulling a fresh version of VR with timestamp {}".format(
                timestamp)
        )
        # regressions only use detailed code systems
        # BUG FIX: originally this was assigned to `code_system_id` and then
        # iterated as `for code_system_id in code_system_id`, shadowing the
        # collection with the loop variable. Renamed for correctness/clarity.
        code_system_ids = MODEL_DATA_CODE_SYSTEMS
        # regressions only use national-level data to avoid biasing the sample
        # toward subnational datasets
        country_loc_map = get_country_loc_id_map(location_hierarchy)
        if small_test:
            year_id = [2010, 2011]
            print("Pulling data for year subset: {}".format(year_id))
        else:
            year_id = range(1980, 2018)
        dfs = []
        for code_system_id in code_system_ids:
            print_log_message("Code system id: {}".format(code_system_id))
            garbage_code_ids = get_package_code_ids(reg_spec, code_system_id)
            target_cause_ids = reg_spec['target_cause_ids']
            df = get_claude_data(
                "disaggregation",
                data_type_id=9,
                code_system_id=code_system_id,
                is_active=True,
                year_id=year_id,
                location_set_id=35,
                exec_function=collapse_to_reg_df,
                exec_function_args=[garbage_code_ids, target_cause_ids,
                                    country_loc_map],
                attach_launch_set_id=True
            )
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        # record which pull this came from, then persist for later reuse
        df['vr_pull_timestamp'] = timestamp
        df.to_csv(outpath, index=False)
    return df
def run_phase(nid, extract_type_id, launch_set_id, data_type_id, source,
              model_group, malaria_model_group):
    """Chain together computational elements for NR."""
    # types that take noise reduction
    # maternal logic: noise reduce household surveys (in Other_Maternal source)
    # and sources in maternal_nr_sources
    run_noise_reduction = True
    run_by_cause = False
    # determine above values using the source and model group
    if model_group == "NO_NR":
        run_noise_reduction = False
    if model_group_is_run_by_cause(model_group):
        run_by_cause = True
    if run_noise_reduction:
        if run_by_cause:
            filepath = "FILEPATH".format(nr=NR_DIR, model_group=model_group,
                                         lsid=launch_set_id)
            # NR ran by cause, so pick up the causes that ran and read em
            causes = sorted(list(pd.read_csv(filepath)['cause_id'].unique()))
            # BUG FIX: original format string had no placeholder, so the
            # cause count passed to .format() was silently dropped.
            print_log_message(
                "Reading {} cause-specific files".format(len(causes)))
        else:
            # cause_id = None means all causes
            causes = [None]
            print_log_message("Reducing cause-appended file")
        # when causes = [None], this will not actually run by cause, as there
        # will be one entry in the loop and it will do all the causes. The
        # pd.concat only adds like .7 seconds to the total time as opposed to
        # running outside of a loop (from some testing in a notebook)
        dfs = []
        for cause_id in causes:
            df = get_noise_reduction_model_result(nid, extract_type_id,
                                                  launch_set_id, model_group,
                                                  malaria_model_group,
                                                  cause_id=cause_id)
            if 'Unnamed: 0' in df.columns:
                df = df.drop('Unnamed: 0', axis=1)
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        print_log_message(
            "Running bayesian noise reduction algorithm using fitted priors")
        noise_reducer = NoiseReducer(data_type_id, source)
        df = noise_reducer.get_computed_dataframe(df)
    else:
        # simply get the aggregated result
        print_log_message("Skipping noise reduction for source {} and model "
                          "group {}".format(source, model_group))
        df = get_claude_data("aggregation", nid=nid,
                             extract_type_id=extract_type_id)
    return df
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    # need to be string for later test that what comes after "VA-" is a
    # super region (otherwise, would have to compare ints, and whats after
    # "VA-" might not be convertible to an int)
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')
    # location id here is the region id, and parent id is the super region id
    # becomes a dictionary from super region id to list of region ids
    super_region_to_region_ids = (
        super_region_to_region_ids[['location_id', 'parent_id']].groupby(
            'parent_id'
        ).apply(lambda df: list(set(df['location_id']))).to_dict()
    )
    regions_to_ids = location_hierarchy.query(
        'level == 2').set_index('ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()
    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = [8, 12]
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-Nepal-Burden":
            model_group_filters['source'] = "Nepal_Burden_VA"
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            # potential bug from GBD2016 - super region 158 keeps only
            # Pakistan, Nepal, and Bangledesh, doesn't get India data
            # Also keep Bhutan in case we ever have VA there
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD', 'BTN']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    # keep data by source/iso3/survey type
    # model groups follow MATERNAL-{source}-{iso3} format
    # except for the household surveys within Other_Maternal
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = ["DHS", "RHS", "AHS",
                                                  "DLHS", "NFHS"]
        model_group_filters['iso3'] = model_group[-3:]
    # special malaria model groups for VA data
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = [8, 12]
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    elif model_group == "CHAMPS":
        model_group_filters['data_type_id'] = [12]
    else:
        bad_model_group = True
    if bad_model_group:
        # BUG FIX: the original formatted the boolean flag (always True here)
        # into the message instead of the offending model group name.
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group)
        )
    model_df = get_claude_data(
        phase="aggregation",
        is_active=True,
        is_dropped=False,
        location_set_id=35,
        year_id=range(1980, 2050),
        assert_all_available=True,
        location_set_version_id=location_set_version_id,
        **model_group_filters
    )
    add_cols = ['code_system_id']
    if model_group.startswith(("VA", "MATERNAL", "malaria", "CHAMPS")) or \
            model_group in ["VR-RUS", "VR-R9"]:
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)
    # add on code_system_id
    model_df = add_nid_metadata(
        model_df, add_cols, force_rerun=False, block_rerun=True,
        cache_dir='standard', cache_results=False
    )
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat this like Russia_FMD_1989_1998 for purpose of cause list,
        # as it has now been bridge mapped that way
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message(
            "Setting code system to {cs} for {s} "
            "source: {n} rows changed".format(
                cs=replace_csid, s=replace_source, n=num_replace)
        )
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid
    report_if_merge_fail(
        model_df, 'code_system_id', ['nid', 'extract_type_id']
    )
    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df
def get_model_data(model_group, location_hierarchy, location_set_version_id,
                   cause_meta_df):
    """Get data to run in NR model with incoming data."""
    iso3s = location_hierarchy.query('level == 3')['ihme_loc_id'].unique()
    regions = location_hierarchy.query('level == 2')['ihme_loc_id'].unique()
    super_region_ids = location_hierarchy.query(
        'level == 1')['location_id'].unique()
    # stringified so the "VA-{code}" membership test below works without
    # attempting int conversion on arbitrary location codes
    super_region_ids = [str(s) for s in super_region_ids]
    super_region_to_region_ids = location_hierarchy.query('level == 2')
    # maps super region id (parent_id) -> list of its region location_ids
    super_region_to_region_ids = (super_region_to_region_ids[[
        'location_id', 'parent_id'
    ]].groupby('parent_id').apply(
        lambda df: list(set(df['location_id']))).to_dict())
    regions_to_ids = location_hierarchy.query('level == 2').set_index(
        'ihme_loc_id')['region_id']
    level_three_location_ids = location_hierarchy.query(
        'level == 3')['location_id'].unique()
    model_group_filters = {}
    bad_model_group = False
    if model_group.startswith("VR-"):
        model_group_filters['data_type_id'] = [9, 10]
        loc_code = model_group.replace("VR-", "")
        if loc_code in iso3s:
            model_group_filters['iso3'] = loc_code
        elif loc_code in regions:
            region_id = regions_to_ids[loc_code]
            model_group_filters['region_id'] = region_id
            model_group_filters['exec_function'] = restrict_to_location_ids
            model_group_filters['exec_function_args'] = [
                level_three_location_ids
            ]
        elif loc_code == "GRL-AK":
            AK_LOC_ID = 524
            GRL_LOC_ID = 349
            model_group_filters['location_id'] = [AK_LOC_ID, GRL_LOC_ID]
        else:
            bad_model_group = True
    elif model_group.startswith("VA-"):
        model_group_filters['data_type_id'] = 8
        if model_group == "VA-SRS-IND":
            model_group_filters['source'] = IND_SRS_SOURCES
        elif model_group == "VA-SRS-IDN":
            model_group_filters['source'] = IDN_SRS_SOURCES
        elif model_group == "VA-Matlab":
            model_group_filters['source'] = MATLAB_SOURCES
        elif model_group == "VA-IND":
            model_group_filters['iso3'] = "IND"
        elif model_group == "VA-158":
            model_group_filters['iso3'] = ['PAK', 'NPL', 'BGD']
        else:
            loc_code = model_group.replace("VA-", "")
            if loc_code in super_region_ids:
                super_region_id = int(loc_code)
                model_group_filters['region_id'] = \
                    super_region_to_region_ids[super_region_id]
            else:
                bad_model_group = True
    elif model_group == "Cancer_Registry":
        model_group_filters['source'] = "Cancer_Registry"
    elif model_group.startswith("MATERNAL"):
        for source in MATERNAL_NR_SOURCES:
            if source in model_group:
                model_group_filters['source'] = source
        if "HH_SURVEYS" in model_group:
            model_group_filters['survey_type'] = [
                "DHS", "RHS", "AHS", "DLHS", "NFHS"
            ]
        model_group_filters['iso3'] = model_group[-3:]
    elif model_group.startswith('malaria'):
        model_group_filters['data_type_id'] = 8
        model_group_filters['malaria_model_group'] = model_group
        if "IND_SRS" in model_group:
            model_group_filters['source'] = IND_SRS_SOURCES
    else:
        bad_model_group = True
    if bad_model_group:
        # BUG FIX: the original formatted the boolean flag (always True here)
        # into the message instead of the offending model group name.
        raise AssertionError(
            "Unrecognized model group: {}".format(model_group))
    model_df = get_claude_data(phase="aggregation",
                               is_active=True,
                               is_dropped=False,
                               location_set_id=35,
                               year_id=range(1980, 2050),
                               assert_all_available=True,
                               location_set_version_id=location_set_version_id,
                               **model_group_filters)
    add_cols = ['code_system_id']
    if model_group.startswith("VA") or model_group.startswith("MATERNAL") or \
            model_group in ["VR-RUS", "VR-R9"] or \
            model_group.startswith('malaria'):
        add_cols.append('source')
    if model_group.startswith('MATERNAL-HH_SURVEYS'):
        model_df = add_survey_type(model_df)
    # add on code_system_id
    model_df = add_nid_metadata(model_df, add_cols, force_rerun=False,
                                block_rerun=True, cache_dir='standard',
                                cache_results=False)
    if model_group == "VR-RUS" or model_group == "VR-R9":
        # treat as Russia_FMD_ICD9 for cause list purposes (bridge mapped)
        replace_source = "Russia_FMD_ICD9"
        replace_csid = 213
        fmd_conv_10 = model_df['source'] == replace_source
        num_replace = len(model_df[fmd_conv_10])
        assert num_replace > 0, \
            "No rows found with source {} in " \
            "model group {}".format(replace_source, model_group)
        print_log_message("Setting code system to {cs} for {s} "
                          "source: {n} rows changed".format(cs=replace_csid,
                                                            s=replace_source,
                                                            n=num_replace))
        model_df.loc[fmd_conv_10, 'code_system_id'] = replace_csid
    report_if_merge_fail(model_df, 'code_system_id',
                         ['nid', 'extract_type_id'])
    # special source drops for certain groups
    model_df = drop_source_data(model_df, model_group, location_hierarchy,
                                cause_meta_df)
    return model_df