def __init__(self, regression_launch_set_id=None, shared_package_id=None,
             shared_package_name=None, formula=None, run_description=None,
             data_id=None, data_description=None, test=False):
    # Assert arguments passed in are valid to give flexibility in
    # creating a RegressionLauncher
    if run_description is None:
        raise AssertionError("Argument 'run_description' missing.")
    if data_id is not None and data_description is not None:
        # If both data_id and data_description were supplied, that's an
        # issue! We either use an old data version (data_id supplied) or
        # we give a new data description and create new data
        raise AssertionError(
            "Only one of the following arguments may be supplied: "
            "'data_id', 'data_description'. Supply 'data_id' to "
            "use an old data version. Supply 'data_description' to "
            "create a new set of data.")
    if regression_launch_set_id is None:
        # If the RLSID isn't passed in, all necessary arguments must
        # be supplied by the user!
        for arg in [shared_package_id, formula]:
            assert arg is not None, \
                "Missing needed argument without an explicit " \
                "regression_launch_set_id ('shared_package_id', 'formula')."
        assert data_id is not None or data_description is not None, \
            "Either 'data_id' or 'data_description' must be supplied " \
            "if no regression_launch_set_id is."
    self._test = test
    if not self._test:
        self.assert_repo_is_committed()
    else:
        print("***TEST*** - Not checking script commit status")
    self._launch_set_dict = {
        "regression_launch_set_id": RegressionLauncher.new_launch_set_id(),
        "username": getpass.getuser(),
        "commit_hash": get_git_commit_hash(),
        "timestamp": cod_timestamp(),
        "run_description": run_description,
        "formula": formula,
        "shared_package_id": shared_package_id,
        "shared_package_name": shared_package_name,
        "data_id": data_id,
        "data_description": data_description
    }
    self.fill_in_launch_set_gaps(regression_launch_set_id)
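# Usage sketch (illustrative only; the specific values below are hypothetical):
# constructing a launcher for a brand-new regression requires shared_package_id,
# formula, run_description, and exactly one of data_id / data_description.
# Passing regression_launch_set_id alone instead re-uses a stored launch set
# via fill_in_launch_set_gaps.
#
#     launcher = RegressionLauncher(
#         shared_package_id=15,
#         formula="cause_fraction ~ age_group_id + sex_id",
#         run_description="example run for package 15",
#         data_description="fresh VR pull for package 15",
#         test=True,
#     )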
def __init__(self):
    self.cg = Configurator('standard')
    self.cache_dir = self.cg.get_directory('db_cache')
    # if you do not want to write any output files then set test to True
    self.test = False
    self.cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': self.cache_dir
    }
    self.dataset_filters = {
        'data_type_id': [8, 9, 10, 12],
        'location_set_id': 35,
        'is_active': True,
        'year_id': range(1980, 2050)
    }
    self.national_nids = self.cg.get_resource("nid_replacements")
    # resources
    self.completeness = self.cg.get_resource("completeness")
    self.env_meta_df = get_env(env_run_id=self.cg.get_id('env_run'),
                               **self.cache_options)
    self.location_meta_df = get_current_location_hierarchy(
        location_set_version_id=self.cg.get_id('location_set_version'),
        **self.cache_options)
    self.cod_ages = list(
        get_cod_ages(**self.cache_options)['age_group_id'].unique())
    # identifiers
    self.source_cols = ["source", "nid", "data_type_id"]
    self.geo_cols = ["location_id", "year_id"]
    self.meta_cols = ["nationally_representative", "detail_level_id"]
    self.value_cols = ['deaths']
    self.year_end = self.cg.get_id('year_end')
    self.full_time_series = "full_time_series"
    # directories
    self.current_best_version = "2018_04_03_151739"
    self.out_dir = "FILEPATH"
    self.arch_dir = "{}/_archive".format(self.out_dir)
    self.timestamp = cod_timestamp()
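# Illustrative sketch (not part of the class, and the exact call pattern is an
# assumption): dataset_filters and cache_options are built so they can be
# splatted into the cached database helpers from methods of this class, e.g.
#
#     df = get_claude_data(
#         "formatted",
#         **self.dataset_filters,
#         **self.cache_options,
#     )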
def pull_vr_data_for_rdp_reg(reg_spec, location_hierarchy, data_id,
                             small_test=False, vr_pull_timestamp=None):
    """Pull VR used to make redistribution proportions.

    If vr_pull_timestamp is passed, and it does in fact exist, then this
    will just read that. Otherwise, it runs a custom get_claude_data based
    on the passed regression specification.
    """
    shared_package_id = reg_spec['shared_package_id']
    if vr_pull_timestamp is not None:
        timestamp = vr_pull_timestamp
    else:
        timestamp = cod_timestamp()
    outdir = "FILEPATH".format(RDP_REG_DIR, shared_package_id)
    outpath = "FILEPATH".format(outdir, data_id, timestamp)
    if vr_pull_timestamp is not None:
        print_log_message(
            "Reading VR data pulled on {}".format(vr_pull_timestamp))
        if not os.path.exists(outpath):
            raise ValueError(
                "Passed [vr_pull_timestamp={}], but {} does not exist. "
                "Need to either pass a different version that does exist, or"
                " run a new vr pull by passing vr_pull_timestamp=None.".format(
                    vr_pull_timestamp, outpath)
            )
        df = pd.read_csv(outpath)
    else:
        print_log_message(
            "Pulling a fresh version of VR with timestamp {}".format(
                timestamp)
        )
        # regressions only use detailed code systems
        code_system_ids = MODEL_DATA_CODE_SYSTEMS
        # regressions only use national-level data to avoid biasing the sample
        # toward subnational datasets
        country_loc_map = get_country_loc_id_map(location_hierarchy)
        if small_test:
            year_id = [2010, 2011]
            print("Pulling data for year subset: {}".format(year_id))
        else:
            year_id = range(1980, 2018)
        dfs = []
        for code_system_id in code_system_ids:
            print_log_message("Code system id: {}".format(code_system_id))
            garbage_code_ids = get_package_code_ids(reg_spec, code_system_id)
            target_cause_ids = reg_spec['target_cause_ids']
            df = get_claude_data(
                "disaggregation",
                data_type_id=9,
                code_system_id=code_system_id,
                is_active=True,
                year_id=year_id,
                location_set_id=35,
                exec_function=collapse_to_reg_df,
                exec_function_args=[garbage_code_ids, target_cause_ids,
                                    country_loc_map],
                attach_launch_set_id=True
            )
            dfs.append(df)
        df = pd.concat(dfs, ignore_index=True)
        df['vr_pull_timestamp'] = timestamp
        df.to_csv(outpath, index=False)
    return df
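# Usage sketch (hypothetical values): reg_spec needs at least
# 'shared_package_id' and 'target_cause_ids'; location_hierarchy comes from
# get_current_location_hierarchy. Passing vr_pull_timestamp=None triggers a
# fresh pull, while an existing timestamp re-reads that saved CSV instead.
#
#     reg_spec = {
#         "shared_package_id": 15,       # hypothetical package
#         "target_cause_ids": [509, 511],  # hypothetical targets
#     }
#     location_hierarchy = get_current_location_hierarchy(
#         location_set_version_id=420)   # hypothetical version
#     df = pull_vr_data_for_rdp_reg(
#         reg_spec, location_hierarchy, data_id=1,
#         small_test=True, vr_pull_timestamp=None)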
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS',
                        is_active=False, refresh_cache=True, check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
    column in the dataset.

    Needs the following information from either the df values or from the
    nid_meta_vals dict:
        data_type_id
        representative_id
    All of the above must have only one value per nid in df.

    Maps site_id to the data based on the incoming 'site' column. Will upload
    any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data.
        source, str: The source this df is (should be the whole source and
            nothing but the source). Will break if there is no source in
            FILEPATH with this name, and you should pass the source without
            a leading underscore even if it is that way in J.
        write, bool: Whether to write the outputs.
        extract_type, str: The manner in which the nid was extracted. If left
            as None, will be induced by the location_type_id of the
            location_id with the maximum level in the dataset. This should be
            overridden in cases like China DSP, where the same locations are
            used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and "admin1: DSP sites
            only". (In the particular instance of DSP, the extract type is
            built into this code. Feel free to add other source-extract type
            mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because
            sometimes raw data reports overlapping age groups (e.g. Palestine
            data has Gaza Strip and West Bank data with different age
            groupings).

    Returns:
        Every local value in the function. Why?
        There are multiple df outputs, and formatting is a very involved
        process, so it's helpful to just see everything sometimes.
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATTED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(
            missing_cols)

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, \
            "Found code ids in data that can't exist in code " \
            "systems {}: {}".format(code_system_ids, bad_codes)

    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)

    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # sorry for putting this here
    # drop these loc/years b/c env < deaths creating negative cc_code
    # maybe re-run w/ another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATTED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(
            FORMATTED_ID_COLS, as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 TIMES THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?
    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            "is_active is deprecated: use the update_nid_metadata_status "
            "function to change the status of finalized datasets")

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map, on=[
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id'
    ], how='left')
    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()
    if df_na.shape[0] > 0:
        print("New rows for the following NID/extract_type_id will be added "
              "with is_active and is_mort_active = 0:\n{}".format(df_na))
    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp

    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        # write to cod.source for new sources
        insert_source_id(source)

        nid_extracts = input_df[
            ['nid', 'extract_type_id']
        ].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[
                (input_df['nid'] == nid) &
                (input_df['extract_type_id'] == extract_type_id)
            ].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                               launch_set_id)

    # now refresh cache files for nid
    if refresh_cache:
        refresh_claude_nid_cache_files()

    return locals()
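# Usage sketch (illustrative; `almost_formatted` is a hypothetical DataFrame
# carrying the INCOMING_EXPECTED_ID_COLS plus 'deaths', and "EXAMPLE_SOURCE" is
# a placeholder source name). write=False runs all mapping and checks without
# touching the database, a common dry run before flipping write=True. Because
# the function returns locals(), the mapped frames come back in a dict.
#
#     out = finalize_formatting(almost_formatted, "EXAMPLE_SOURCE", write=False)
#     df = out["df"]                    # fully mapped data
#     nid_meta_df = out["nid_meta_df"]  # nid metadata table to be written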
def finalize_formatting(df, source, write=False, code_system_id=None,
                        extract_type=None, conn_def='ADDRESS', is_active=True):
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATTED_COLS = FORMATTED_ID_COLS + VALUE_COLS
    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            "These columns are needed for formatting but not found in df: "
            "{}".format(missing_cols))

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # ADD TEST TO CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN
        # THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)

    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    input_df = df[FINAL_FORMATTED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(
            FORMATTED_ID_COLS, as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan
    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp

    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df, 'claude_nid_metadata',
                                  replace=True, conn_def=conn_def)
        # write nid location-year map
        write_to_claude_nid_table(nid_locyears, 'claude_nid_location_year',
                                  replace=True, conn_def=conn_def)
        insert_source_id(source)

        nid_extracts = input_df[
            ['nid', 'extract_type_id']
        ].drop_duplicates().to_records(index=False)
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[
                (input_df['nid'] == nid) &
                (input_df['extract_type_id'] == extract_type_id)
            ].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id,
                               launch_set_id)

    # now refresh cache files for nid
    print("\nRefreshing claude nid metadata cache files")
    force_cache_options = {
        'force_rerun': True,
        'block_rerun': False,
        'cache_dir': "standard",
        'cache_results': True,
        'verbose': True
    }
    get_nid_metadata(**force_cache_options)
    get_nidlocyear_map(**force_cache_options)

    return locals()
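# Usage sketch (illustrative; `almost_formatted` and "EXAMPLE_SOURCE" are
# hypothetical): unlike the variant above, this older version takes is_active
# directly, so a source is activated at formatting time rather than through
# update_nid_metadata_status.
#
#     out = finalize_formatting(almost_formatted, "EXAMPLE_SOURCE",
#                               write=True, is_active=True)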