def assert_df_is_square(self):
    """
    Assert that the dataframe has all locations and is square on age,
    sex, and cause.

    Throws:
        AssertionError if not true
    """
    df = self.mktscan_codcorr.copy()

    locations = get_location_metadata(location_set_id=35, gbd_round_id=4)\
        .query('level >= 3')[['location_id']]
    # single-column frames with a constant join key so the merges below
    # produce a full cross join
    locations['join_col'] = 1
    causes = df[['cause_id']].drop_duplicates()
    causes['join_col'] = 1
    sexes = df[['sex_id']].drop_duplicates()
    sexes['join_col'] = 1
    ages = df[['age_group_id']].drop_duplicates()
    ages['join_col'] = 1

    square = locations.merge(causes).merge(sexes).merge(ages)
    square = square.drop('join_col', axis=1)

    m = square.merge(df, how='inner')
    assert len(m) == len(square), \
        'the dataset is not square or is missing some locations'
def map_to_country(df):
    """
    Much of our location data is subnational, but we sometimes want to tally
    things by country. This function maps any subnational location_id to its
    parent country.
    """
    pre = df.shape[0]
    cols = df.shape[1]

    locs = get_location_metadata(location_set_id=35)
    countries = locs.loc[locs.location_type == 'admin0',
                         ['location_id', 'location_ascii_name']].copy()
    countries.columns = ["merge_loc", "country_name"]

    df = df.merge(locs[['location_id', 'path_to_top_parent']],
                  how='left', on='location_id')
    # split the comma separated path; column 3 is the country-level ancestor
    df = pd.concat([df, df.path_to_top_parent.str.split(",", expand=True)],
                   axis=1)
    if df[3].isnull().any():
        warnings.warn("There are locations missing from the loc set 35 "
                      "hierarchy; the cast to int below will fail")
    df['merge_loc'] = df[3].astype(int)
    df = df.merge(countries, how='left', on='merge_loc')

    to_drop = ['path_to_top_parent', 0, 1, 2, 3, 4, 5, 6, 'merge_loc']
    to_drop = [d for d in to_drop if d in df.columns]
    df.drop(to_drop, axis=1, inplace=True)

    assert df.shape[0] == pre
    assert df.shape[1] == cols + 1
    assert df.country_name.isnull().sum() == 0,\
        "Something went wrong {}".format(df[df.country_name.isnull()])
    return df
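# Illustrative usage sketch (not part of the original pipeline): shows how
# map_to_country might be called. The location_id values below are
# placeholders, and shared-DB access via get_location_metadata is assumed
# to be available at call time.
def _demo_map_to_country():
    example = pd.DataFrame({'location_id': [4618, 4619],
                            'deaths': [10, 20]})
    mapped = map_to_country(example)
    # each subnational row now carries its parent country's name
    return mapped[['location_id', 'country_name', 'deaths']]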
def run_cod_age_sex_splitting(df, conn_def, cause_set_version_id, pop_run_id):
    cause_metadata = get_cause_metadata(
        cause_set_version_id=cause_set_version_id)
    possible_causes = cause_metadata['cause_id'].unique().tolist()
    for cause_id in df['cause_id'].unique().tolist():
        assert cause_id in possible_causes, \
            "Cause ID {} not in hierarchy".format(cause_id)

    loc_meta = get_location_metadata(gbd_round_id=6, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    df = df.loc[df['location_id'].isin(possible_locs), :]
    df = df.loc[df['best'] > 0, :]

    # stash the high/best and low/best ratios so uncertainty can be
    # reconstructed after splitting
    df['hi_best_ratio'] = df['high'] / df['best']
    df['lo_best_ratio'] = df['low'] / df['best']
    df = df.reset_index(drop=True)
    df['unique_join'] = df.index
    df_merge_later = df.loc[:, ['unique_join', 'hi_best_ratio',
                                'lo_best_ratio']]
    df = df.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)

    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version_id,
                              pop_run_id=pop_run_id,
                              distribution_set_version_id=62,
                              id_cols=['unique_join'],
                              value_column='best')
    split_df = splitter.get_computed_dataframe(df=df,
                                               location_meta_df=loc_meta)
    split_df = pd.merge(left=split_df, right=df_merge_later,
                        on=['unique_join'], how='left')
    split_df['low'] = split_df['best'] * split_df['lo_best_ratio']
    split_df['high'] = split_df['best'] * split_df['hi_best_ratio']
    split_df = split_df.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_df
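# Illustrative usage sketch (not part of the original pipeline): shows the
# expected call pattern for run_cod_age_sex_splitting. The conn_def name and
# the version/run ids are placeholders only; real values come from the run
# configuration.
def _demo_run_cod_age_sex_splitting(raw_cod_df):
    return run_cod_age_sex_splitting(df=raw_cod_df,
                                     conn_def='ADDRESS',
                                     cause_set_version_id=0,
                                     pop_run_id=0)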
def copy_draws(draws_dir, meid):
    # locations that already have a draw file
    locs = []
    for f in glob.glob(f"{draws_dir}/{meid}/*.csv"):
        locs.append(int(f.rsplit("/")[-1][:-4]))

    study_locs = db.get_demographics("epi")["location_id"]
    loc_h = db.get_location_metadata(35)
    missing = [l for l in study_locs if l not in locs]

    # use an existing file as a template for zero draws
    zero_draws = pd.read_csv(f'{draws_dir}/{meid}/101.csv')
    draw_cols = zero_draws.columns[zero_draws.columns.str.contains("draw")]
    zero_draws[draw_cols] = zero_draws[draw_cols] * 0.0

    print(len(missing))
    for place in missing:
        if loc_h.loc[loc_h.location_id == place, "level"].values[0] == 3:
            # country level: fill with zero draws
            zero_draws['location_id'] = place
            zero_draws.to_csv(f'{draws_dir}/{meid}/{place}.csv')
        elif loc_h.loc[loc_h.location_id == place, "level"].values[0] == 4:
            # subnational: copy the parent country's draws
            parent = loc_h.loc[loc_h.location_id == place,
                               "parent_id"].values[0]
            draws = pd.read_csv(f'{draws_dir}/{meid}/{parent}.csv')
            draws['location_id'] = place
            draws.to_csv(f'{draws_dir}/{meid}/{place}.csv')
            print(place)
    return None
def get_location_hierarchy(location_set_id):
    result_df = get_location_metadata(location_set_id)
    return result_df[[
        'location_id', 'location_name', 'path_to_top_parent', 'parent_id',
        'level', 'is_estimate', 'most_detailed', 'sort_order'
    ]]
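# Illustrative usage sketch (not part of the original pipeline): pulls a
# hierarchy and keeps only most-detailed estimation locations. Location set
# 35 is an illustrative assumption, not a value required by
# get_location_hierarchy.
def _demo_most_detailed_locations():
    hierarchy = get_location_hierarchy(35)
    keep = (hierarchy['most_detailed'] == 1) & (hierarchy['is_estimate'] == 1)
    return hierarchy.loc[keep, ['location_id', 'location_name']]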
from db_queries import get_location_metadata
from fbd_core import YearRange, argparse
from fbd_core.file_interface import FBDPath, open_xr
from datetime import datetime

EXT_YEAR = 2095

# Height for different cell types
CELL_HT = {"title": 3, "location": 1, "stage": 0, "data_cols": 2}

# dict: Python dictionary for mapping indentation levels to their
# corresponding cause levels. Used for formatting the 'Cause' column
# in the table.
INDENT_MAP = {0: "", 1: "  ", 2: "    ", 3: "      "}

# Query gbd shared tables and get locations needed
GBD_LOC_DF = get_location_metadata(gbd_round_id=5, location_set_id=35)


def check_locs_array(df):
    """Find locations that are in the GBD database but not in the UNPD and
    WITT data so they can be programmatically added."""
    return df["location_id"].isin(GBD_LOC_DF["location_id"])


def floating_style(list_nums):
    """Convert the decimal point in the UI to a Lancet style floating single
    decimal for both past and future data.

    ARGS:
        list_nums (list): list containing numbers to be converted to
            floating style period
import pandas as pd
import subprocess
from db_queries import get_location_metadata

locs = get_location_metadata(location_set_id=22)
locs = locs.location_id.unique().tolist()

covid = 261
covname = 'vita_supp'
meid = 2640
measid = 5

for loc in locs:
    job_name = "covariate_{}".format(loc)
    call = ('qsub -l mem_free=10.0G -pe multi_slot 5'
            ' -cwd -P PROJECT -o'
            ' FILEPATH'
            ' -e FILEPATH -N {0}'
            ' FILEPATH'
            ' dismod_to_cov.py'
            ' {1} {2} {3} {4} {5}'.format(job_name, str(int(loc)),
                                          str(int(measid)), str(int(meid)),
                                          str(int(covid)), str(covname)))
    subprocess.call(call, shell=True)
def apply_corrections(df, run_id, cf_model_type):
    """
    Applies the marketscan correction factors to the hospital data at the
    icg level. The corrections are merged on by 'age_start', 'sex_id',
    'icg_id' and 'location_id'. Reads in the corrections from static csvs.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the icg level
        run_id: (int or str)
            Identifies which clinical run we're using, ie 1, 2, 'test'
        cf_model_type: (str)
            Either 'rmodels' or 'mr-brt'; selects which set of correction
            factor files to read
    """
    assert "icg_id" in df.columns, "'icg_id' must exist."
    start_columns = df.columns

    if cf_model_type == 'rmodels':
        corr_files = glob.glob("FILEPATH"
                               "FILEPATH".format(run_id))
        id_cols = ['age_start', 'sex_id', 'cf_location_id', 'icg_id',
                   'icg_name']
    elif cf_model_type == 'mr-brt':
        corr_files = glob.glob("FILEPATH".format(run_id))
        id_cols = ['age_start', 'sex_id', 'icg_id', 'icg_name']
    else:
        assert False, "{} is not a recognized correction factor type".format(
            cf_model_type)
    idx = -4
    assert corr_files, "There are no correction factor files"

    corr_list = []
    cf_names = []
    for f in corr_files:
        # pull the correction name out of the file name
        draw_name = os.path.basename(f)[:idx]
        draw_name = draw_name[5:]
        cf_names.append(draw_name)

        dat = pd.read_csv(f)
        dat.rename(columns={'mean_' + draw_name: draw_name}, inplace=True)
        if "Unnamed: 0" in dat.columns:
            dat.drop("Unnamed: 0", axis=1, inplace=True)

        pre_rows = dat.shape[0]
        assert dat.shape[0] == pre_rows, "The number of rows changed"

        if draw_name == 'prevalence' and cf_model_type == 'rmodels':
            # expand super-region level CFs down to the country level
            locs = get_location_metadata(location_set_id=35)
            locs = pd.concat(
                [locs, locs.path_to_top_parent.str.split(",", expand=True)],
                axis=1)
            locs = locs[locs[3].notnull()]
            locs['cf_location_id'] = locs[3].astype(int)
            locs = locs[['cf_location_id',
                         'super_region_id']].drop_duplicates()
            dat.rename(columns={'cf_location_id': 'super_region_id'},
                       inplace=True)
            dat = dat.merge(locs, how='left', on='super_region_id')
            dat.drop('super_region_id', axis=1, inplace=True)

        corr_list.append(dat)
        del dat

    # merge the separate correction factors into one wide dataframe
    correction_factors = functools.reduce(
        lambda x, y: pd.merge(x, y, on=id_cols, how='outer'), corr_list)
    if 'sex' in correction_factors.columns:
        correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # switch hospital data from age_group_id to age_start/age_end
    df = hosp_prep.group_id_start_end_switcher(df)
    id_cols = [f + "_id" if f == "sex" else f for f in id_cols]

    pre_shape = df.shape[0]
    # attach the country-level cf_location_id to each (possibly subnational)
    # row
    locs = get_location_metadata(location_set_id=35)[['location_id',
                                                      'path_to_top_parent']]
    locs = pd.concat(
        [locs, locs.path_to_top_parent.str.split(",", expand=True)], axis=1)
    locs = locs[locs[3].notnull()]
    locs['cf_location_id'] = locs[3].astype(int)
    locs = locs[['cf_location_id', 'location_id']]
    df = df.merge(locs, how='left', on='location_id')

    df = df.merge(correction_factors, how='left', on=id_cols)
    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
                                      "merging on the correction factors. "
                                      "Don't do that!")

    for col in ['super_region_id', 'model_prediction', 'cf_location_id']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

    # apply each correction factor to the raw mean
    for level in cf_names:
        df["mean_" + level] = df["mean_raw"] * df[level]

    # switch back to age_group_id and drop the raw CF columns
    df = hosp_prep.group_id_start_end_switcher(df)
    df.drop(cf_names, axis=1, inplace=True)

    assert set(start_columns).issubset(set(df.columns)), """
    Some columns that were present at the start are missing now"""
    return df
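# Illustrative sketch (not part of the original pipeline) of the
# path_to_top_parent trick used above: splitting the comma separated path
# and taking column 3 yields the country-level ancestor of any location.
# The ids in the example path are invented placeholders.
def _demo_country_from_path():
    locs = pd.DataFrame({'location_id': [999],
                         'path_to_top_parent': ['1,2,3,4,999']})
    split_path = locs.path_to_top_parent.str.split(",", expand=True)
    return split_path[3].astype(int)  # column 3 -> 4, the country ancestor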
def add_columns_and_upload(bundle_ids, out_dir, status_dir, in_path):
    """
    Description: uploads data under each respective bundle ID and NID.

    Args:
        bundle_ids (list)
    """
    # organize modelable entity IDs, bundle IDs, cause IDs, and NIDs
    IDs = pd.DataFrame({'bundle_id': [285, 286, 287, 288, 289, 290],
                        'cause_id': [493, 498, 520, 492, 499, 385],
                        'nid': [250478, 250479, 250480, 250481, 250482,
                                250483],
                        'me_id': [2414, 2415, 2416, 2417, 2418, 2419]})

    # Find the respective cause ID, NID, and ME ID for the given bundle ID/s.
    IDs = IDs.query('bundle_id in {}'.format(bundle_ids))
    cause_ids = IDs['cause_id'].tolist()
    nids = IDs['nid'].tolist()
    me_ids = IDs['me_id'].tolist()

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if not os.path.exists(status_dir):
        os.makedirs(status_dir)

    # Bring in the new HF proportion inputs.
    new_df = pd.read_csv(
        '{in_path}heart_failure_target_props_subnat.csv'.format(
            in_path=in_path))

    # Filter out Sub-Saharan Africa.
    super_regions = get_location_metadata(location_set_id=35)[
        ['location_id', 'super_region_id']]
    new_df = new_df.merge(super_regions, on='location_id', how='inner')
    new_df = new_df.query('super_region_id != 166')
    new_df.drop('super_region_id', axis=1, inplace=True)

    count = 0
    for bundle_id, nid, cause_id, me_id in zip(bundle_ids, nids, cause_ids,
                                               me_ids):
        #### For debugging ####
        fix = True
        # fix = False
        #### For debugging ####

        print(cause_id)
        print(nid)
        print(me_id)

        if fix:
            # Fill in some necessary columns for the merge/replacement of
            # pre-existing input data.
            new_inputs = new_df.query(
                'cause_id == {cause_id}'.format(cause_id=cause_id))
            new_inputs.drop('cause_id', axis=1, inplace=True)
            new_inputs['measure_id'] = 18
            new_inputs['nid'] = nid
            new_inputs['bundle_id'] = bundle_id
            new_inputs.rename(columns={'hf_target_prop': 'mean',
                                       'std_err_adj': 'standard_error'},
                              inplace=True)

            # recode age groups to age range
            q0 = ('SELECT age_group_id, age_group_years_start AS age_start, '
                  'age_group_years_end AS age_end '
                  'FROM shared.age_group')
            age_df = query(q0, conn_def="shared")
            age_df['age_end'] = age_df.apply(age_fix, axis=1)
            new_inputs = new_inputs.merge(age_df, on='age_group_id',
                                          how='inner')
            new_inputs.drop('age_group_id', axis=1, inplace=True)

            # recode sex IDs to sex names
            new_inputs.rename(columns={'sex_id': 'sex'}, inplace=True)
            sexes = new_inputs['sex']
            sexes = sexes.apply(sex_fix)
            new_inputs['sex'] = sexes

            # set years (put year_end as 2015 for merging purposes, recode
            # to 2016)
            new_inputs['year_start'] = 1990
            new_inputs['year_end'] = 2016

            # fill in "seqs"
            new_inputs = assign_row_nums(new_inputs, bundle_id, nid, me_id)

        # Write the upload sheet as an Excel sheet labeled "extraction" --
        # this is the format required by the Epi Uploader
        writer = pd.ExcelWriter(
            '{out_dir}new_inputs_{bundle_id}.xlsx'.format(
                out_dir=out_dir, bundle_id=bundle_id), engine='xlsxwriter')
        new_inputs.to_excel(writer, sheet_name='extraction', index=False,
                            encoding='utf-8')
        writer.save()
        print(new_inputs.shape)
        count += 1

    print("{0} main/composite etiology inputs ready to upload to {0} bundle "
          "IDs".format(count))
def assign_row_nums(df, bundle_id, nid, me_id):
    """Fills in missing seqs in input dataframe

    Args:
        df (object): pandas dataframe object of input data
        bundle_id (int): bundle_id for the dataset in memory
        nid (int): nid for the dataset in memory
        me_id (int): modelable_entity_id for the dataset in memory

    Returns:
        Returns a copy of the dataframe with the seqs filled in,
        incrementing from the max of the database for the given
        modelable_entity.
    """
    # variable used for indicating if Epi database row deletion is necessary
    delete_rows = False

    # necessary columns
    needed_cols = ['year_start', 'year_end', 'age_start', 'age_end', 'sex',
                   'location_id', 'mean', 'standard_error', 'measure_id',
                   'nid', 'bundle_id']
    index_cols = ['location_id', 'year_start', 'year_end', 'age_start',
                  'age_end', 'sex', 'measure_id', 'nid', 'bundle_id']

    # time stamp for upload metadata
    Time = timestamp()

    # Query that pulls the data for the unique bundle ID
    q = ('''SELECT seq, location_id, year_start, year_end, age_start,
            age_end, sex_id, measure_id, nid, bundle_id
            FROM epi.bundle_dismod
            WHERE bundle_id={bundle_id} AND nid={nid};'''.format(
        bundle_id=bundle_id, nid=nid))

    # execute query
    data = query(q, conn_def="epi")

    # get row numbers for all rows including for other NIDS
    all_seqs = data['seq']

    # recode sex_id to be sex names
    data.rename(columns={'sex_id': 'sex'}, inplace=True)
    sexes = data['sex']
    sexes = sexes.apply(sex_fix)
    data['sex'] = sexes

    # recode year to be current latest year
    data['year_end'] = 2016

    # get location names
    locations_df = get_location_metadata(location_set_id=35)[
        ['location_id', 'location_name']]

    # if the data pulled has zero rows, then make new row numbers
    if len(data) == 0:
        df['seq'] = np.nan
        # drop all unneeded columns
        df = df[needed_cols + ['seq']]
        # append location names
        df = df.merge(locations_df, on='location_id', how='inner')
    else:
        # make an identifier "new" for the data to check the merge
        df['new'] = 1
        # perform an outer merge of the new data on old data on location_id,
        # years, ages, sex, measure, nid and bundle
        df = df.merge(data, on=index_cols, how='outer')

        # rows where "seq" is null need to be inserted
        null_df = df[df['seq'].isnull()]
        # rows where "new" is null need to be deleted
        no_match = df[df['new'].isnull()]
        print("LENGTH", len(df))

        # take all the rows where all the index columns matched (the inner
        # merge); these are the rows to be updated
        df = df[(df['seq'].notnull()) & (df['mean'].notnull()) &
                (df['new'].notnull())]
        # drop all unnecessary columns
        df = df[needed_cols + ['seq']]
        print(df.seq.unique())

        # if it wasn't a perfect merge -- if there are nulls, then
        print("LENGTH", len(null_df))
        print("LENGTH", len(no_match))
        print("LENGTH", len(df))
        if len(null_df) != 0 or len(no_match) != 0:
            # drop the null row numbers (null "seq")
            null_df = null_df[needed_cols]
            # Append location name.
            null_df = null_df.merge(locations_df, on='location_id',
                                    how='inner')

            # If the number of rows to be deleted is greater than the number
            # of rows that need to be inserted then
            if len(no_match) > len(null_df):
                # get the row numbers of the rows that need to be deleted;
                # the leftover rows will be replaced by those to be inserted
                replace_seqs = no_match['seq'].tolist()[:len(null_df)]
                for seq in replace_seqs:
                    assert seq not in df.seq.unique(), \
                        "seq {} is a duplicate.".format(seq)
                get_rid = no_match.query(
                    'seq not in {}'.format(replace_seqs))

                # make all columns of get_rid empty except seq
                blank_cols = [
                    'bundle_id', 'nid', 'location_id', 'sex', 'mean',
                    'standard_error', 'measure_id', 'year_start', 'year_end',
                    'age_start', 'age_end', 'unit_type', 'unit_type_value',
                    'measure_issue', 'uncertainty_type',
                    'uncertainty_type_value', 'extractor',
                    'representative_name', 'urbanicity_type',
                    'response_rate', 'sampling_type', 'recall_type',
                    'recall_type_value', 'case_name', 'case_definition',
                    'case_diagnostics', 'note_modeler', 'cv_hospital',
                    'cv_marketscan', 'cv_low_income_hosp',
                    'cv_high_income_hosp', 'is_outlier', 'cases', 'measure',
                    'sample_size', 'effective_sample_size', 'source_type',
                    'underlying_nid', 'input_type', 'design_effect',
                    'unit_value_as_published', 'date_inserted',
                    'last_updated', 'inserted_by', 'last_updated_by',
                    'upper', 'lower']
                for col in blank_cols:
                    get_rid[col] = np.nan

                # flip the "delete rows" indicator to True
                delete_rows = True

            # otherwise the rows to be deleted are replaced until new rows
            # need to be inserted entirely: make the row numbers blank
            else:
                null_df['seq'] = np.nan
                null_df.reset_index(inplace=True)
                null_df.drop('index', axis=1, inplace=True)
                replace_seqs = no_match['seq'].tolist()
                null_df.loc[0:len(replace_seqs) - 1, 'seq'] = replace_seqs

            # and append them to those being updated
            df = df.append(null_df)

    # check if row nums assigned properly
    print(len(df[df.seq.notnull()]))
    print(len(df[df.seq.isnull()]))
    assert not any(df[df.seq.notnull()].seq.duplicated()), \
        "Duplicate row numbers assigned"

    # fill in columns required by the Epi Uploader
    df['unit_type'] = "Person"
    df['unit_type_value'] = 2.0
    df['measure_issue'] = 0.0
    df['uncertainty_type'] = "Standard error"
    # df['uncertainty_type_id'] = 1
    df['uncertainty_type_value'] = np.nan
    df['extractor'] = "USER"
    df['representative_name'] = "Nationally and subnationally representative"
    df['urbanicity_type'] = "Unknown"
    df['response_rate'] = np.nan
    df['sampling_type'] = np.nan
    df['recall_type'] = "Point"
    df['recall_type_value'] = 1.0
    df['case_name'] = np.nan
    df['case_definition'] = np.nan
    df['case_diagnostics'] = np.nan
    df['note_modeler'] = ('Proportion generated from CODEm deaths using '
                          'Marketscan data')
    df['cv_hospital'] = 0
    df['cv_marketscan'] = 1
    df['cv_low_income_hosp'] = 0
    df['cv_high_income_hosp'] = 0
    df['is_outlier'] = 0
    df['cases'] = np.nan
    df['measure'] = "proportion"
    df['sample_size'] = np.nan
    df['effective_sample_size'] = np.nan
    df['source_type'] = "Mixed or estimation"
    df['underlying_nid'] = np.nan
    df['input_type'] = "extracted"
    df['design_effect'] = np.nan
    df['unit_value_as_published'] = 1
    df['date_inserted'] = Time
    df['last_updated'] = Time
    df['inserted_by'] = "USERNAME"
    df['last_updated_by'] = "USERNAME"
    df['upper'] = np.nan
    df['lower'] = np.nan

    # Query the Epi database for modelable entity names
    q1 = '''SELECT modelable_entity_name
            FROM epi.modelable_entity
            WHERE modelable_entity_id={};'''.format(me_id)
    me_name = str(query(q1, conn_def="epi").loc[0, 'modelable_entity_name'])
    df['modelable_entity_id'] = me_id
    df['modelable_entity_name'] = me_name

    # If the "delete rows" indicator is on, append the rows set up to be
    # deleted: leaving only NID, bundle ID, and seq (row number)
    if delete_rows:
        df = df.append(get_rid)

    return df
def run_master(root_dir, envr, sweep_lt, sweep_yld, sweep_hale, prep_lt,
               prep_yld, calc_hale, summarize, upload_hale, n_draws,
               loc_set_id, year_id, yld_version, local, test_location,
               custom_lt, log_dir='DIRECTORY'):
    ###############################################
    # Start jobmon and launch different jobs. Also
    # set up directories, and run get_population
    # to cache pop for compile_yld file
    ###############################################
    if not os.path.isdir(log_dir):
        os.mkdir(log_dir)

    if local:
        out_dir = root_dir
    else:
        out_dir = 'DIRECTORY'

    parameter_csv.run_param(envr, yld_version, loc_set_id, year_id,
                            gbd_round_id=GBD_ROUND_ID)
    param_sheet = pd.read_csv('%s/inputs/parameters.csv' % root_dir)
    param_sheet = param_sheet.loc[param_sheet['status'] == 'best']

    hale_version = param_sheet['hale_version'].item()
    mort_version = param_sheet['mort_run'].item()
    print('HALE VERSION IS {}'.format(hale_version))
    print('MORT VERSION IS {}'.format(mort_version))
    print('YLD VERSION IS {}'.format(yld_version))

    prog_dir = '%s/v%s' % (out_dir, hale_version)
    draw_dir = '%s/draws' % prog_dir
    summ_dir = '%s/summaries' % prog_dir
    for direc in [prog_dir, draw_dir, summ_dir]:
        if not os.path.isdir(direc):
            os.mkdir(direc)
        os.chmod(direc, 0o777)

    if custom_lt is not None:
        lt_in = custom_lt
    else:
        lt_in = "DIRECTORY"

    lt_tmp = '%s/lt' % draw_dir
    lt_dir = '%s/lt' % summ_dir
    yld_tmp = '%s/yld' % draw_dir
    yld_dir = '%s/yld' % summ_dir
    hale_tmp = '%s/results' % draw_dir
    hale_dir = '%s/results' % summ_dir

    sweep([lt_tmp, lt_dir], sweep_lt)
    sweep([yld_tmp, yld_dir], sweep_yld)
    sweep([hale_tmp, hale_dir], sweep_hale)

    err = glob('{}/*.e*'.format(log_dir))
    out = glob('{}/*.o*'.format(log_dir))
    ps = glob('{}/*.p*'.format(log_dir))
    for log in err + out + ps:
        os.remove(log)

    if test_location is not None:
        locations = [test_location]
    else:
        locations = []
        for location_set in loc_set_id:
            location_meta = get_location_metadata(
                location_set_id=location_set, gbd_round_id=GBD_ROUND_ID)
            location_meta = location_meta.loc[
                location_meta['location_id'] != 44620]
            locs = location_meta['location_id'].unique().tolist()
            locations = locations + locs
        locations = list(set(locations))

    year_draws = list(zip(year_id, n_draws))

    d_str = "[%m/%d/%Y %H:%M:%S]"
    wf = Workflow('HALE_{}'.format(datetime.now().strftime(d_str)),
                  project='proj_hale', stderr=log_dir, stdout=log_dir)

    print('Building DAG')
    if prep_lt:
        lt_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = ['--lt_in', lt_in,
                        '--lt_tmp', lt_tmp,
                        '--location', location,
                        '--year', year,
                        '--n_draws', draws]
                script = os.path.join(root_dir, '01_compile_lt.py')
                name = 'lt_{}_{}_prep'.format(location, year)
                lt_task[(location, year)] = PythonTask(script=script,
                                                       args=args,
                                                       name=name,
                                                       slots=4,
                                                       mem_free=8,
                                                       max_attempts=3,
                                                       tag='lt_prep')
                wf.add_task(lt_task[(location, year)])

    if prep_yld:
        population = get_population(location_id=locations,
                                    year_id=year_id,
                                    age_group_id='all',
                                    sex_id='all',
                                    gbd_round_id=GBD_ROUND_ID)
        population.drop('run_id', axis=1, inplace=True)
        population.set_index('location_id', inplace=True)
        population.to_csv('%s/inputs/pop.csv' % root_dir)
        yld_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = ['--yld_tmp', yld_tmp,
                        '--root_dir', root_dir,
                        '--location', location,
                        '--yld_version', yld_version,
                        '--year', year,
                        '--n_draws', draws]
                script = os.path.join(root_dir, '02_compile_yld.py')
                name = 'yld_{}_{}_prep'.format(location, year)
                yld_task[(location, year)] = PythonTask(script=script,
                                                        args=args,
                                                        name=name,
                                                        slots=4,
                                                        mem_free=8,
                                                        max_attempts=3,
                                                        tag='yld_prep')
                wf.add_task(yld_task[(location, year)])

    if calc_hale:
        hale_task = {}
        for location in locations:
            for year in year_id:
                if prep_yld and prep_lt:
                    upstream_tasks = [lt_task[(location, year)],
                                      yld_task[(location, year)]]
                elif prep_yld:
                    upstream_tasks = [yld_task[(location, year)]]
                elif prep_lt:
                    upstream_tasks = [lt_task[(location, year)]]
                else:
                    upstream_tasks = None
                args = ['--hale_tmp', hale_tmp,
                        '--lt_tmp', lt_tmp,
                        '--yld_tmp', yld_tmp,
                        '--location', location,
                        '--year', year]
                script = os.path.join(root_dir, '03_calc_hale.py')
                name = 'hale_{}_{}_calc'.format(location, year)
                hale_task[(location, year)] = PythonTask(
                    script=script, args=args, name=name, slots=4,
                    mem_free=8, max_attempts=3, tag='hale_calc',
                    upstream_tasks=upstream_tasks)
                wf.add_task(hale_task[(location, year)])

    if summarize:
        summary_task = {}
        for location in locations:
            if calc_hale:
                upstream_tasks = [hale_task[(location, year)]
                                  for year in year_id]
            else:
                upstream_tasks = None
            args = ['--lt_tmp', lt_tmp,
                    '--lt_dir', lt_dir,
                    '--yld_tmp', yld_tmp,
                    '--yld_dir', yld_dir,
                    '--hale_tmp', hale_tmp,
                    '--hale_dir', hale_dir,
                    '--location', location]
            script = os.path.join(root_dir, '04_calc_summaries.py')
            name = 'summary_{}_calc'.format(location)
            summary_task[location] = PythonTask(
                script=script, args=args, name=name, slots=4, mem_free=8,
                max_attempts=3, tag='summarize',
                upstream_tasks=upstream_tasks)
            wf.add_task(summary_task[location])

    if upload_hale:
        if summarize:
            upstream_tasks = [summary_task[loc] for loc in locations]
        else:
            upstream_tasks = None
        args = ['--hale_version', hale_version,
                '--hale_dir', hale_dir,
                '--envr', envr]
        script = os.path.join(root_dir, '05_upload_hale.py')
        name = 'upload_hale'
        upload_task = PythonTask(script=script, args=args, name=name,
                                 slots=12, mem_free=24, max_attempts=3,
                                 tag='upload', upstream_tasks=upstream_tasks)
        wf.add_task(upload_task)

    print("executing workflow")
    integer_result = wf.execute()
    if integer_result:
        raise RuntimeError("Workflow failure")
    print("FINISHED")
                                 process_timeout=process_timeout,
                                 path_to_python_binary=path_to_python_binary,
                                 upstream_tasks=upstream_tasks)

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Start up DAG
d = datetime.datetime.now()
dag_name = "mort_u5_{}_{}".format(version_id, d.strftime("%Y%m%d%H%M"))
dag = TaskDag(name=dag_name)

# Get locations
location_hierarchy = get_location_metadata(location_set_id=21,
                                           gbd_round_id=5)
location_hierarchy = location_hierarchy.loc[
    (location_hierarchy['level'] >= 3) &
    (location_hierarchy['location_id'] != 6)]
ihme_loc_dict = make_ihme_loc_id_dict(location_hierarchy)

all_files = glob.glob((draws_dir + "*").format(version_id))

# Create tasks
u5_tasks = {}
for location_id in location_hierarchy['location_id'].tolist():
    output_file = (draws_dir + "{}.csv").format(version_id, location_id)
    if output_file not in all_files:
        print(output_file)
        ihme_loc_id = ihme_loc_dict[location_id]
        u5_tasks[location_id] = generate_u5_task(location_id, ihme_loc_id,
    args = vars(parser.parse_args())
    params_dir = args["params_dir"]
    draws_dir = args["draws_dir"]
    interms_dir = args["interms_dir"]
    logs_dir = args["logs_dir"]
else:
    params_dir = f"{data_root}/{cause}/FILEPATH"
    draws_dir = f"{data_root}/{cause}/FILEPATH"
    interms_dir = f"{data_root}/{cause}/FILEPATH"
    logs_dir = f"{data_root}/{cause}/FILEPATH"

### Define Constants
gbd_round_id = 7
decomp_step = "iterative"
loc_h = db.get_location_metadata(35)

### ======================= MAIN EXECUTION ======================= ###

### LOAD DRAWS
draws_df = pd.read_csv(f'{interms_dir}/FILEPATH')
draw_cols = draws_df.columns[draws_df.columns.str.contains('draw')]

### TRIM 2019 TO ENDEMIC LOCS
eth_subs = loc_h.loc[loc_h.parent_id == 179,
                     ['location_id', 'location_name']]
end_locations = loc_h.loc[
    loc_h.location_name.isin(['Chad', 'Mali', 'South Sudan']),
    ['location_id', 'location_name']]
end_locations = end_locations.append(eth_subs)

draws_df.loc[(~draws_df.location_id.isin(end_locations.location_id.unique())
              & (draws_df.year_id == 2019)
def outpatient_elmo(df, gbd_round_id, make_right_inclusive=True):
    """
    Prepares outpatient data for upload to the epi database. Adds and
    renames a lot of columns.

    Args:
        df (Pandas DataFrame): contains outpatient data at the bundle level.
        make_right_inclusive (bool): This switch changes values in the
            'age_demographer' column and the 'age_end' column.

            If True, 'age_demographer' will be set to 1 and age_end will be
            made to have values ending in 4s and 9s, e.g. 5-9, 10-14, ...
            That is, age_end is inclusive: a value of 9 in age_end means
            that 9 is included in the range.

            If False, 'age_demographer' will be set to 0 and age_end will be
            right exclusive, with values ending in 5s and 0s, e.g. 5-10,
            10-15, ... That is, a value of 10 in age_end means ages up to
            but not including 10.

    Returns:
        Data formatted and ready for uploading to Epi DB.
    """
    if make_right_inclusive:
        assert (df.loc[df.age_end > 1, 'age_end'].values % 5 == 0).all(),\
            """age_end appears not to be a multiple of 5, indicating that
            subtracting 1 is a bad move"""
        # make age_end inclusive (e.g. 10 -> 9)
        df.loc[df.age_end > 1, 'age_end'] = \
            df.loc[df.age_end > 1, 'age_end'] - 1
        df['age_demographer'] = 1
    else:
        assert (df.loc[df.age_end > 1, 'age_end'].values % 5 != 0).all(),\
            """age_end appears to be a multiple of 5, indicating that
            setting age_demographer to 0 is a bad move."""
        df['age_demographer'] = 0
    df.loc[df.age_end == 1, 'age_demographer'] = 0

    df = df.drop(['source', 'facility_id', 'metric_id'], axis=1)
    df.rename(columns={'representative_id': 'representative_name',
                       'val_inj_corrected': 'cases_inj_corrected',
                       'val_corrected': 'cases_corrected',
                       'val': 'cases_uncorrected',
                       'population': 'sample_size',
                       'sex_id': 'sex'}, inplace=True)

    representative_dictionary = {
        -1: "Not Set",
        0: "Unknown",
        1: "Nationally representative only",
        2: "Representative for subnational location only",
        3: "Not representative",
        4: "Nationally and subnationally representative",
        5: "Nationally and urban/rural representative",
        6: "Nationally, subnationally and urban/rural representative",
        7: "Representative for subnational location and below",
        8: "Representative for subnational location and urban/rural",
        9: "Representative for subnational location, urban/rural and below",
        10: "Representative of urban areas only",
        11: "Representative of rural areas only"
    }
    df.replace({'representative_name': representative_dictionary},
               inplace=True)

    df['source_type'] = 'Facility - outpatient'
    df['urbanicity_type'] = 'Unknown'
    df['recall_type'] = 'Not Set'
    df['unit_type'] = 'Person'
    df['unit_value_as_published'] = 1
    df['is_outlier'] = 0
    df['sex'].replace([1, 2], ['Male', 'Female'], inplace=True)
    df['measure'].replace(["prev", "inc"], ["prevalence", "incidence"],
                          inplace=True)

    df['mean'] = np.nan
    df['upper'] = np.nan
    df['lower'] = np.nan
    df['seq'] = np.nan
    df['underlying_nid'] = np.nan
    df['sampling_type'] = np.nan
    df['recall_type_value'] = np.nan
    df['uncertainty_type'] = np.nan
    df['uncertainty_type_value'] = np.nan
    df['input_type'] = np.nan
    df['standard_error'] = np.nan
    df['effective_sample_size'] = np.nan
    df['design_effect'] = np.nan
    df['response_rate'] = np.nan
    df['extractor'] = "USERNAME and USERNAME"

    loc_map = get_location_metadata(location_set_id=35,
                                    gbd_round_id=gbd_round_id)
    loc_map = loc_map[['location_id', 'location_name']]
    df = df.merge(loc_map, how='left', on='location_id')

    bundle_name_df = query("SQL", conn_def='epi')

    pre_shape = df.shape[0]
    df = df.merge(bundle_name_df, how="left", on="bundle_id")
    assert df.shape[0] == pre_shape, "added rows in merge"
    assert df.bundle_name.notnull().all().all(), 'bundle name df has nulls'

    print("DONE WITH ELMO")
    return df
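# Illustrative usage sketch (not part of the original pipeline):
# outpatient_elmo expects bundle-level outpatient data with the columns
# handled above; the gbd_round_id used here is an illustrative assumption.
def _demo_outpatient_elmo(bundle_df):
    elmo_ready = outpatient_elmo(bundle_df, gbd_round_id=5,
                                 make_right_inclusive=True)
    assert {'mean', 'upper', 'lower', 'seq'}.issubset(elmo_ready.columns)
    return elmo_ready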
import pandas as pd
from db_queries import get_cod_data
from db_queries import get_location_metadata

gbd_year = 2019
needed_years = range(1980, (gbd_year + 1))  # years used in dataframe

lm = get_location_metadata(location_set_id=22, gbd_round_id=6)

# grab the cod data for other hemog
df = get_cod_data(cause_id='618', gbd_round_id=7, decomp_step='step2')
# subset cod data to only include VR sources
df = df[df['data_type'] == 'Vital Registration']
df = df.merge(lm, on='location_id', how='left')
df = df[df['developed'] == u'1']
# subset relevant columns of data
df = df[['cause_id', 'location_id', 'year', 'age_group_id', 'sex', 'rate']]
# take the mean CSMR across location/year/age/sex combos
df = df.groupby(['location_id', 'year', 'age_group_id', 'sex']).mean()
df = df.reset_index()

pooled_dfs = []  # make an empty list to fill with dataframes of pooled years

'''
loop through each year in the years list,
1) define the set of years being pooled to this year
2) grab chunks of the df used for pooling
3) take the mean across these years
4) once pooled, update the entry in the year column
5) add the pooled year dataframe to the list
'''
for y in needed_years:
def main(ecode, ncode, platform, year, decomp, version, flat_version):
    toc = time.time()

    dems = db.get_demographics(gbd_team="epi", gbd_round_id=help.GBD_ROUND)
    dm_settings = os.path.join(paths.SHARE_DIR, 'dismod_settings')
    version = version.rstrip()
    dm_dir = os.path.join(paths.DATA_DIR, decomp,
                          inj_info.ECODE_PARENT[ecode], str(version),
                          "dismod_ode", ecode)

    metaloc = db.get_location_metadata(location_set_id=35,
                                       gbd_round_id=help.GBD_ROUND)
    filepath = write_path(ecode, ncode, platform, year, decomp, version)
    locations = help.ihme_loc_id_dict(metaloc, dems['location_id'])

    alldata = []
    value_in = os.path.join(dm_dir, "value_in",
                            "value_in_{}_{}.csv".format(ncode, platform))
    draw_in = os.path.join(dm_settings, "draw_in.csv")
    plain_in = os.path.join(dm_settings, "plain_in.csv")
    effect_in = os.path.join(dm_settings, "effect_in.csv")

    v_in = pd.read_csv(value_in)

    num_locs = len(locations)
    loc_pos = 0
    initime = help.start_timer()

    for locn in locations:
        loc_pos = loc_pos + 1
        for sex in [1, 2]:
            start = help.start_timer()

            if float(v_in.loc[v_in['name'] == 'eta_incidence',
                              'value'][0]) == 0:
                # when eta_incidence is 0, skip the ODE run and output zero
                # draws for every age group
                result = pd.DataFrame({'age_group_id': dems['age_group_id']})
                result = result.assign(**{d: 0 for d in help.drawcols()})
                result = help.convert_from_age_group_id(result)
            else:
                data_in = os.path.join(
                    dm_dir, "data_in", locations[locn], str(year), str(sex),
                    ecode, "data_in_{}_{}.csv".format(ncode, platform))
                if ncode in inj_info.EMR_NCODES:
                    rate_in_name = "rate_in_emr.csv"
                else:
                    rate_in_name = "rate_in_no_emr.csv"
                rate_in = os.path.join(paths.DATA_DIR, 'flats',
                                       str(flat_version), 'rate_in',
                                       str(year), str(sex), locations[locn],
                                       rate_in_name)
                draw_out_dir = os.path.join(dm_dir, "prev_results",
                                            locations[locn], str(year),
                                            str(sex))
                draw_out = os.path.join(
                    draw_out_dir,
                    "prevalence_{}_{}.csv".format(ncode, platform))
                if not os.path.exists(draw_out_dir):
                    try:
                        os.makedirs(draw_out_dir)
                    except OSError as e:
                        if e.errno != os.errno.EEXIST:
                            raise
                        pass
                result = run_model_injuries(draw_in, data_in, value_in,
                                            plain_in, rate_in, effect_in,
                                            draw_out, 1000)

            result['location_id'] = locn
            result['platform'] = platform
            result['year_id'] = year
            result['sex_id'] = sex
            alldata.append(result)

            help.end_timer(start)
            sys.stdout.flush()  # write to log file

    total_time = (time.time() - initime) / 60.

    final = pd.concat(alldata)
    write_results(final, ecode, ncode, platform, year, decomp, version)
    tic = time.time()
def get_most_detailed(location_set, gbd_round):
    location_df = get_location_metadata(location_set_id=location_set,
                                        gbd_round_id=gbd_round)
    location_df = location_df[location_df['most_detailed'] == 1]
    location_list = location_df['location_id'].tolist()
    return location_list
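# Illustrative usage sketch (not part of the original pipeline): the
# location_set and gbd_round arguments below are illustrative assumptions.
def _demo_get_most_detailed():
    most_detailed_locs = get_most_detailed(location_set=35, gbd_round=6)
    print("{} most-detailed locations".format(len(most_detailed_locs)))
    return most_detailed_locs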
def main():
    drawdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
               'splits/02_exposure_data/final_forecast')
    filepattern = '*.csv'
    files = glob(os.path.join(drawdir, filepattern))

    me_name_to_meid = {
        'stunting_mild': 10557,
        'stunting_moderate': 10556,
        'stunting_severe': 8949,
        'underweight_mild': 10561,
        'underweight_moderate': 10560,
        'underweight_severe': 2540,
        'wasting_mild': 10559,
        'wasting_moderate': 10558,
        'wasting_severe': 8945
    }

    # Location Metadata
    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]

    # Generate locations to keep: 188 + parents
    locs_to_keep = create_locs_to_keep(locs)
    custom_loc_df = locs[locs.location_id.isin(locs_to_keep)]

    # create main custom tree
    print('{} creating custom tree'.format(pretty_now()))
    custom_tree = create_custom_tree(custom_loc_df)

    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['lower', 'mean', 'upper']

    # Create SDI trees
    sdi_locs = get_location_metadata(location_set_id=40, gbd_round_id=4)
    sdi_locs = sdi_locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    sdi_ids = [44635, 44634, 44639, 44636, 44637]
    sdi_trees = []
    for _id in sdi_ids:
        print('{} creating sdi tree for {}'.format(pretty_now(), _id))
        thisdf = sdi_locs[sdi_locs.parent_id == _id]
        thisdf = thisdf[thisdf.location_id.isin(locs_to_keep + [_id])]
        sdi_trees.append(create_custom_tree(thisdf))

    # get population
    pops = get_pop()

    for _file in files:
        # Define me_name
        me_name = parse_me_name(_file)
        meid = me_name_to_meid[me_name]
        print('{} processing file: {}'.format(pretty_now(), meid))
        df = pd.read_csv(_file)
        df.rename(columns={'worse': 'lower',
                           'reference': 'mean',
                           'better': 'upper'}, inplace=True)
        df = df[['location_id', 'age_group_id', 'sex_id', 'year_id',
                 'lower', 'mean', 'upper']]
        df['modelable_entity_id'] = meid

        # Remove bad locations
        bad_locs = [298, 305, 349, 351, 376, 385, 422, 433, 434, 4636, 4749]
        df = df[~df.location_id.isin(bad_locs)]

        # convert to counts
        print('{} convert to counts before aggregation'.format(pretty_now()))
        df = df.merge(pops, on=index_cols, how='left')
        for i in data_cols:
            df[i] = df[i] * df['pop_scaled']

        # aggregate all trees
        print('{} agg custom loc tree'.format(pretty_now()))
        agg_results = agg_hierarchy(custom_tree, df, index_cols, data_cols,
                                    dimension='location_id')
        for sdi_tree in sdi_trees:
            print('{} agg sdi tree for: {}'.format(pretty_now(),
                                                   sdi_tree.root))
            this_agg = agg_hierarchy(sdi_tree, df, index_cols, data_cols,
                                     dimension='location_id')
            this_agg = this_agg[this_agg.location_id.isin(sdi_ids)]
            agg_results = agg_results.append(this_agg)

        # copy data and set metric id to 1 for counts
        print('{} copy counts to new df'.format(pretty_now()))
        agg_counts = agg_results.copy()
        agg_counts = agg_counts[agg_counts.sex_id.isin([1, 2])]
        sex_agg = agg_sexes(agg_counts, pops)
        agg_counts = agg_counts.append(sex_agg)
        agg_counts['metric_id'] = 1

        # convert back to rate space
        print('{} converting back to rate space'.format(pretty_now()))
        agg_results = agg_results.merge(pops, on=index_cols, how='left')
        for i in data_cols:
            agg_results[i] = agg_results[i] / agg_results['pop_scaled']
        agg_results['metric_id'] = 3

        print('{} append counts to rate df'.format(pretty_now()))
        agg_results = agg_results.append(agg_counts)

        outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_'
                  'age_splits/03_exposure_loc_aggregates/02_forecast_'
                  'prevalence/{}'.format(meid))
        outfile = '{}_prevalence_estimates.csv'.format(meid)
        print('{} saving as csv'.format(pretty_now()))
        agg_results.to_csv(os.path.join(outdir, outfile), index=False)
        print('{} finished processing meid: {}'.format(pretty_now(), meid))
def match_to_gbd_locations(in_df,
                           location_set_id=21,
                           fuzzy_match=True,
                           fm_top_cutoff=90,
                           fm_dist_cutoff=20):
    print("Beginning direct string matching to GBD locations...")
    # Get location metadata for matching
    gbd_meta = get_location_metadata(location_set_id=location_set_id)

    # Add some columns to the input dataframe without suffixes
    for col in ['admin1', 'admin2', 'admin3', 'location']:
        in_df["{}__short".format(col)] = in_df[col].apply(remove_suffixes)
    for col in ['location_name', 'location_name_short',
                'location_ascii_name']:
        gbd_meta["{}__short".format(col)] = gbd_meta[col].apply(
            remove_suffixes)

    # Build the list of columns that will be matched, in order
    in_df_cols = list()
    meta_cols = list()
    for in_named_col in ['admin1', 'admin2', 'admin3', 'location']:
        for meta_named_col in ['location_name', 'location_name_short',
                               'location_ascii_name']:
            for suffix1 in ["", "__short"]:
                for suffix2 in ["", "__short"]:
                    in_df_cols.append("{}{}".format(in_named_col, suffix1))
                    meta_cols.append("{}{}".format(meta_named_col, suffix2))
    in_df_cols = in_df_cols + ["location_id", "iso", "country", "country",
                               "country"]
    meta_cols = meta_cols + ["location_id", "ihme_loc_id", "location_name",
                             "location_name_short", "location_ascii_name"]

    # Iteratively merge, adding only NaN columns on each merge
    # Subset to most detailed for a first run
    meta_most_detailed = gbd_meta.loc[gbd_meta['most_detailed'] == 1, :]
    # FIRST, run only the most detailed locations
    in_df['location_id_matched'] = np.nan
    joined_df = unified_location_column(
        in_df,
        match_df=meta_most_detailed,
        location_columns=in_df_cols,
        match_columns=meta_cols,
        match_column_to_add='location_id',
        new_column_name="location_id_matched")
    # NEXT, run on all locations to catch any not-most-detailed location
    # matches
    joined_df = unified_location_column(
        in_df,
        match_df=gbd_meta,
        location_columns=in_df_cols,
        match_columns=meta_cols,
        match_column_to_add='location_id',
        new_column_name="location_id_matched")
    # Use the new location data to join on the 'most-detailed' column
    joined_df = pd.merge(
        left=joined_df,
        right=(gbd_meta.loc[:, ['location_id', 'most_detailed']].rename(
            columns={'location_id': 'location_id_matched',
                     'most_detailed': 'already_located'})),
        on='location_id_matched',
        how='left')
    # If fuzzy_match is true, try using fuzzy string matching to match
    # countries to their subnational locations
    if fuzzy_match:
        print("Beginning fuzzy matching to GBD locations...")
        joined_df = fuzzy_match_subnationals(
            in_df=joined_df,
            loc_metadata=gbd_meta,
            top_score_cutoff=fm_top_cutoff,
            dist_to_second_score_cutoff=fm_dist_cutoff)
    # Cleanup
    joined_df.loc[joined_df['location_id'].isnull(),
                  'location_id'] = joined_df.loc[
        joined_df['location_id'].isnull(), 'location_id_matched']
    joined_df = joined_df.drop(
        ['admin1__short', 'admin2__short', 'admin3__short',
         'location_id_matched'], axis=1)
    return joined_df
def main(ecode, ncode, platform, version):
    start = help.start_timer()

    parent = inj_info.ECODE_PARENT[ecode]
    flat_version = versions.get_env(parent, version)

    # get demographics
    print("1. Getting demographic, location, and long-term probabilities...")
    dems = db.get_demographics(gbd_team="epi", gbd_round_id=help.GBD_ROUND)
    metaloc = db.get_location_metadata(location_set_id=35,
                                       gbd_round_id=help.GBD_ROUND)
    locations = help.ihme_loc_id_dict(metaloc, dems['location_id'])

    # get long-term probabilities that will be used and long-term
    # standardized-mortality ratios
    lt_probs = calculate_measures.long_term_probs_combined(ncode=ncode)
    smr = load_measures.smr(ncode)

    # define DisMod ODE input directory
    dm_out_dir = os.path.join("FILEPATH")

    # make the sub-directory for data in files
    folder = os.path.join("FILEPATH")
    if not os.path.exists(folder):
        try:
            os.makedirs(folder)
        except OSError as e:
            if e.errno != os.errno.EEXIST:
                raise
            pass

    print("2. Looping through years and sexes to make rate-in and data-in "
          "files.")
    value_data = []
    for year in dems["year_id"]:
        for sex in dems["sex_id"]:
            measures = {}
            print('Working on year {} sex {}'.format(year, sex))
            incidence = calculate_measures.long_term_incidence(
                ecode, version, ncode, platform, year, sex, lt_probs)
            inc_mean = incidence.mean(dim='draw')
            # if the value is less than one in a trillion, set it to 0.
            # Otherwise, DisMod can have an overflow issue where it sets
            # prevalence to 100%
            inc_summary = xr.merge([
                inc_mean.where(inc_mean > .000000000001, 0)
                        .rename('meas_value'),
                incidence.std(dim='draw').rename('meas_stdev')])
            measures['incidence'] = inc_summary

            if ncode in inj_info.EMR_NCODES:
                emr = calculate_measures.emr(smr, year, sex, flat_version)
                emr_summary = xr.merge([
                    emr.mean(dim='draw').rename('meas_value'),
                    emr.std(dim='draw').rename('meas_stdev')])
                measures['mtexcess'] = emr_summary

            print('Making data in')
            data = make_data_in(measures, ecode, version, ncode, platform,
                                locations, year, sex)
            value_data.append(data)

            sys.stdout.flush()

    print("Finished making data in files.")
    print("4. Now making the value-in file with the saved data from data in "
          "process...")
    make_value_in(value_data, ecode, ncode, platform, dm_out_dir)

    help.end_timer(start)
def apply_corrections(df, use_modified):
    """
    Applies the marketscan correction factors to the hospital data at the
    bundle level. The corrections are merged on by 'age_start', 'sex_id',
    and 'bundle_id'. With the new cf uncertainty our process has been
    updated and this only applies to the sources with full care coverage.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the bundle level.
        use_modified: (bool)
            If True, read the modeled/modified correction factors instead
            of the smoothed draw files.
    """
    assert "bundle_id" in df.columns, "'bundle_id' must exist."
    assert "nonfatal_cause_name" not in df.columns, (
        "df cannot be at the baby sequelae level")

    start_columns = df.columns

    # get a list of files, 1 for each type of CF
    if use_modified:
        corr_files = glob.glob(root + r"{FILEPATH}/mod_*.csv")
        idx = -4
        id_cols = ['age_start', 'sex_id', 'cf_location_id', 'bundle_id']
    else:
        corr_files = glob.glob(root + r"{FILEPATH}/*sm.csv")
        idx = -6
        id_cols = ['age_start', 'sex', 'bundle_id']

    corr_list = []  # to append the CF DFs to
    cf_names = []   # to apply the cfs
    for f in corr_files:
        # pull out the name of the correction type
        draw_name = os.path.basename(f)[:idx]
        if use_modified:
            draw_name = draw_name[4:]
        cf_names.append(draw_name)

        # read in a file
        dat = pd.read_csv(f)

        # rename the mean draw name cols back to just draw name
        if use_modified:
            dat.rename(columns={'mean_' + draw_name: draw_name},
                       inplace=True)
        if "Unnamed: 0" in dat.columns:
            dat.drop("Unnamed: 0", axis=1, inplace=True)

        pre_rows = dat.shape[0]
        # only need to take the mean if it's not modeled/modified CF data
        if not use_modified:
            # get the draw col names
            draw_cols = dat.filter(regex=draw_name).columns
            assert len(draw_cols) == 1000, "wrong number of draw cols"
            # create the single mean value from all the draws
            dat[draw_name] = dat[draw_cols].mean(axis=1)
            # drop the draw cols
            dat.drop(draw_cols, axis=1, inplace=True)

        assert dat.shape[0] == pre_rows, "The number of rows changed"

        corr_list.append(dat)
        del dat

    # merge the dataframes in the list together
    correction_factors = functools.reduce(
        lambda x, y: pd.merge(x, y, on=id_cols), corr_list)
    if 'sex' in correction_factors.columns:
        # rename columns to match df
        correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # switch from age group id to age start/end
    df = hosp_prep.group_id_start_end_switcher(df)
    # switch from sex to sex id in our identifier columns
    id_cols = [f + "_id" if f == "sex" else f for f in id_cols]

    pre_shape = df.shape[0]
    if not use_modified:
        # merge corr factors onto data
        df = df.merge(correction_factors, how='left', on=id_cols)
    if use_modified:
        # merge country id, aka cf loc id, onto the data in order for the
        # later merge to work
        locs = get_location_metadata(location_set_id=35)[[
            'location_id', 'path_to_top_parent'
        ]]
        locs = pd.concat(
            [locs, locs.path_to_top_parent.str.split(",", expand=True)],
            axis=1)
        locs = locs[locs[3].notnull()]
        locs['cf_location_id'] = locs[3].astype(int)
        locs = locs[['cf_location_id', 'location_id']]
        df = df.merge(locs, how='left', on='location_id')

        # merge CFs onto hosp data
        df = df.merge(correction_factors, how='left', on=id_cols)

    assert pre_shape == df.shape[0], (
        "You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")

    # drop unneeded cols
    for col in ['super_region_id', 'model_prediction', 'cf_location_id']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

    # apply the mean, smoothed corr factors without the env to covered
    # sources
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # apply the corrections.
    for level in cf_names:
        df.loc[df.source.isin(full_coverage_sources), "mean_" + level] = \
            df.loc[df.source.isin(full_coverage_sources), "mean_raw"] * \
            df.loc[df.source.isin(full_coverage_sources), level]

    # switch from age_start and age_end back to age_group_id
    df = hosp_prep.group_id_start_end_switcher(df)

    # drop the CF cols. We'll add them manually later for all sources
    df.drop(cf_names, axis=1, inplace=True)

    assert set(start_columns).issubset(set(df.columns)), """
    Some columns that were present at the start are missing now"""

    return df
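# Illustrative sketch (not part of the original pipeline) of the correction
# factor arithmetic applied above for fully covered sources: each corrected
# mean is the raw mean scaled by the matching factor and stored under
# "mean_" + the cf name. The numbers below are made up.
def _demo_cf_scaling():
    mean_raw = 0.012   # uncorrected bundle-level rate
    cf_value = 1.35    # example correction factor for one cf type
    return mean_raw * cf_value  # value written to df["mean_<cf name>"]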
logging.info("Creating draw source and sink.") draw_dir = os.path.join(parent_dir, 'aggregated/{}'.format(df_type)) input_pattern = '{measure_id}_{location_id}_{year_id}.h5' source_config = {'draw_dir': draw_dir, 'file_pattern': input_pattern} draw_source = DrawSource(source_config) output_pattern = '{measure_id}_{location_id}_{year_id}.h5' sink_config = { 'draw_dir': draw_dir, 'file_pattern': output_pattern, 'h5_tablename': 'draws' } draw_sink = DrawSink(sink_config) # Apply regional scalar transform region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID, location_set_id=35) region_locs = region_locs[region_locs.level == 2].location_id.tolist() draw_sink.add_transform(apply_regional_scalars, region_locs=region_locs, parent_dir=parent_dir) draw_sink.add_transform(transform_add_measure, measure_id=measure_id) # create operator logging.info("Reading regional scalars from flatfiles.") index_cols = [col for col in index_cols if col != 'location_id'] operator = Sum(index_cols, draw_cols) # Aggregate logging.info("Instantiate aggregator.aggregators.AggMemEff.") aggregator = AggMemEff(draw_source=draw_source, draw_sink=draw_sink,
df.groupby("year_start").agg({"contacts": "sum", "patients": "sum"}).reset_index() # check the diagnosis col (df.diagnosis.str.upper() == df.diagnosis).all() assert (df.diagnosis.str.upper() == df.diagnosis).all() # make sure nulls aren't introduced county_nulls = df.county.isnull().sum() # manually adjust the county names to fit the spelling in the IHME location table df.loc[df.county == "Finnmark", 'county'] = "Finmark" df.loc[df.county == "Hedmark", 'county'] = "Hedemark" df.loc[df.county.isin(["Nord-Trondelag", "Sor-Trondelag"]), 'county'] = "Trondelag" df[df.county.isnull()].patients.sum() / float(df.patients.sum()) locs = get_location_metadata(QUERY) loc_subnats = locs.loc[locs.parent_id == 90, ['location_ascii_name', 'location_id']] loc_subnats.head(2) assert set(df.county.unique()) - set(loc_subnats.location_ascii_name.unique()) == set([np.nan]) assert set(loc_subnats.location_ascii_name.unique()) - set(df.county.unique()) == set() # drop national location id df.drop('location_id', axis=1, inplace=True) df.head(2) pre = df.shape[0] df = df.merge(loc_subnats, how='left', left_on='county', right_on='location_ascii_name') assert pre == df.shape[0] assert county_nulls == df.county.isnull().sum() print "shape is {}".format(df.shape)
the lowest life expectancies among these ten large countries, from XX·X
(XX·X–XX·X) to XX·X (XX·X–XX·X) years. See appendix 2 (section 3) for
additional results.
"""
import xarray as xr
import pandas as pd
import sys

from db_queries import get_location_metadata
from fbd_core.file_interface import FBDPath, open_xr, save_xr
from fbd_core.etl import compute_summaries, expand_dimensions

import settings as sett

LOCS = get_location_metadata(location_set_id=35, gbd_round_id=5)
SUPER_REGS = LOCS[LOCS.level == 1]
NATS = LOCS[LOCS.level == 3]

lex_past_vers = sett.PAST_VERSIONS["lex"].version
lex_past_dir = "/5/past/life_expectancy/"
lex_past_path = FBDPath(lex_past_dir + lex_past_vers)
print(lex_past_vers)

lex_fut_vers = sett.BASELINE_VERSIONS["lex"].version
lex_fut_dir = "/5/future/life_expectancy/"
lex_fut_path = FBDPath(lex_fut_dir + lex_fut_vers)
print(lex_fut_vers)

pop_past_vers = sett.PAST_VERSIONS["population"].version
pop_past_dir = "/5/past/population/"
def setup_for_shiny(df, out_path):
    """
    Description:
        Prepares the final result of '00_prep_hf_mktscan_parallel.py' for a
        diagnostic visualization.

    Args:
        df (object): pandas dataframe object of input data
        out_path (str): directory where the diagnostic csv is written

    Returns:
        None. Writes "hf_inputs.csv" to out_path.
    """
    # columns necessary for appending the necessary aggregates and adding
    # columns with metadata useful to diagnostics (e.g. location name)
    index_cols = ['hf_target_prop', 'std_err_adj', 'sex_id', 'cause_id',
                  'age_group_id']

    # columns used for creating aggregates for the region and super region
    # proportions.
    group_cols = ['sex_id', 'cause_id', 'age_group_id']

    # columns used in the final dataset.
    final_cols = ['hf_target_prop', 'std_err_adj', 'location_id',
                  'location_ascii_name', 'sex_id', 'cause_id',
                  'age_group_id', 'age_group_name', 'cause_name']

    locations = get_location_metadata(location_set_id=35)[
        ['location_id', 'location_ascii_name']]
    ages = get_ids('age_group')
    causes = get_ids('cause')

    # Exclude composite etiologies for input diagnostics
    df = df.query('cause_id not in (520, 385, 499)')

    # location metadata
    df = df.merge(locations, on='location_id', how='inner')

    # add column with age group names
    df = df.merge(ages, on='age_group_id', how='inner')

    # To make the age progression linear and consecutive recode some of the
    # age_groups.
    df['age_group_id'] = df['age_group_id'].replace(to_replace=28, value=4)
    df.sort_values(by='age_group_id', axis=0, ascending=True, inplace=True)

    # add column with cause names
    df = df.merge(causes, on='cause_id', how='inner')

    # drop unnecessary columns
    df = df[final_cols]

    df.rename(columns={'hf_target_prop': 'proportion'}, inplace=True)
    # rename the adjusted standard error column for the viz
    df.rename(columns={'std_err_adj': 'standard_error'}, inplace=True)

    # write the diagnostic input data to csv
    df.to_csv("{}hf_inputs.csv".format(out_path), index=False,
              encoding='utf-8')
df['representative_id'] = 3

locs = get_location_metadata(location_set_id=9, gbd_round_id=5)
loc_id = locs.loc[locs.location_name == "Jordan", "location_id"]
loc_id = loc_id.tolist()[0]
df['location_id'] = loc_id
assert (df.location_id == 144).all(),\
    "loc id check failed"

df['age_group_unit'] = 1
df['source'] = 'JOR_ABHD'
df['code_system_id'] = 2

df['year_start'] = 2016
df['year_end'] = 2016
                     index=[df2.index.values],
                     aggfunc='first')

# calculate proportion of those who received care for their injury
df3['proportion'] = df3['mean']['0100'] / df3['mean']['0000']
df3['sample_size_both'] = df3['sample_size']['0000']
df3.reset_index(inplace=True)

df4 = df3[['index', 'proportion', 'sample_size_both']]
df4.columns = ['demo', 'data', 'sample_size']
df4[['nid', 'ihme_loc_id', 'age_start', 'age_end']] = pd.DataFrame(
    df4['demo'].tolist(), index=df4.index)

locs = db.get_location_metadata(location_set_id=35)
df4 = df4.merge(locs[['ihme_loc_id', 'location_id']])
df4.drop(['demo', 'ihme_loc_id'], axis=1, inplace=True)

# prep additional columns for ST-GPR
df4['measure'] = 'proportion'
df4['is_outlier'] = 0
df4['variance'] = ''
df4['sex_id'] = 3
df4['year_id'] = 2003

# get rid of any implausible proportions
df5 = df4[df4['data'] <= 1]

# apply offset in order to model in logit space
def main():
    popdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    popfile = 'forecast_under_5_pops.csv'
    print('{} read in raw pop data from csv'.format(pretty_now()))
    popdf = pd.read_csv(os.path.join(popdir, popfile))

    age_ranges = ['mean_12_to_23', 'mean_2_to_4']
    age_map = {age_ranges[0]: 238, age_ranges[1]: 34}
    popdf = melt_age_cols(popdf)
    popdf = popdf[popdf.age_group.isin(age_ranges)]
    popdf['age_group_id'] = popdf.age_group.map(age_map)
    for col in ['location_id', 'sex_id', 'age_group_id']:
        popdf[col] = popdf[col].astype(int)
    popdf.rename(columns={'population': 'pop_scaled'}, inplace=True)

    # Location Metadata
    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]

    # Generate locations to keep: 188 + parents
    locs_to_keep = create_locs_to_keep(locs)
    custom_loc_df = locs[locs.location_id.isin(locs_to_keep)]

    # create main custom tree
    print('{} creating custom tree'.format(pretty_now()))
    custom_tree = create_custom_tree(custom_loc_df)

    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['pop_scaled']

    # aggregate up standard custom tree
    print('{} aggregate pop from custom tree'.format(pretty_now()))
    aggpop = agg_hierarchy(custom_tree, popdf, index_cols, data_cols,
                           'location_id')

    # SDI locations
    sdi_locs = get_location_metadata(location_set_id=40, gbd_round_id=4)
    sdi_locs = sdi_locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    sdi_ids = [44635, 44634, 44639, 44636, 44637]
    sdi_df_list = []
    for _id in sdi_ids:
        print('{} processing sdi: {}'.format(pretty_now(), _id))
        thisdf = sdi_locs[sdi_locs.parent_id == _id]
        thisdf = thisdf[thisdf.location_id.isin(locs_to_keep + [_id])]
        thistree = create_custom_tree(thisdf)
        print('{} aggregate pop from {} tree'.format(pretty_now(), _id))
        thisaggpop = agg_hierarchy(thistree, popdf, index_cols, data_cols,
                                   'location_id')
        thisaggpop = thisaggpop[thisaggpop.location_id == _id]
        aggpop = aggpop.append(thisaggpop)

    sexagg = agg_sexes(aggpop)
    aggpop = aggpop.append(sexagg)

    outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    outfile = os.path.join(outdir, 'future_pop.h5')
    print('{} output'.format(pretty_now()))
    aggpop.to_hdf(outfile, 'data', mode='w', format='table',
                  data_columns=index_cols)
    print('{} fin'.format(pretty_now()))
# submit a squeeze job for each year and build a string of job names
job_string = ''
for i in [1990, 1995, 2000, 2005, 2010, 2016]:
    year_id = i
    job_name = "squeeze_{0}_{1}".format(year_id, cause_name)
    job_string = job_string + ',' + job_name
    call = ('qsub -l mem_free=20.0G -pe multi_slot 10'
            ' -cwd -P proj_custom_models'
            ' -o {FILEPATH}'
            ' -e {FILEPATH} -N {4}'
            ' cluster_shell.sh squeeze.py \'{0}\' {1} {2} {3}'.format(
                json.dumps(me_map), out_dir, year_id, cause_name, job_name))
    #print call
    subprocess.call(call, shell=True)

# get location_metadata for graphing step
loc_df = get_location_metadata(location_set_id=35, gbd_round_id=4)
loc_df.to_csv(os.path.join(out_dir, 'graphs', 'location_metadata.csv'),
              encoding='utf-8')

# graph
# need the -hold_jid + job_string flag to hold the graphing jobs until the squeezes are done
# only graphing three estimation years for now
for i in [1990, 2005, 2016]:
    year_id = i
    #job_string = "no_holds"
    call = ('qsub -hold_jid {2} -cwd -P proj_custom_models '
            ' -o {FILEPATH}'
            ' -e {FILEPATH} -N {0}_graph_{1}'
            ' r_shell.sh congenital_stacked_bar.R {0} {1}'.format(
                cause_name, year_id, job_string))
    subprocess.call(call, shell=True)
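# The job_string built above starts with a leading comma because each name is
# appended as ',' + job_name. An equivalent, slightly tidier sketch collects
# the names in a list and joins them for the -hold_jid flag; this is an
# illustrative alternative, not the original code's approach.
squeeze_years = [1990, 1995, 2000, 2005, 2010, 2016]
job_names = ["squeeze_{0}_{1}".format(year_id, cause_name)
             for year_id in squeeze_years]
job_string = ','.join(job_names)  # e.g. 'squeeze_1990_<cause>,squeeze_1995_<cause>,...'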
def overlay_or_snap_points(point_df, poly_df, location_set_id=21,
                           snap_points=True, update_snapped_points=True):
    '''
    Takes a geopandas Points GeoDataFrame and a Polygons GeoDataFrame, then
    assigns every row in the Points GeoDataFrame to a single polygon in the
    Polygons GeoDataFrame. It checks for exact overlap first, then snaps
    points that fall outside of any polygon (or points whose identifying
    information indicates they do not belong in their current polygon).

    Inputs:
      point_df (geopandas GeoDataFrame): The points GeoDataFrame
      poly_df (geopandas GeoDataFrame): The polygons GeoDataFrame
      location_set_id (int): The GBD location set used to determine which
        most detailed locations align with which (not necessarily most
        detailed) parents
      snap_points (bool): Whether to snap points in addition to the overlay
      update_snapped_points (bool): If true, drop the old set of points and
        update the 'geometry' field of the points gdf to the new, snapped
        points

    Outputs:
      all_geolocated (geopandas GeoDataFrame): The points GeoDataFrame, with
        a new field "overlay_loc_id" indicating the polygon that each point
        overlaps with or has been snapped to
    '''
    # Input data validation
    assert np.all([
        type(i) is gpd.geodataframe.GeoDataFrame
        for i in [point_df, poly_df]
    ]), "The point_df and poly_df should both be geopandas GeoDataFrames"
    # Copy the original data to allow for in-place changes
    poly_df = poly_df.copy()
    point_df = point_df.copy()
    # Rename the polygon field 'location_id' so it does not overlap with the
    # points field 'location_id'
    poly_df = poly_df.rename(columns={'location_id': 'overlay_loc_id'})
    # Get location metadata for the known location set
    meta = get_location_metadata(location_set_id=location_set_id)
    # Create a dictionary of the most detailed descendants for each location
    descendents = construct_descendants_dict(meta)
    # Create a field that will be used to validate whether a point has been
    # placed within a valid geometry
    point_df['known_loc_tag'] = 1
    reference_locations = [int(i) for i in list(descendents.keys())]
    if 'location_id_matched' in point_df.columns:
        point_df.loc[~np.isnan(point_df['location_id_matched']),
                     'known_loc_tag'] = point_df.loc[
            ~np.isnan(point_df['location_id_matched']),
            'location_id_matched'].apply(
                lambda x: 1 if int(x) not in reference_locations else int(x))

    ## Overlay points
    print("* * * * STARTING FIRST OVERLAY * * * * at {}".format(dt.now()))
    all_overlaid = overlay_polygons(points_df=point_df,
                                    polys_df=poly_df,
                                    polys_cols_to_join=['overlay_loc_id'])
    # Check if there were any UIDs on the border that might be duplicated
    border_uids_df = all_overlaid.loc[:, ['uid']]
    border_uids_df['count'] = 1
    border_uids_df = (border_uids_df.groupby(by='uid').sum()
                      .reset_index(drop=False))
    border_uids = (border_uids_df.loc[border_uids_df['count'] == 2,
                                      'uid'].tolist())
    print("  WARNING: The following UIDs are being duplicated at this stage:")
    print("    {}".format(border_uids))
    print("  These should be assigned beforehand to avoid duplication.\n")
    print("* * * * DONE WITH FIRST OVERLAY * * * * at {}".format(dt.now()))
    # Subset out points that have not been matched to a geography or were
    # matched to an impossible geometry per the 'valid geometry' field
    all_overlaid['good_match'] = all_overlaid.apply(
        lambda row: (row['overlay_loc_id'] is not np.nan) and
                    (row['overlay_loc_id'] in descendents[row['known_loc_tag']]),
        axis=1)
    # If we don't want to snap points, then return the points here
    if not snap_points:
        return all_overlaid
    # Otherwise, continue on to snapping
    overlaid_good = all_overlaid.loc[all_overlaid['good_match'], :].copy()
    needs_snapping = all_overlaid.loc[~all_overlaid['good_match'], :].copy()
    print("  {} points need to be snapped.".format(needs_snapping.shape[0]))
    print("* * * * * * * * CHECK DF SIZE * * * * * * * *")
    print("  {} points were good.".format(overlaid_good.shape[0]))
    print("  {} combined.".format(all_overlaid.shape[0]))
    # Iterate through each parent geometry, getting the best fit out of all
    # descendants of that parent geometry. Afterwards, concatenate the results
    # from all parents into a single dataframe
    print("* * * * STARTING SNAPPING * * * * at {}".format(dt.now()))
    snapped_sub_dfs = list()
    for parent_loc in needs_snapping['known_loc_tag'].dropna().unique().tolist():
        possible_snap_polys = poly_df.loc[
            poly_df['overlay_loc_id'].isin(descendents[int(parent_loc)]), :]
        if len(possible_snap_polys) == 0:
            warnings.warn(
                "All location tagging failed for parent location: {}".format(
                    parent_loc))
            continue
        points_to_snap = needs_snapping.loc[
            needs_snapping['known_loc_tag'] == parent_loc, :]
        snapped_sub = snap_points_to_polys_df(
            needs_snapping=points_to_snap,
            polys_df=possible_snap_polys,
            polys_location_col='overlay_loc_id',
            descendents=descendents)
        snapped_sub_dfs.append(snapped_sub)
    snapped = pd.concat(snapped_sub_dfs)
    # Update with the new, snapped points as the geometry
    snap_geom = [
        sly.geometry.Point(xy)
        for xy in zip(snapped['snapped_lon'], snapped['snapped_lat'])
    ]
    snapped = snapped.drop(labels=['geometry', 'snapped_lon', 'snapped_lat'],
                           axis=1)
    snapped = gpd.GeoDataFrame(snapped, crs={'PASSWORD'}, geometry=snap_geom)
    print("\n* * * * DONE WITH SNAPPING * * * * at {}".format(dt.now()))
    print("\n* * * * * * * * CONFIRM SNAP WORKED * * * * * * * *")
    print("  The snapped df now has {} rows (should be same)".format(
        snapped.shape[0]))
    print("  There are {} rows that still don't have a loc_id.".format(
        snapped.loc[
            snapped['overlay_loc_id'].apply(lambda x: x == ''), :].shape[0]))
    # Snapping adds a "snap_dist" column to the geodataframe. Make the
    # overlaid df consistent with that, then concatenate
    overlaid_good['snap_dist'] = 0
    all_geolocated = pd.concat([overlaid_good, snapped])
    # Add a field giving the location name that each point is now assigned to
    meta_names = meta.loc[:, ['location_id', 'location_ascii_name']]
    meta_names.rename(columns={
        'location_id': 'overlay_loc_id',
        'location_ascii_name': 'overlay_loc_name'
    }, inplace=True)
    all_geolocated = all_geolocated.merge(meta_names,
                                          on="overlay_loc_id",
                                          how='left')
    # Delete the field that was used to determine valid locations for snapping
    all_geolocated.drop(labels=['known_loc_tag'], axis=1, inplace=True)
    print("* * * * * * * * CHECK DF SIZE PRESERVED * * * * * * * *")
    print("  {} rows at the end (should be same as beginning).".format(
        all_geolocated.shape[0]))
    # Return the dataframe
    return all_geolocated
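# A hedged usage sketch for overlay_or_snap_points(). The file paths are
# hypothetical placeholders in the same redacted style used elsewhere in this
# code, and the assumption is that the polygon shapefile carries a
# 'location_id' column; the function itself only needs two GeoDataFrames and
# an optional GBD location set id.
import geopandas as gpd

points = gpd.read_file('/FILEPATH_TO/geolocated_points.shp')    # hypothetical path
polygons = gpd.read_file('/FILEPATH_TO/gbd_admin_shapes.shp')   # hypothetical path
geolocated = overlay_or_snap_points(point_df=points,
                                    poly_df=polygons,
                                    location_set_id=21,
                                    snap_points=True)
# share of points that ended up assigned to a polygon
print(geolocated['overlay_loc_id'].notnull().mean())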