def get_live_births_summaries(location_ids, year_ids):
    """Pull live-births-by-sex covariate estimates and add both-sex proportions.

    :param location_ids: location_id(s) to pull estimates for
    :param year_ids: year_id(s) to pull estimates for
    :return: dataframe with one row per location/year/sex carrying
        mean_value, mean_value_both, birth_prop (sex share of all births),
        and the human-readable 'sex' label
    """
    # Covariate 1106 is live births by sex; the best model_version_id at
    # time of upload was 24083.
    lvbrth_cov_id = 1106
    births = get_covariate_estimates(
        covariate_id=lvbrth_cov_id,
        gbd_round_id=5,
        location_id=location_ids,
        year_id=year_ids,
        sex_id=[3, 2, 1],
    )
    births = births[['location_id', 'year_id', 'sex_id', 'mean_value']]

    # Currently, the live births by sex covariate returns most_detailed sex
    # but we need both sexes combined. The data needed to aggregate sex are
    # contained in the returned dataframe, so collapse over sex within each
    # location/year to build the both-sex totals ourselves.
    sex_totals = births.copy()
    sex_totals['sex_id'] = 3
    sex_totals = (
        sex_totals
        .groupby(['location_id', 'year_id', 'sex_id'])
        .sum()
        .reset_index()
    )

    births = pd.merge(
        births,
        sex_totals[['location_id', 'year_id', 'mean_value']],
        how='left',
        on=['location_id', 'year_id'],
        suffixes=['', '_both'],
        indicator=True,
    )
    # Every detailed-sex row must have found a both-sex total.
    assert (births._merge == 'both').all()
    births.drop('_merge', axis=1, inplace=True)

    # Share of all births belonging to this sex.
    births['birth_prop'] = births['mean_value'] / births['mean_value_both']

    # Merge in the 'sex' column because data in the eurocat spreadsheet does
    # not have sex_ids.
    sex_meta = get_ids('sex')
    births = pd.merge(births, sex_meta, on='sex_id')
    return births
def get_sy_population(grp_pop, sex_id, pop_dir):
    """
    Get single-year population (i.e. single-year ages) data for all years.

    :param grp_pop: group-level population df; needed for the 95+ terminal
        age group and for building the aggregated under-1 row
    :param sex_id (int): sex to subset the single-year populations to
    :param pop_dir (str): directory where population flat-files are saved
    :return: single-year population data-frame for all COD years
    """
    # Keep the 95+ terminal age group from the group-level populations.
    terminal_age = grp_pop.loc[grp_pop["age"] == 95]

    # Collapse the detailed under-1 age groups into a single age-0 row.
    under1 = grp_pop.loc[grp_pop.age < 1]
    under1 = under1.groupby(['location_id', 'sex_id', 'year_id']).sum().reset_index()
    under1["age"] = 0

    # BUG FIX: this previously opened the literal placeholder 'FILEPATH.nc'
    # and ignored the pop_dir argument entirely; read the flat-file from
    # pop_dir as the sibling implementation of this function does.
    master = xr.open_dataset(os.path.join(pop_dir, 'sypops.nc'))
    master = master.loc[{'sex_id': [sex_id]}]
    master = master.to_dataframe().reset_index()

    # merge on single-year age names
    age_ids = db.get_ids(table="age_group")
    master = master.merge(age_ids, on=['age_group_id'])
    master.rename(columns={'age_group_name': 'age'}, inplace=True)
    master.drop('age_group_id', inplace=True, axis=1)
    # '95 plus' is the only non-numeric age name; recode so the whole
    # column can be cast to float.
    master.loc[master.age == "95 plus", 'age'] = 95
    master["age"] = master["age"].astype(float)

    master = master.append(terminal_age)
    master = master.append(under1)
    return master
def get_measures(ecode, me_id, year_id, sex_id, version):
    """Fetch incidence/remission/EMR measures for one e-code, year and sex.

    For e-codes in IM_RATIO_ECODES before the last GBD year, interpolates
    across a year window (and saves mortality for each year in that window);
    otherwise pulls draws for the single year directly.
    """
    measures = db.get_ids(table='measure')

    def _measure_id(name):
        # Translate a human-readable measure name into its measure_id.
        return measures.loc[measures["measure_name"] == name, 'measure_id'].iloc[0]

    inc_id = _measure_id("Incidence")
    rms_id = _measure_id("Remission")
    emr_id = _measure_id("Excess mortality rate")

    dems = db.get_demographics(gbd_team="epi", gbd_round_id=help.GBD_ROUND)
    location_ids = dems["location_id"]
    age_group_ids = dems["age_group_id"]

    needs_interpolation = (ecode in inj_info.IM_RATIO_ECODES
                           and year_id < help.LAST_YEAR)
    if not needs_interpolation:
        # Single-year pull: no interpolation window, no mortality saves.
        return get_measures_get_draws(me_id, location_ids, year_id, sex_id,
                                      age_group_ids, inc_id, rms_id, emr_id)

    # Interpolate across a 5-year window, capped at the final GBD year.
    if year_id < 2010:
        year_end = year_id + 5
        mort_year_end = year_end - 1
    else:
        year_end = help.LAST_YEAR
        mort_year_end = year_end

    measure_dict = get_measures_interpolate(
        me_id, location_ids, sex_id, age_group_ids,
        inc_id, rms_id, emr_id, year_id, year_end)
    for year in range(year_id, mort_year_end + 1):
        save_mortality(ecode, year, sex_id, location_ids, age_group_ids,
                       version)
    return measure_dict
def summarize_loc(source, drawdir, outdir, location_id, year_id, rei_ids,
                  change_intervals=None, gbd_round_id=5):
    '''Summarize every rei for a single location.

    Writes single_year_{location_id}.csv (and multi_year_{location_id}.csv
    when change intervals produce rows) to outdir.

    :param source: draw source passed through to summ_loc
    :param drawdir: directory containing population_*.csv flat files
    :param outdir: directory to write summary csvs to
    :param location_id (int): location to summarize
    :param year_id: year(s) passed through to summ_loc
    :param rei_ids (intlist): reis to summarize, one pool task each
    :param change_intervals: optional change intervals for multi-year rows
    :param gbd_round_id (int): GBD round, dUSERts to 5
    '''
    # Set global age weights for the round.
    # NOTE: the previous version also pulled get_ids('gbd_round') into an
    # unused local; that dead lookup has been removed.
    Globals.aw = get_age_weights(gbd_round_id=int(gbd_round_id))

    # Set global population from the per-location flat files, deduplicating
    # rows that appear in more than one file.
    pops = []
    popfiles = glob(os.path.join(drawdir, 'population_*.csv'))
    for popfile in popfiles:
        pops.append(pd.read_csv(popfile))
    pops = pd.concat(pops).drop_duplicates(
        subset=['location_id', 'age_group_id', 'year_id', 'sex_id'])
    Globals.pop = pops.rename(columns={'population': 'pop_scaled'})

    # Fan out one summarization task per rei.
    pool = Pool(10)
    results = pool.map(summ_loc,
                       [((source, location_id, rei, year_id, change_intervals,
                          gbd_round_id), {})
                        for rei in rei_ids])
    pool.close()
    pool.join()

    # Failed tasks don't return (single_year, multi_year) tuples; drop them.
    results = [res for res in results if isinstance(res, tuple)]
    results = list(zip(*results))

    single_year = pd.concat([res for res in results[0] if res is not None])
    single_year = single_year[[
        'rei_id', 'location_id', 'year_id', 'age_group_id', 'sex_id',
        'measure_id', 'metric_id', 'val', 'lower', 'upper'
    ]]
    single_file = os.path.join(outdir,
                               'single_year_{}.csv'.format(location_id))
    single_year.to_csv(single_file, index=False)
    os.chmod(single_file, 0o775)

    multi_year = pd.concat(results[1])
    if len(multi_year) > 0:
        multi_year = multi_year[[
            'rei_id', 'location_id', 'year_start_id', 'year_end_id',
            'age_group_id', 'sex_id', 'measure_id', 'metric_id', 'val',
            'lower', 'upper'
        ]]
        # BUG FIX: replace() is not in-place; the result was previously
        # discarded, so the dropna below never removed inf-derived rows.
        multi_year = multi_year.replace([np.inf, -np.inf], np.nan)
        multi_year.dropna(inplace=True)
        multi_file = os.path.join(outdir,
                                  'multi_year_{}.csv'.format(location_id))
        multi_year.to_csv(multi_file, index=False)
        os.chmod(multi_file, 0o775)
def get_sy_population(grp_pop, sex_id, pop_dir):
    """
    Get single-year population (i.e. single-year ages) data for all years.

    :param grp_pop: group-level population df; needed for the 95+ terminal
        age group and for building the aggregated under-1 row
    :param sex_id (int): sex to subset the single-year populations to
    :param pop_dir (str): directory where population flat-files are saved
    :return: single-year population data-frame for all COD years
    """
    # Keep the 95+ terminal age group from the group-level populations.
    terminal_age = grp_pop.loc[grp_pop["age"] == 95]

    # Collapse the detailed under-1 age groups into a single age-0 row.
    under1 = grp_pop.loc[grp_pop.age < 1]
    under1 = under1.groupby(['location_id', 'sex_id', 'year_id']).sum().reset_index()
    under1["age"] = 0

    master = xr.open_dataset(os.path.join(pop_dir, 'sypops.nc'))
    master = master.loc[{'sex_id': [sex_id]}]
    master = master.to_dataframe().reset_index()

    # Merge on single-year age names.
    age_ids = db.get_ids(table="age_group")
    master = master.merge(age_ids, on=['age_group_id'])
    master.rename(columns={'age_group_name': 'age'}, inplace=True)
    master.drop('age_group_id', inplace=True, axis=1)
    # '95 plus' is the only non-numeric age name; recode so the whole
    # column can be cast to float.
    master.loc[master.age == "95 plus", 'age'] = 95
    master["age"] = master["age"].astype(float)

    # FIX: DataFrame.append is deprecated and removed in pandas 2.x;
    # pd.concat is the supported equivalent of the two chained appends.
    master = pd.concat([master, terminal_age, under1], sort=False)
    return master
def setup_for_shiny(df, out_path):
    """
    Prepare the final result of '00_prep_hf_mktscan_parallel.py' for a
    diagnostic visualization.

    Args:
        df (object): pandas dataframe of heart-failure proportions with
            hf_target_prop, std_err_adj, location_id, sex_id, cause_id,
            and age_group_id columns
        out_path (str): path prefix where 'hf_inputs.csv' is written

    Returns:
        None. Writes the diagnostic input data to
        '{out_path}hf_inputs.csv'.
    """
    # Columns used in the final dataset.
    # NOTE: the previously defined index_cols/group_cols locals were never
    # used anywhere in this function and have been removed.
    final_cols = [
        'hf_target_prop', 'std_err_adj', 'location_id',
        'location_ascii_name', 'sex_id', 'cause_id', 'age_group_id',
        'age_group_name', 'cause_name'
    ]

    # Metadata used to attach human-readable names for the diagnostics.
    locations = get_location_metadata(location_set_id=35)\
        [['location_id', 'location_ascii_name']]
    ages = get_ids('age_group')
    causes = get_ids('cause')

    # Exclude composite etiologies for input diagnostics
    df = df.query('cause_id not in (520, 385, 499)')

    # location metadata
    df = df.merge(locations, on='location_id', how='inner')

    # add column with age group names
    df = df.merge(ages, on='age_group_id', how='inner')

    # To make the age progression linear and consecutive recode some of the
    # age_groups.
    df['age_group_id'] = df['age_group_id'].replace(to_replace=28, value=4)
    df.sort_values(by='age_group_id', axis=0, ascending=True, inplace=True)

    # add column with cause names
    df = df.merge(causes, on='cause_id', how='inner')

    # drop unnecessary columns
    df = df[final_cols]
    df.rename(columns={'hf_target_prop': 'proportion'}, inplace=True)
    # BUG FIX: the second rename previously targeted 'hf_target_prop' again,
    # which is a no-op after the first rename, so 'std_err_adj' was never
    # renamed; rename it to 'standard_error' as intended.
    df.rename(columns={'std_err_adj': 'standard_error'}, inplace=True)

    # write the diagnostic input data to csv
    df.to_csv("{}hf_inputs.csv".format(out_path), index=False,
              encoding='utf-8')
def get_modelable_entity_name(bundle_id):
    """Build a 'Birth prevalence of ...' title for a bundle's full-model ME."""
    # Map the bundle to its full-model modelable entity id.
    me_id = map_df.loc[map_df.fullmod_bundle == bundle_id, 'fullmod_ME'].item()
    # Look up that ME's display name and lowercase it for the title.
    me_names = get_ids('modelable_entity')
    name_mask = me_names.modelable_entity_id == me_id
    me_name = me_names.loc[name_mask, 'modelable_entity_name'].item()
    return "Birth prevalence of {}".format(me_name.lower())
# Pull and cache location metadata for GBD round 5.
loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
loc_meta.to_csv(os.path.join(code_dir, 'location_metadata.csv'), index=False, encoding='utf8')

# Norway (location_id 90) plus its subnational locations.
norway_id = 90
norway_subs = loc_meta.loc[loc_meta.parent_id==norway_id, 'location_id'].tolist() + [norway_id]

# Birth populations (age_group_id 164) at the national and subnational level.
country_pop = get_population(location_id=norway_id, year_id='all', sex_id='all', age_group_id=164, gbd_round_id=5, status='best')
country_pop.drop('location_id', axis=1, inplace=True)
subs_pop = get_population(location_id=norway_subs, year_id='all', sex_id='all', age_group_id=164, gbd_round_id=5, status='best')

# Merge on every shared column except location_id/population so each
# subnational row is paired with the matching national row, then compute each
# subnational's share of the national population.
population = subs_pop.merge(country_pop, on=[c for c in subs_pop.columns if c not in ['location_id','population']], suffixes=('_subs', '_national'))
population.loc[:, 'pop_weight'] = population['population_subs'] / population['population_national']

# Attach human-readable sex labels and reshape to the age_start/year_start
# format the downstream spreadsheet-style data uses.
sex_meta = get_ids('sex')
population = pd.merge(population, sex_meta, on='sex_id')
population.loc[:, 'age_start'] = 0
population.rename(columns={'year_id':'year_start'}, inplace=True)
population.to_csv(os.path.join(code_dir, 'norway_population.csv'), index=False, encoding='utf8')

# One job per (cause, bundle) pair from the mapping dataframe.
cause_bundle_pairs = list(zip(map_df.cause, map_df.fullmod_bundle))
bundle_num = len(map_df.fullmod_bundle.tolist())
if have_paths==0:
    job_string = ''
    # NOTE(review): this loop body continues beyond the visible chunk; only
    # the job-name construction is shown here.
    for cause, bundle in cause_bundle_pairs:
        bundle = int(bundle)
        job_name = "get_reqids_{b}_{c}".format(b=bundle, c=cause)
# Make the file paths for draws
FILE_PATHS = [out_path + 'diagnostics/', out_path + 'prevalence/']
for file_path in FILE_PATHS:
    if not os.path.exists(file_path):
        os.makedirs(file_path)

# Make correction factors
mktscan_draws = get_correction_factors(info_path, in_path, out_path, draws_path)

# Get age group, and cause- IDs, and names:
AGE_NAMES = get_ids('age_group')[['age_group_id', 'age_group_name']]
CAUSE_NAMES = get_ids('cause')[['cause_id', 'cause_name']]

# get the locations
locations_df = get_location_metadata(location_set_id=9)

# filter out locations not used in in Epi and non-admin0 locations
DEMOGRAPHICS = list(get_demographics(gbd_team='epi')['location_ids'])

# If admin0 only is selected then only take those.
# NOTE(review): in the admin0-only branch the .format((DEMOGRAPHICS)) call is
# a no-op (the query string has no placeholder), so that branch filters on
# location_type_id alone — confirm this is intended.
if admin0_only == "YES":
    LOCATION_NAMES = locations_df.query('location_type_id == 2'.format((DEMOGRAPHICS)))[['location_id', 'location_name']]
else:
    LOCATION_NAMES = locations_df.query('location_id in {} or location_type_id == 2'.format((DEMOGRAPHICS)))[['location_id', 'location_name']]

# location IDs
import gbd.constants as gbd from gbd.decomp_step import decomp_step_id_from_decomp_step from gbd.estimation_years import gbd_round_from_gbd_round_id from test_support.profile_support import profile from split_models.exceptions import IllegalSplitCoDArgument from split_models.job_classes import SplitCoDSwarm from split_models.validate import (validate_decomp_step_input, validate_ids) if sys.version_info > (3, ): long = int REPORTING_CAUSE_SET_ID, COMPUTATION_CAUSE_SET_ID = 3, 2 # Create list of valid cause_ids VALID_CAUSE_IDS = get_ids(table='cause').cause_id.unique() VALID_MEIDS = get_ids(table='modelable_entity').modelable_entity_id.unique() @profile def _launch_cod_splits(source_cause_id, target_cause_ids, target_meids, prop_meas_id, gbd_round_id, decomp_step, output_dir, project): """ Split the given source_cause_id given target_meid proportions, saved to the target_cause_ids in output_dir. Arguments: source_cause_id (int): cause_id for the draws to be split target_cause_ids (intlist): list of cause ids that you want the new outputted subcauses to be identified by
def new_cause_list(self):
    """Refresh self.cause_list with cause_id/acause pairs from the cause table."""
    causes = db.get_ids(table="cause")
    # Keep only the id and the analytical cause short-name.
    self.cause_list = causes[["cause_id", "acause"]]