def expand_location_arguments(args):
    """Expand the various location arguments into a set of location ids.

    The source of locations is chosen in this order: ``args.location_id``,
    then ``args.location_set_id``, then ``args.location_set_version_id``.
    When a tree-based source is used, ``args.subtree_id`` (if given)
    restricts the result to that node and all of its descendants.

    Args:
        args: namespace-like object exposing ``location_id``,
            ``subtree_id``, ``location_set_id`` and
            ``location_set_version_id`` attributes.

    Returns:
        The same ``args`` object, with ``location_id_list`` set on it.

    Raises:
        ValueError: if both ``location_id`` and ``subtree_id`` are given, or
            if no location argument is given at all.
    """
    logger = logging.getLogger("dalynator")
    if args.location_id is not None:
        location_id_list = to_list(args.location_id)
        if args.subtree_id is not None:
            raise ValueError(
                "Cannot specify both a location_id and a subtree_id")
    else:
        if args.location_set_id:
            tree = loctree(None, args.location_set_id)
            location_id_list = [n.id for n in tree.nodes]
            logger.debug(" location_set_id original")
        elif args.location_set_version_id:
            tree = loctree(args.location_set_version_id)
            location_id_list = [n.id for n in tree.nodes]
            logger.debug(" location_set_version_id original")
        else:
            # Without this guard the code below fails with an opaque
            # UnboundLocalError on `tree` / `location_id_list`.
            raise ValueError(
                "Must specify one of location_id, location_set_id or "
                "location_set_version_id")
        if args.subtree_id is not None:
            # Replace the full tree with the subtree root plus descendants.
            node_list = tree.get_node_by_id(args.subtree_id).all_descendants()
            location_id_list = [n.id for n in node_list]
            location_id_list.append(args.subtree_id)
    logger.debug(" location_id_list as list {}:{}".format(
        len(location_id_list), location_id_list))
    args.location_id_list = location_id_list
    return args
def main():
    """Build location-aggregated under-5 populations and write them to HDF.

    Reads single-year populations from a Stata file, keeps the ``pys_1`` ..
    ``pys_4`` age columns (mapped to age groups 1..4), attaches
    ``location_id`` via ``ihme_loc_id``, aggregates ``pop_scaled`` up the
    location set 35 hierarchy and each location set 40 tree (keeping only
    five specific location ids from the latter), and writes the combined
    result to ``age_pop.h5``.
    """
    popdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    popfile = 'single_age_u5_pops.dta'
    popdf = pd.read_stata(os.path.join(popdir, popfile))

    age_ranges = ['pys_{}'.format(x + 1) for x in range(4)]
    age_map = {val: idx + 1 for idx, val in enumerate(age_ranges)}
    popdf = melt_age_cols(popdf)
    popdf = popdf[popdf.age_group.isin(age_ranges)]
    popdf['age_group'] = popdf.age_group.map(age_map)
    popdf['sex_id'] = popdf.sex.map({'male': 1, 'female': 2})
    popdf = aggregate_ages(popdf)
    popdf.rename(columns={'year': 'year_id'}, inplace=True)

    # Swap ihme_loc_id for location_id.
    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[['location_id', 'ihme_loc_id']]
    popdf = pd.merge(popdf, locs, on='ihme_loc_id', how='left')
    popdf.drop(labels=['ihme_loc_id'], inplace=True, axis=1)
    popdf.rename(columns={'population': 'pop_scaled'}, inplace=True)

    lsvid = dbtrees.get_location_set_version_id(35, gbd_round=2016)
    lt = dbtrees.loctree(lsvid)
    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['pop_scaled']
    aggpop = agg_hierarchy(lt, popdf, index_cols, data_cols, 'location_id')

    sdi_lsvid = dbtrees.get_location_set_version_id(40, gbd_round=2016)
    sdi_lts = dbtrees.loctree(sdi_lsvid, return_many=True)
    # Collect the pieces and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copies the accumulated frame on every call.
    frames = [aggpop]
    for tree in sdi_lts:
        sdi_agg_df = agg_hierarchy(tree, popdf, index_cols, data_cols,
                                   'location_id')
        sdi_agg_df = sdi_agg_df[sdi_agg_df.location_id.isin(
            [44634, 44635, 44636, 44637, 44639])]
        frames.append(sdi_agg_df)
    aggpop = pd.concat(frames)

    outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    outfile = os.path.join(outdir, 'age_pop.h5')
    aggpop.to_hdf(outfile, 'data', mode='w', format='table',
                  data_columns=index_cols)
    print('fin')
def new_dimensions(
        self, location_id=[], year_id=[], sex_id=[], age_group_id=[],
        measure_id=[], n_draws=1000):
    """Build a DataFrameDimensions object and store it on ``self.dimensions``.

    Any index argument left empty falls back to a built-in default; the
    default locations are the leaves of the tree for
    ``self.location_set_version_id``.

    Args:
        location_id, year_id, sex_id, age_group_id, measure_id: lists of
            ids for each index dimension (empty means "use the default").
        n_draws: number of ``draw_<i>`` data columns to declare.
    """
    if not location_id:
        tree = dbtrees.loctree(self.location_set_version_id)
        location_id = [leaf.id for leaf in tree.leaves()]
    year_id = year_id or [1990, 1995, 2000, 2005, 2010, 2016]
    sex_id = sex_id or [1, 2]
    age_group_id = age_group_id or (list(range(2, 21)) + [30, 31, 32, 235])
    measure_id = measure_id or [3, 5, 6]

    draw_names = ['draw_{}'.format(d) for d in range(n_draws)]
    index_dict = {
        "measure_id": measure_id,
        "year_id": year_id,
        "location_id": location_id,
        "sex_id": sex_id,
        "age_group_id": age_group_id,
    }
    self.dimensions = DataFrameDimensions(index_dict, {"draws": draw_names})
def calculate_mad(self):
    """
    Merge the ST results back onto the input dataset and calculate MAD
    estimates at every location level.

    Returns:
        The merged frame with an ``st_resid`` column and one
        ``mad_<level column>`` column per non-empty level column.
    """
    loc_levels = dbtrees.loctree(self.lvid).flatten()
    st_long = self.long_result()
    join_keys = [self.agevar, self.timevar, self.spacevar]
    merged = pd.merge(self.dataset, st_long, on=join_keys, how='right')
    merged = pd.merge(merged, loc_levels, left_on=self.spacevar,
                      right_on='leaf_node', how='left')

    # Calculate residuals
    merged['st_resid'] = merged[self.datavar] - merged['st_prediction']

    # Calculate MAD estimates at various geographical levels. The column
    # list is taken once, before any mad_* columns are merged in.
    level_cols = merged.filter(like='level').columns
    for lvlcol in level_cols:
        if not merged[lvlcol].notnull().any():
            continue
        mad_lvl = merged.groupby(lvlcol).agg({'st_resid': mad})
        mad_lvl = mad_lvl.reset_index()
        mad_lvl = mad_lvl.rename(columns={'st_resid': 'mad_%s' % lvlcol})
        merged = pd.merge(merged, mad_lvl, on=lvlcol, how="left")
    return merged
def mix_locations(args):
    """Pool simulants of a location's children, weighted by population.

    For every child of ``lid`` in location set 35, reads that child's
    ``<pooldir>/<child>_<year>.h5`` file, samples rows with replacement in
    proportion to the child's share of population, and writes the combined
    sample to ``<pooldir>/<lid>_<year>.h5`` under the key ``sims``.

    Args:
        args: tuple ``(location_id, year_id, sample_size)``.
    """
    lid, yid, sample_size = args
    lt = dbtrees.loctree(None, 35)
    lids = [child.id for child in lt.get_node_by_id(lid).children]
    pops = get_pop({
        'location_id': lids,
        'year_id': yid,
        'sex_id': 3,
        'age_group_id': 22
    })
    pops['prop'] = pops.pop_scaled / pops.pop_scaled.sum()
    pops['nsamples'] = pops.prop.apply(lambda x: int(round(x * sample_size)))

    subsample = []
    for _, row in pops.iterrows():
        # iterrows yields one Series per row, so integer columns come back
        # as floats when the frame also has float columns; cast both back.
        child_id = int(row['location_id'])
        nsims = int(row['nsamples'])
        try:
            ss = pd.read_hdf(
                '{pd}/{l}_{y}.h5'.format(pd=pooldir, l=child_id, y=yid))
            ss = ss.sample(nsims, replace=True)
            subsample.append(ss)
        except Exception:
            # Best effort: a missing/unreadable child is reported, not fatal.
            print('issue with %s' % child_id)

    subsample = pd.concat(subsample)
    subsample.reset_index(drop=True, inplace=True)
    subsample.to_hdf("{od}/{l}_{y}.h5".format(od=pooldir, l=lid, y=yid),
                     'sims', mode='w')
def calc_spatial_distances(location_set_version_id, o_locs):
    """Score how many hierarchy levels each leaf shares with each o_loc.

    For every level at which the tree has leaves, each leaf at that level
    is compared with each location in ``o_locs`` on the level column itself
    and on the two levels above it; the score is the number of matches
    (0-3).

    Args:
        location_set_version_id: version id used to load the location tree.
        o_locs: scalar or array-like of location ids.

    Returns:
        DataFrame indexed by ``o_locs`` with one column per leaf location;
        missing entries are filled with 0.
    """
    lt = dbtrees.loctree(location_set_version_id)
    o_locs = np.atleast_1d(o_locs).ravel()

    # Levels (deepest first) that contain at least one leaf.
    leaf_lvls = [
        depth for depth in reversed(range(lt.max_depth() + 1))
        if set(lt.leaves()) & set(lt.level_n_descendants(depth))
    ]

    lflat = lt.flatten()
    o_locdf = pd.DataFrame({'leaf_node': o_locs})
    o_locdf = o_locdf.merge(lflat, on='leaf_node')

    def _matches(leaf_rows, depth):
        # (n_leaves x n_o_locs) 0/1 matrix: same ancestor at `depth`?
        col = 'level_%s' % depth
        leaf_vals = np.atleast_2d(leaf_rows[col].values).T
        obs_vals = np.atleast_2d(o_locdf[col].values)
        return (leaf_vals == obs_vals).astype(int)

    pieces = []
    for lvl in leaf_lvls:
        lvl_col = 'level_%s' % lvl
        leaf_df = lflat[lflat[lvl_col].notnull()]
        # Rows handled at this level are dropped before moving up a level.
        lflat = lflat[lflat[lvl_col].isnull()]
        d_locs = (_matches(leaf_df, lvl) + _matches(leaf_df, lvl - 1) +
                  _matches(leaf_df, lvl - 2))
        pieces.append(
            pd.DataFrame(d_locs.T, columns=leaf_df[lvl_col].values,
                         index=o_locs))

    d_df = pd.concat(pieces, axis=1)
    return d_df.fillna(0)
def expand_and_validate_location_lists(tool_name, location_set_ids,
                                       gbd_round_id):
    """
    Create sets of most-detailed and aggregate location ids by expanding
    the location_set_ids.

    Returns:
        most_detailed_locs - the leaf locations with no children;
        aggregate_locs - the internal locations with children
    """
    most_detailed_locs = set()
    aggregate_locs = set()
    for loc_set in location_set_ids:
        trees = hdb.loctree(location_set_id=loc_set,
                            gbd_round_id=gbd_round_id,
                            return_many=True)
        for tree in trees:
            most_detailed_locs |= {leaf.id for leaf in tree.leaves()}
            # Anything not (yet) known as a leaf counts as an aggregate.
            aggregate_locs |= {
                node.id for node in tree.nodes
                if node.id not in most_detailed_locs
            }

    # Remove 44620 (non-standard Global location)
    aggregate_locs.discard(44620)
    return most_detailed_locs, aggregate_locs
def squeezer(q, log_dir):
    """Queue one squeeze job per leaf location, year and sex, then wait.

    Args:
        q: job queue exposing ``queue_job`` and ``block_till_done``.
        log_dir: directory passed to each job as ``mon_dir``.
    """
    locations = dbtrees.loctree(None, 35)
    root_path = os.path.dirname(os.path.abspath(__file__))
    runfile = "%s/squeeze_em_all.py" % root_path

    combos = [
        (leaf.id, year, sex)
        for leaf in locations.leaves()
        for year in [1990, 1995, 2000, 2005, 2010, 2016]
        for sex in [1, 2]
    ]
    for location_id, year_id, sex_id in combos:
        params = [
            '--location_id', str(location_id),
            '--year_id', str(year_id),
            '--sex_id', str(sex_id),
        ]
        job_name = 'squeeze_%s_%s_%s' % (location_id, year_id, sex_id)
        remote_job = job.Job(mon_dir=log_dir,
                             runfile=runfile,
                             name=job_name,
                             job_args=params)
        q.queue_job(remote_job,
                    slots=30,
                    memory=60,
                    project='proj_epic',
                    stderr='/FILEPATH',
                    stdout='/FILEPATH')
    q.block_till_done(poll_interval=60)
def epilepsy_any(cv):
    """Compute combined "any epilepsy" disability weights and save them.

    Proportions are fetched per leaf location and year via ``get_props``,
    multiplied draw-wise with the standard disability weights, combined per
    location-year as ``1 - prod(1 - dw)``, tagged as healthstate 772,
    interpolated over 1990-2016 and written to
    ``<cv.como_dir>/info/epilepsy_any_dws.h5``.

    Args:
        cv: object providing ``sequela_list``, ``dimensions`` and
            ``como_dir``.
    """
    standard_dws = pd.read_csv("filepath/02_standard/dw.csv")
    healthstates = cv.sequela_list[["modelable_entity_id", "healthstate_id"]]

    # Get country-year specific prevalences for back-calculation
    leaf_ids = [leaf.id for leaf in dbtrees.loctree(None, 35).leaves()]
    estimation_years = [1990, 1995, 2000, 2005, 2010, 2016]
    loc_years = [(loc, yr) for loc in leaf_ids for yr in estimation_years]
    pool = Pool(20)
    prop_dfs = pool.map(get_props, loc_years)
    pool.close()
    pool.join()
    prop_dfs = pd.concat(prop_dfs).merge(healthstates)
    prop_dfs.rename(
        columns={'draw_%s' % i: 'dw_prop_%s' % i for i in range(1000)},
        inplace=True)

    # Combine DWs
    dws_to_weight = prop_dfs.merge(standard_dws, on='healthstate_id',
                                   how='left')
    weighted = (dws_to_weight.filter(like='draw').values *
                dws_to_weight.filter(like='dw_prop_').values)
    dws_to_weight = dws_to_weight.join(
        pd.DataFrame(data=weighted, index=dws_to_weight.index,
                     columns=drawcols))

    def combine_dws(df):
        # Multiplicative combination across rows of every 'draw' column.
        return 1 - (1 - df.filter(like='draw')).prod()

    grouped = dws_to_weight.groupby(['location_id', 'year_id'])
    combined_dws = grouped.apply(combine_dws).reset_index()
    combined_dws['healthstate_id'] = 772
    combined_dws = combined_dws[
        ['location_id', 'year_id', 'healthstate_id'] + drawcols]

    # fill in missing years
    dims = deepcopy(cv.dimensions)
    dims.index_dim.add_level('healthstate_id', 772)
    dims.index_dim.replace_level("year_id", list(range(1990, 2017)))
    kept_levels = ['location_id', 'year_id', 'healthstate_id']
    for index_dim in dims.index_groups:
        if index_dim not in kept_levels:
            dims.index_dim.drop_level(index_dim)
    gbdizer = GBDizeDataFrame(dims)
    rank_df = combined_dws[combined_dws.year_id == 2005].reset_index()
    combined_dws = gbdizer.fill_year_by_interpolating(df=combined_dws,
                                                      rank_df=rank_df)
    combined_dws.to_hdf("{}/info/epilepsy_any_dws.h5".format(cv.como_dir),
                        'draws',
                        mode='w',
                        format='table',
                        data_columns=['location_id', 'year_id'])
def run_task(self, location_set_version_id, component):
    """Aggregate one component's draws up every tree of a location set.

    For each tree of ``location_set_version_id`` the population for the
    tree's nodes is fetched and used as the weight frame of a ``WtdSum``
    operator, which an ``AggMemEff`` aggregator applies from the
    component's source to its sink.

    Args:
        location_set_version_id: version id of the location set to load.
        component: component name used to look up source, sink,
            dimensions and chunksize.
    """
    source = self.get_source(component)
    sink = self.get_sink(component)
    dimensions = self.dimensions.get_dimension_by_component(
        component, self.measure_id)

    def _non_location_index():
        # Every index column except the one being aggregated over.
        return [col for col in dimensions.index_names
                if col != "location_id"]

    # get the trees we are aggregating
    loc_trees = dbtrees.loctree(
        location_set_version_id=location_set_version_id, return_many=True)
    for loc_tree in loc_trees:
        # get the weight vals
        pop = get_population(
            self.como_version,
            age_group_id=dimensions.index_dim.get_level("age_group_id"),
            location_id=[node.id for node in loc_tree.nodes],
            year_id=dimensions.index_dim.get_level("year_id"),
            sex_id=dimensions.index_dim.get_level("sex_id"))
        weight_cols = ["age_group_id", "location_id", "year_id", "sex_id",
                       "population"]
        pop = pop[weight_cols]

        # set up our aggregation operator
        operator = WtdSum(
            index_cols=_non_location_index(),
            value_cols=dimensions.data_list(),
            weight_df=pop,
            weight_name="population",
            merge_cols=["location_id", "year_id", "age_group_id", "sex_id"])

        # run our aggregation
        aggregator = AggMemEff(
            draw_source=source,
            draw_sink=sink,
            index_cols=_non_location_index(),
            aggregate_col="location_id",
            operator=operator,
            chunksize=self.chunksize[component])

        # run the tree
        filters = {
            "measure_id": [self.measure_id],
            "year_id": dimensions.index_dim.get_level("year_id"),
            "sex_id": dimensions.index_dim.get_level("sex_id"),
        }
        aggregator.run(loc_tree,
                       draw_filters=filters,
                       n_processes=self.chunksize[component])
def _cache_location_hierarchy(self, location_set_id: int) -> None:
    """Load the trees of a location set and pickle them into the cache.

    Args:
        location_set_id: id of the location set to load (with
            ``self.gbd_round_id``) and to name the cache file after.
    """
    logger.debug("Starting to load location_hierarchy cache")
    location_hierarchy = dbtrees.loctree(location_set_id=location_set_id,
                                         gbd_round_id=self.gbd_round_id,
                                         return_many=True)
    cache_file = "FILEPATH".format(self.cache_dir, location_set_id)
    # Use a context manager so the handle is flushed and closed even if
    # pickling fails part-way.
    with open(cache_file, "wb") as cache_fh:
        pickle.dump(location_hierarchy, cache_fh)
    # Arguments ordered to match the message: "<set id> in <file>".
    logger.debug("Cached location_hierarchy {} in {}".format(
        location_set_id, cache_file))
def __init__(self, parent_meid, prop_drawfile=None):
    """Set up the location tree, age groups, children and proportions.

    Args:
        parent_meid: id stored as ``self.parent_meid``.
        prop_drawfile: optional CSV path; when given, proportions are read
            from it instead of being generated.
    """
    self.parent_meid = parent_meid
    self.lt = dbtrees.loctree(None, location_set_id=35)
    self.ags = self.age_groups()
    self.child_meids = self.get_children()

    if prop_drawfile is not None:
        self.props = pd.read_csv(prop_drawfile)
        return

    # Two-step build: the second call overwrites the first result
    # (presumably it reads self.props set by the first -- confirm).
    self.props = self.get_split_proportions()
    self.props = self.gen_proportion_draws()
def _add_pct_change_tasks(self, agg_loc_set_versions):
    """Add one percent-change task per measure and unique location.

    Args:
        agg_loc_set_versions: iterable of location_set_version_ids whose
            tree node ids are pooled and de-duplicated.
    """
    pooled_ids = [
        node_id
        for lsvid in agg_loc_set_versions
        for node_id in loctree(location_set_version_id=lsvid).node_ids
    ]
    all_locs = list(set(pooled_ids))

    get_task = self._pct_change_task_fac.get_task
    for measure_id in self.como_version.measure_id:
        for location_id in all_locs:
            self.dag.add_task(
                get_task(measure_id=measure_id, location_id=location_id))
def build_jobmon_workflow(self, identifier=None, extra_arguments=None):
    """
    Returns jobmon workflow that represents cascade job dag.

    Args:
        identifier (str): A unique string to identify this workflow
            for JobMon. Running twice with the same string will restart
            a workflow. Defaults to ``dismod_<mvid>_<env>``.
        extra_arguments (List[str]): Command-line arguments to add to
            every UGE Job specified in Jobmon.

    Returns:
        jobmon.Workflow: With all Jobmon tasks created.
    """
    if not extra_arguments:
        extra_arguments = []
    cv_iters = list(range(11)) if self.run_cv else None

    demo = Demographics(self.mvid)
    lsvid = self.mvm.location_set_version_id.values[0]
    lt = loctree(
        location_set_id=demo.LOCATION_SET_ID,
        location_set_version_id=lsvid,
        gbd_round_id=demo.gbd_round_id)
    desc = self.mvm.description.values[0]

    jobdag = make_dag(
        mvid=self.mvid, loctree=lt, cv_iter=cv_iters,
        add_arguments=extra_arguments
    )

    env = settings['env_variables']['ENVIRONMENT_NAME']
    workflow_name = f"dismod_{self.mvid}_{env}"
    wf = Workflow(
        workflow_args=identifier or workflow_name,
        name=workflow_name,
        resume=True,
        description=desc,
        project=self.project,
        stderr=self.logdir,
        stdout=self.logdir,
        seconds_until_timeout=1210000)

    # since we're looping through the dict and mutating each JobNode
    # to contain a reference to a PythonTask, we require the jobdag dict
    # to be sorted such that we've already visited all upstream tasks of
    # any given node.
    for dagnode in jobdag.values():
        dagnode.add_job(wf, jobdag, self.mvm)

    return wf
def launch_summaries(
        model_version_id, env='dev',
        years=[1990, 1995, 2000, 2005, 2010, 2015],
        file_pattern='all_draws.h5', h5_tablename='draws'):
    """Summarize draws for every location of set 35 and write CSVs.

    Runs ``slw`` in a 10-process pool once per location, then writes
    ``model_estimate_final.csv`` and, when there are change summaries,
    ``change_summaries.csv`` into the model version's summaries directory.

    Args:
        model_version_id: id used to build the draw and output paths.
        env: environment path component.
        years: years handed through to ``slw`` (not modified here).
        file_pattern: draw file pattern for the SuperGopher spec.
        h5_tablename: HDF table name for the SuperGopher spec.
    """
    global pop, aw, ags
    pop = get_pop()
    aw = get_age_weights()
    ags = get_age_spans()

    drawdir = '/ihme/epi/panda_cascade/%s/%s/full/draws' % (
        env, model_version_id)
    outdir = '/ihme/epi/panda_cascade/%s/%s/full/summaries' % (
        env, model_version_id)
    try:
        os.makedirs(outdir)
        os.chmod(outdir, 0o775)
        os.chmod(os.path.join(outdir, '..'), 0o775)
        os.chmod(os.path.join(outdir, '..', '..'), 0o775)
    except OSError:
        # Best effort: the directory may already exist or not be ours.
        pass

    lt = dbtrees.loctree(None, location_set_id=35)
    locs = [node.id for node in lt.nodes]
    sg = super_gopher.SuperGopher({
        'file_pattern': file_pattern,
        'h5_tablename': h5_tablename},
        drawdir)
    pool = Pool(10)
    res = pool.map(slw, [(
        (loc, drawdir, sg, years), {}) for loc in locs])
    pool.close()
    pool.join()

    # Keep only tuple results, then transpose into (summaries, changes).
    # list() is required because zip() is a one-shot iterator on Python 3
    # and cannot be subscripted.
    res = [r for r in res if isinstance(r, tuple)]
    res = list(zip(*res))

    summ = pd.concat([r for r in res[0] if r is not None])
    summ = summ[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'measure_id', 'mean', 'lower', 'upper']]
    summfile = "%s/model_estimate_final.csv" % outdir
    summ.to_csv(summfile, index=False)
    os.chmod(summfile, 0o775)

    csumm = pd.concat(res[1])
    if len(csumm) > 0:
        csumm = csumm[[
            'location_id', 'year_start', 'year_end', 'age_group_id',
            'sex_id', 'measure_id', 'median', 'lower', 'upper']]
        csummfile = "%s/change_summaries.csv" % outdir
        csumm.to_csv(csummfile, index=False)
        os.chmod(csummfile, 0o775)
def epilepsy_any(cvid):
    """Compute combined "any epilepsy" disability weights and save them.

    Proportions are fetched per leaf location and year via ``get_props``,
    multiplied draw-wise with the standard disability weights, combined per
    location-year as ``1 - prod(1 - dw)``, tagged as healthstate 772 and
    written under the como directory for ``cvid``.

    Args:
        cvid: como version id used to build the output path.
    """
    standard_dws = pd.read_csv(
        "/home/j/WORK/04_epi/03_outputs/01_code/02_dw/02_standard/dw.csv")
    eng = sqlalchemy.create_engine("strDir")
    healthstates = pd.read_sql(
        "SELECT modelable_entity_id, healthstate_id FROM epi.sequela", eng)

    # Get country-year specific prevalences for back-calculation
    leaf_ids = [leaf.id for leaf in dbtrees.loctree(None, 35).leaves()]
    years = range(1990, 2016, 5)
    loc_years = [(loc, yr) for loc in leaf_ids for yr in years]
    pool = Pool(20)
    prop_dfs = pool.map(get_props, loc_years)
    pool.close()
    pool.join()
    prop_dfs = pd.concat(prop_dfs).merge(healthstates)
    prop_dfs.rename(
        columns={'draw_%s' % i: 'dw_prop_%s' % i for i in range(1000)},
        inplace=True)

    # Combine DWs
    dws_to_weight = prop_dfs.merge(standard_dws, on='healthstate_id',
                                   how='left')
    weighted = (dws_to_weight.filter(like='draw').values *
                dws_to_weight.filter(like='dw_prop_').values)
    dws_to_weight = dws_to_weight.join(
        pd.DataFrame(data=weighted, index=dws_to_weight.index,
                     columns=drawcols))

    def combine_dws(df):
        # Multiplicative combination across rows of every 'draw' column.
        return 1 - (1 - df.filter(like='draw')).prod()

    grouped = dws_to_weight.groupby(['location_id', 'year_id'])
    combined_dws = grouped.apply(combine_dws).reset_index()
    combined_dws['healthstate_id'] = 772
    combined_dws = combined_dws[
        ['location_id', 'year_id', 'healthstate_id'] + drawcols]

    combined_dws.to_hdf("/ihme/centralcomp/como/%s/info/epilepsy_any_dws.h5" %
                        cvid,
                        'draws',
                        mode='w',
                        format='table',
                        data_columns=['location_id', 'year_id'])
def to_como(como_dir, location_set_id, gbd_round_id):
    """Prepare urolithiasis disability-weight draws for a como run.

    Reads the custom DW CSV, adds level-2 averages, fills leaf locations
    that have no data from their nearest ancestor that does (deepest level
    first), keeps leaf locations only, copies year 2013 to 2019,
    interpolates 1990-2019 and writes
    ``<como_dir>/info/urolith_dws.h5``.

    Args:
        como_dir: como run directory; output goes under ``info/``.
        location_set_id: location set used to load the tree.
        gbd_round_id: GBD round used to load the tree.
    """
    df = pd.read_csv("FILEPATH/urolith_symp_dws.csv")

    # fill for new locs
    lt = dbtrees.loctree(location_set_id=location_set_id,
                         gbd_round_id=gbd_round_id)
    locmap = lt.flatten()
    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id',
                        right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat([df, reg_avgs])

    # Track covered locations in a set rather than rebuilding a list of
    # the whole column for every leaf.
    have_locs = set(df.location_id)
    for ln in list(locmap.leaf_node.unique()):
        if ln in have_locs:
            continue
        for i in reversed(range(6)):
            fill_loc = locmap.loc[locmap.leaf_node == ln,
                                  'level_%s' % i].squeeze()
            filldf = df[df.location_id == fill_loc]
            if len(filldf) > 0:
                # assign() returns a new frame, avoiding a write to a
                # filtered slice (SettingWithCopyWarning).
                df = pd.concat([df, filldf.assign(location_id=ln)])
                have_locs.add(ln)
                break
    df = df[df.location_id.isin([leaf.id for leaf in lt.leaves()])]

    # fill in missing years
    extra = df.query("year_id == 2013").assign(year_id=2019)
    df = pd.concat([df, extra])
    df = df.filter(regex='(.*_id|draw_)')
    interp = pchip_interpolate(df=df,
                               id_cols=['location_id', 'healthstate_id'],
                               value_cols=['draw_%s' % d for d in range(1000)],
                               time_col="year_id",
                               time_vals=list(range(1990, 2020)))
    df = pd.concat([df, interp])
    df = df[df.year_id.isin(list(range(1990, 2020)))]

    # save for como run
    df.to_hdf(f"{como_dir}/info/urolith_dws.h5",
              'draws',
              mode='w',
              format='table',
              data_columns=['location_id', 'year_id'])
def new_simulation_index(self, year_id):
    """Build the simulation index dict and store it on the instance.

    Locations are the leaves of the tree for
    ``self.location_set_version_id``; sexes and age groups come from
    ``get_demographics``.

    Args:
        year_id: years to simulate; if empty/None the demographics years
            are used.
    """
    tree = dbtrees.loctree(
        location_set_version_id=self.location_set_version_id)
    leaf_ids = [leaf.id for leaf in tree.leaves()]
    demo = get_demographics(gbd_team="epi", gbd_round_id=self.gbd_round_id)

    self.simulation_index = {
        "year_id": year_id or demo['year_id'],
        "location_id": leaf_ids,
        "sex_id": demo['sex_id'],
        "age_group_id": demo['age_group_id'],
    }
def get_task(self, component, year_id, sex_id, measure_id,
             location_set_version_id):
    """Create, register and return the location-aggregation task.

    Also records every aggregate (non-leaf) location of the set's trees in
    ``self.agg_loc_set_map`` and wires the task to the upstream
    per-location tasks found in ``self.task_registry``.

    Args:
        component: component name; with the measure it selects which
            upstream task factory names are looked up.
        year_id, sex_id, measure_id: demographic slice handled by the task.
        location_set_version_id: location set version to aggregate.

    Returns:
        The created PythonTask (also stored in ``self.task_registry``).

    Raises:
        KeyError: if an expected upstream task is not registered.
    """
    loc_trees = dbtrees.loctree(
        location_set_version_id=location_set_version_id, return_many=True)

    # put all aggregate locations in a mapping dict for summary dependency
    agg_locs = []
    for loc_tree in loc_trees:
        # Build the leaf-id set once per tree; rebuilding the list for
        # every node made this quadratic in tree size.
        leaf_ids = {leaf.id for leaf in loc_tree.leaves()}
        agg_locs.extend(
            node for node in loc_tree.node_ids if node not in leaf_ids)
    for location_id in agg_locs:
        self.agg_loc_set_map[location_id] = location_set_version_id

    # hunt down upstream task list
    # NOTE(review): `loc_tree` is the variable left over from the loop
    # above, so only the LAST tree's leaves contribute upstream tasks --
    # confirm this is intended when the set has several trees.
    upstream_tasks = []
    for location_id in [node.id for node in loc_tree.leaves()]:
        if measure_id == measures.INCIDENCE:
            upstream_name = IncidenceTaskFactory.get_task_name(
                location_id, sex_id)
        elif measure_id == measures.PREVALENCE and component in [
                "sequela", "injuries", "impairment"
        ]:
            upstream_name = SimulationInputTaskFactory.get_task_name(
                location_id, sex_id)
        else:
            upstream_name = SimulationTaskFactory.get_task_name(
                location_id, sex_id, year_id)
        upstream_tasks.append(self.task_registry[upstream_name])

    name = self.get_task_name(component, year_id, sex_id, measure_id,
                              location_set_version_id)
    task = PythonTask(script=THIS_FILE,
                      args=[
                          "--como_dir", self.como_version.como_dir,
                          "--component", component,
                          "--year_id", year_id,
                          "--sex_id", sex_id,
                          "--measure_id", measure_id,
                          "--location_set_version_id",
                          location_set_version_id
                      ],
                      name=name,
                      upstream_tasks=upstream_tasks,
                      num_cores=4,
                      m_mem_free="300G",
                      max_attempts=5,
                      max_runtime_seconds=(60 * 60 * 20),
                      tag="loc_agg")
    self.task_registry[name] = task
    return task
def to_como(como_dir):
    """Prepare urolithiasis disability-weight draws for a como run.

    Reads the custom DW CSV, maps iso3 codes to location ids, adds level-2
    averages, fills leaf locations that have no data from their nearest
    ancestor that does (deepest level first), keeps leaf locations only,
    relabels year 2013 as 2016 and writes
    ``<como_dir>/info/urolith_dws.h5``.

    Args:
        como_dir: como run directory; output goes under ``info/``.
    """
    df = pd.read_csv(
        "filepath/03_custom/"
        "urolith_symp_dws.csv")
    iso3map = ezfuncs.query(
        " SELECT location_id, ihme_loc_id as iso3"
        " FROM shared.location_hierarchy_history"
        " WHERE location_set_version_id = 4",
        conn_def="shared")
    df = df.rename(columns={'year': 'year_id'})
    df = df.merge(iso3map, on='iso3')

    lt = dbtrees.loctree(None, 35)
    locmap = lt.flatten()
    reg_avgs = df.merge(
        locmap[['leaf_node', 'level_2']],
        left_on='location_id',
        right_on='leaf_node')
    reg_avgs = reg_avgs[
        ['level_2', 'year_id', 'healthstate_id'] +
        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat([df, reg_avgs])

    # Track covered locations in a set rather than rebuilding a list of
    # the whole column for every leaf.
    have_locs = set(df.location_id)
    for ln in list(locmap.leaf_node.unique()):
        if ln in have_locs:
            continue
        for i in reversed(range(6)):
            # .loc replaces the removed .ix indexer (boolean mask + label).
            fill_loc = locmap.loc[
                locmap.leaf_node == ln, 'level_%s' % i].squeeze()
            filldf = df[df.location_id == fill_loc]
            if len(filldf) > 0:
                # assign() returns a new frame, avoiding a write to a
                # filtered slice (SettingWithCopyWarning).
                df = pd.concat([df, filldf.assign(location_id=ln)])
                have_locs.add(ln)
                break
    df = df[df.location_id.isin([leaf.id for leaf in lt.leaves()])]
    df['year_id'] = df.year_id.replace({2013: 2016})
    df.rename(columns={
        'draw%s' % d: 'draw_%s' % d for d in range(1000)}, inplace=True)
    df = df.filter(regex='(.*_id|draw_)')
    df.to_hdf(
        "{}/info/urolith_dws.h5".format(como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=['location_id', 'year_id'])
def to_como(cvid):
    """Prepare urolithiasis disability-weight draws for a como version.

    Reads the custom DW CSV, maps iso3 codes to location ids, adds level-2
    averages, fills leaf locations that have no data from their nearest
    ancestor that does (deepest level first), keeps leaf locations only,
    relabels year 2013 as 2015 and writes ``urolith_dws.h5`` under the como
    directory for ``cvid``.

    Args:
        cvid: como version id used to build the output path.
    """
    df = pd.read_csv("/home/j/WORK/04_epi/03_outputs/01_code/02_dw/03_custom/"
                     "urolith_symp_dws.csv")
    eng = sqlalchemy.create_engine("stDir")
    iso3map = pd.read_sql(
        " SELECT location_id, ihme_loc_id as iso3"
        " FROM shared.location_hierarchy_history"
        " WHERE location_set_version_id = 4", eng)
    df = df.rename(columns={'year': 'year_id'})
    df = df.merge(iso3map, on='iso3')

    lt = dbtrees.loctree(None, 35)
    locmap = lt.flatten()
    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id',
                        right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    # pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df = pd.concat([df, reg_avgs])

    # Track covered locations in a set rather than rebuilding a list of
    # the whole column for every leaf.
    have_locs = set(df.location_id)
    for ln in list(locmap.leaf_node.unique()):
        if ln in have_locs:
            continue
        for i in reversed(range(6)):
            # .loc replaces the removed .ix indexer (boolean mask + label).
            fill_loc = locmap.loc[locmap.leaf_node == ln,
                                  'level_%s' % i].squeeze()
            filldf = df[df.location_id == fill_loc]
            if len(filldf) > 0:
                # assign() returns a new frame, avoiding a write to a
                # filtered slice (SettingWithCopyWarning).
                df = pd.concat([df, filldf.assign(location_id=ln)])
                have_locs.add(ln)
                break
    df = df[df.location_id.isin([leaf.id for leaf in lt.leaves()])]
    df['year_id'] = df.year_id.replace({2013: 2015})
    df.rename(columns={'draw%s' % d: 'draw_%s' % d
                       for d in range(1000)},
              inplace=True)
    df = df.filter(regex='(.*_id|draw_)')
    df.to_hdf("/ihme/centralcomp/como/%s/info/urolith_dws.h5" % cvid,
              'draws',
              mode='w',
              format='table',
              data_columns=['location_id', 'year_id'])
def get_task(self, component, year_id, sex_id, measure_id,
             location_set_version_id, redis_host):
    """Create, register and return the location-aggregation task.

    Also records every aggregate (non-leaf) location of the tree in
    ``self.agg_loc_set_map`` and wires the task to the upstream
    per-location tasks found in ``self.task_registry``.

    Args:
        component: component name; with the measure it selects which
            upstream task factory names are looked up.
        year_id, sex_id, measure_id: demographic slice handled by the task.
        location_set_version_id: location set version to aggregate.
        redis_host: forwarded to the script as ``--redis_host``.

    Returns:
        The created PythonTask (also stored in ``self.task_registry``).

    Raises:
        KeyError: if an expected upstream task is not registered.
    """
    loc_tree = dbtrees.loctree(
        location_set_version_id=location_set_version_id)
    # Leaf ids in tree order (kept as a list so upstream order is stable)
    # plus a set for O(1) membership; the list used to be rebuilt for
    # every node, making the aggregate scan quadratic.
    leaf_id_list = [leaf.id for leaf in loc_tree.leaves()]
    leaf_id_set = set(leaf_id_list)

    # put all aggregate locations in a mapping dict for summary dependency
    agg_locs = [
        node for node in loc_tree.node_ids if node not in leaf_id_set
    ]
    for location_id in agg_locs:
        self.agg_loc_set_map[location_id] = location_set_version_id

    # hunt down upstream task list
    upstream_tasks = []
    for location_id in leaf_id_list:
        if measure_id == 6:
            upstream_name = IncidenceTaskFactory.get_task_name(
                location_id, sex_id)
        elif measure_id == 5 and component in [
                "sequela", "injuries", "impairment"
        ]:
            upstream_name = SimulationInputTaskFactory.get_task_name(
                location_id, sex_id)
        else:
            upstream_name = SimulationTaskFactory.get_task_name(
                location_id, sex_id, year_id)
        upstream_tasks.append(self.task_registry[upstream_name])

    name = self.get_task_name(component, year_id, sex_id, measure_id,
                              location_set_version_id)
    task = PythonTask(script=this_file,
                      args=[
                          "--como_dir", self.como_version.como_dir,
                          "--component", component,
                          "--year_id", year_id,
                          "--sex_id", sex_id,
                          "--measure_id", measure_id,
                          "--location_set_version_id",
                          location_set_version_id,
                          "--redis_host", redis_host
                      ],
                      name=name,
                      upstream_tasks=upstream_tasks,
                      slots=25,
                      mem_free=300,
                      max_attempts=5,
                      max_runtime=(60 * 60 * 10),
                      tag="loc_agg")
    self.task_registry[name] = task
    return task
def new_simulation_index(self, year_id=[]):
    """Build the simulation index dict and store it on the instance.

    Locations are the leaves of the tree for
    ``self.location_set_version_id``; sexes and age groups are fixed lists.

    Args:
        year_id: years to simulate; when empty a fixed default list of
            years is used.
    """
    tree = dbtrees.loctree(
        location_set_version_id=self.location_set_version_id)
    leaf_ids = [leaf.id for leaf in tree.leaves()]
    age_group_id = list(range(2, 21)) + [30, 31, 32, 235]

    self.simulation_index = {
        "year_id": year_id or [1990, 1995, 2000, 2005, 2010, 2017],
        "location_id": leaf_ids,
        "sex_id": [1, 2],
        "age_group_id": age_group_id,
    }
def _add_summarization_tasks(self, agg_loc_set_versions):
    """Add one summarization task per (measure, year) slice and location.

    Args:
        agg_loc_set_versions: iterable of location_set_version_ids whose
            tree node ids are pooled and de-duplicated.
    """
    pooled_ids = [
        node_id
        for lsvid in agg_loc_set_versions
        for node_id in loctree(location_set_version_id=lsvid).node_ids
    ]
    all_locs = list(set(pooled_ids))

    parallelism = ["measure_id", "year_id"]
    d = self.como_version.nonfatal_dimensions.get_simulation_dimensions(
        self.como_version.measure_id)
    for slices in d.index_slices(parallelism):
        # slices follows the order of `parallelism`: (measure, year).
        for location_id in all_locs:
            self.dag.add_task(
                self._summarize_task_fac.get_task(
                    measure_id=slices[0],
                    year_id=slices[1],
                    location_id=location_id))
def launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                      output_dir):
    """Run ``split_n_write`` for every leaf location of set 35 in a pool.

    Args:
        source_cause_id: cause id passed through to each worker.
        target_cause_ids: cause ids, paired positionally with
            ``target_meids``.
        target_meids: modelable entity ids for the target causes.
        output_dir: directory passed through to each worker.

    Returns:
        List of ``split_n_write`` results, one per leaf location.
    """
    from multiprocessing import Pool

    cme_map = dict(zip(target_cause_ids, target_meids))
    lt = dbtrees.loctree(None, location_set_id=35)
    params = [(source_cause_id, cme_map, leaf.id, output_dir)
              for leaf in lt.leaves()]

    pool = Pool(30)
    try:
        res = pool.map(split_n_write, params)
    finally:
        # Always release the workers, including when map() raises, and
        # join them as the other Pool users in this code base do.
        pool.close()
        pool.join()
    return res
def launch_squeeze(autism_value=0.29, idiopathic=95.0, autism_resid=0.05):
    """Submit one squeeze job per leaf location, year and sex via qsub.

    Args:
        autism_value, idiopathic, autism_resid: accepted for interface
            compatibility; not used by this function.
    """
    locations = dbtrees.loctree(None, 35)
    runfile = "strCodeDir/squeeze_em_all.py"

    combos = [
        (leaf.id, year, sex)
        for leaf in locations.leaves()
        for year in [1990, 1995, 2000, 2005, 2010, 2015]
        for sex in [1, 2]
    ]
    for location_id, year_id, sex_id in combos:
        params = [
            '--location_id', location_id,
            '--year_id', year_id,
            '--sex_id', sex_id,
        ]
        job_name = 'sqz_%s_%s_%s' % (location_id, year_id, sex_id)
        sge.qsub(runfile,
                 job_name,
                 parameters=params,
                 slots=30,
                 memory=60,
                 project='proj_como',
                 conda_env="isqueeze")
def run_como(
        como_dir=None,
        root_dir="FILEPATH",
        gbd_round_id=5,
        location_set_id=35,
        year_id=list(range(1990, 2018)),
        measure_id=[3, 5, 6],
        n_draws=1000,
        n_simulants=20000,
        components=["cause", "sequela", "injuries", "impairment"],
        change_years=[(1990, 2007), (2007, 2017), (1990, 2017)],
        agg_loc_sets=[35, 83],
        project="proj_como"):
    """Create or load a como version, run its workflow, then upload.

    Args:
        como_dir: existing como directory to resume from; when None a new
            ComoVersion is created from the remaining arguments.
        root_dir, gbd_round_id, location_set_id, year_id, measure_id,
        n_draws, components, change_years: settings for a new version.
        n_simulants: forwarded to ``add_tasks_to_dag``.
        agg_loc_sets: location sets to aggregate; ``location_set_id`` is
            always added to them.
        project: forwarded to ``run_workflow``.

    Raises:
        RuntimeError: if the workflow does not report success.
    """
    requested_sets = set(agg_loc_sets)
    special_sets = requested_sets - {location_set_id}
    all_sets = requested_sets | {location_set_id}

    if como_dir is None:
        cv = ComoVersion.new(
            root_dir, gbd_round_id, location_set_id, year_id, measure_id,
            n_draws, components, change_years, special_sets)
    else:
        cv = ComoVersion(como_dir)
        cv.load_cache()

    cwf = ComoWorkFlow(cv)
    cwf.add_tasks_to_dag(n_simulants=n_simulants, agg_loc_sets=all_sets)
    if not cwf.run_workflow(project=project):
        raise RuntimeError("como unsuccessful")

    # Upload every node of every aggregated location set, de-duplicated.
    all_locs = []
    for set_id in all_sets:
        loc_tree = loctree(location_set_id=set_id,
                           gbd_round_id=cv.gbd_round_id)
        all_locs.extend(loc_tree.node_ids)
    run_upload(cv, list(set(all_locs)))
def _get_population(
        version: MachineParameters,
        location_set_id: int = constants.LocationSetId.OUTPUTS,
        agg_loc_sets: Optional[List[int]] = (
            constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
            [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from version object to use with get_population
    function. Requests most detailed ages and most detailed sexes because
    age-sex population aggregates are created in the summarize module.

    Dependent on demographics team to upload population for majority of
    aggregate locations but currently uses AggSynchronous to create
    population information for select Norway locations in
    LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the demographic
            and configuration data needed to query population estimates.
        location_set_id (int): The id for hierarchy to aggregate up
            (not read by this function).
        agg_loc_sets (list): Additional location sets to create special
            aggregates. If empty or None, no aggregation is done and only
            the queried population is returned.

    Return:
        pd.DataFrame: the queried population plus aggregated rows for
        locations not already present, with a fresh integer index.
    """
    pop = get_population(age_group_id=version.most_detailed_age_group_ids,
                         location_id=version.location_ids,
                         year_id=version.year_ids,
                         sex_id=version.sex_ids,
                         run_id=version.population_version_id,
                         decomp_step=version.decomp_step,
                         gbd_round_id=version.gbd_round_id)

    # In-memory source/sink pair sharing one dict, so aggregates written by
    # one pass are visible to the next.
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])

    if not agg_loc_sets:
        # Nothing to aggregate. Previously this path built an empty
        # DataFrame and then failed with AttributeError on `.location_id`.
        return pd.concat([pop], ignore_index=True)

    # location
    assert len(agg_loc_sets) == len(set(agg_loc_sets))
    assert agg_loc_sets[-1] == constants.LocationSetId.OUTPUTS

    for set_id in agg_loc_sets:
        loc_tree = dbtrees.loctree(location_set_id=set_id,
                                   gbd_round_id=version.gbd_round_id)
        operator = Sum(index_cols=([
            col for col in index_cols
            if col != constants.Columns.LOCATION_ID
        ]),
                       value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=([
                col for col in index_cols
                if col != constants.Columns.LOCATION_ID
            ]),
            aggregate_col=constants.Columns.LOCATION_ID,
            operator=operator)
        aggregator.run(loc_tree)
    special_locations = source.content()

    # Only add aggregated rows for locations the query did not return.
    new_rows = special_locations.loc[
        ~special_locations.location_id.isin(pop.location_id.unique())]
    return pd.concat([pop, new_rows], ignore_index=True)
def __init__(self,
             split_version_id,
             output_dir,
             decomp_step,
             location_id=None,
             year_id=None,
             age_group_id=None,
             sex_id=None,
             measure_id=None,
             location_set_id=35,
             gbd_round_id=gbd.GBD_ROUND_ID,
             n_draws=1000):
    """Set up dimensions, draw sources and the output pusher.

    Args:
        split_version_id: stored as ``self.split_version_id``.
        output_dir: directory the SuperPusher writes to; also passed to
            ``get_best_model_version_and_decomp_step``.
        decomp_step: stored as ``self.decomp_step``.
        location_id: location ids; defaults to the leaves of the tree for
            ``location_set_id`` / ``gbd_round_id``.
        year_id: years; defaults to the estimation years of the round.
        age_group_id: age groups; defaults to age group set 12.
        sex_id: sexes; defaults to male and female.
        measure_id: measures; defaults to prevalence and incidence.
        location_set_id: location set used to derive default locations.
        gbd_round_id: GBD round for defaults and the epi draw source.
        n_draws: number of ``draw_<i>`` columns; values below 1000 add a
            downsampling transform to the epi draw source.
    """
    # static ids
    self.split_version_id = split_version_id
    self.decomp_step = decomp_step
    self.location_set_id = location_set_id
    self.gbd_round_id = gbd_round_id

    # read func is derived from static values. we call it to initialize the
    # internal caching
    # NOTE(review): self.ss_filters is not assigned in this method --
    # presumably a property defined on the class; confirm.
    self._read_func = split_prop_read_func()
    cached_props = self._read_func(params={}, filters=self.ss_filters)

    # dimensions are derived unless explicit
    if not location_id:
        location_id = [
            node.id
            for node in dbtrees.loctree(location_set_id=location_set_id,
                                        gbd_round_id=gbd_round_id).leaves()
        ]
    if not year_id:
        year_id = estimation_years_from_gbd_round_id(gbd_round_id)
    if not age_group_id:
        # this has the advantage of instantiating the lru cache in the main
        # process before multiprocessing
        age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        # Add age group 164 when the cached proportions contain an
        # age_start of -1 (presumably a "birth" group -- TODO confirm).
        if -1 in cached_props["age_start"].unique().tolist():
            age_group_id.append(164)
    if not sex_id:
        sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
    if not measure_id:
        measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

    index_dict = {
        "location_id": location_id,
        "year_id": year_id,
        "age_group_id": age_group_id,
        "sex_id": sex_id,
        "measure_id": measure_id
    }
    data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
    self.dimensions = dimensionality.DataFrameDimensions(
        index_dict, data_dict)

    # draw source built on the same read func that was warmed up above
    sp_formula = SevPropFormula()
    sp_formula.build_custom_draw_source(params={},
                                        read_func=self._read_func)
    sp_formula.add_transforms()
    self._ss_draw_source = sp_formula.draw_source

    # epi draws source
    # NOTE(review): self.parent_meid is not assigned in this method --
    # presumably a property defined on the class; confirm.
    mvid, dstep = get_best_model_version_and_decomp_step(
        output_dir, int(self.parent_meid))
    src = Epi.create_modelable_entity_draw_source(
        n_workers=1,
        modelable_entity_id=int(self.parent_meid),
        model_version_id=mvid,
        gbd_round_id=gbd_round_id,
        decomp_step=dstep)
    src.remove_transform(automagic_age_sex_agg)

    if n_draws < 1000:
        src.add_transform(group_and_downsample, n_draws)

    self._epi_draw_source = src

    # one HDF file per modelable entity and location under output_dir
    self.pusher = SuperPusher(spec={
        'file_pattern': "{modelable_entity_id}/{location_id}.h5",
        'h5_tablename': 'draws'
    },
                              directory=output_dir)
def aggregate_locations(aggregation_type: str, parent_dir: str,
                        measure_id: int, gbd_round_id: int,
                        location_set_id: int, year_id: int) -> None:
    """
    Uses a AggMemEff aggregator to aggregate locations for deaths and YLLs.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled' 'scaled', or
            'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): draws year ID

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Set up DrawSource and DrawSink.
    source_dir, sink_dir = _get_draw_source_sink_dirs(parent_dir,
                                                      aggregation_type,
                                                      measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)
    sink_spec = {
        'draw_dir': sink_dir,
        'file_pattern':
        FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(year_id=year_id),
        'h5_tablename': Keys.DRAWS
    }
    sink = DrawSink(sink_spec)
    sink.add_transform(_apply_regional_scalars, parent_dir, gbd_round_id,
                       location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # clean up old files we plan on writing
    clean_aggregation_directory(root_dir=sink.params['draw_dir'],
                                file_pattern=sink.params['file_pattern'],
                                location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id)

    # Set up aggregator and location tree.
    non_location_cols = [
        col for col in Columns.INDEX if col != Columns.LOCATION_ID
    ]
    summer = Sum(non_location_cols, Columns.DRAWS)
    agg = AggMemEff(source,
                    sink,
                    non_location_cols,
                    Columns.LOCATION_ID,
                    summer,
                    chunksize=2)

    # Only the SDI set is requested as multiple trees.
    wants_many = location_set_id == LocationSetId.SDI
    trees = loctree(location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id,
                    return_many=wants_many)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    # atleast_1d lets the single-tree and many-tree cases share one loop.
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)