def run_pipeline_upload(como_dir, component, location_id):
    cv = ComoVersion(como_dir)
    cv.load_cache()
    if component == "cause":
        upload_cause_summaries(cv, location_id)
    elif component == "sequela":
        upload_sequela_summaries(cv, location_id)
    elif component == "impairment":
        upload_rei_summaries(cv, location_id)
    elif component == "injuries":
        upload_inj_summaries(cv, location_id)

def run_pipeline_aggregate_locations(como_dir, component, year_id, sex_id,
                                     measure_id, location_set_id):
    # resume the como version with stored parameters
    cv = ComoVersion(como_dir)
    cv.load_cache()
    if component == "cause":
        agg_causes(cv, year_id, sex_id, measure_id, location_set_id)
    elif component == "sequela":
        agg_sequelae(cv, year_id, sex_id, measure_id, location_set_id)
    elif component == "impairment":
        agg_impairment(cv, year_id, sex_id, measure_id, location_set_id)
    elif component == "injuries":
        agg_injuries(cv, year_id, sex_id, measure_id, location_set_id)

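# Usage sketch (hypothetical IDs; "FILEPATH" is the redacted como_dir) for
# the two dispatchers above: resume a cached run, aggregate causes for one
# year-sex-measure slice, then upload cause summaries for one location.
# Assumes the como modules imported elsewhere in this package.
#
# run_pipeline_aggregate_locations(
#     "FILEPATH", "cause", year_id=2017, sex_id=1, measure_id=5,
#     location_set_id=35)
# run_pipeline_upload("FILEPATH", "cause", location_id=101)
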
def run_pipeline_nonfatal(como_dir, location_id=[], year_id=[], sex_id=[],
                          age_group_id=[], measure_id=[], n_processes=23,
                          n_simulants=40000, *args, **kwargs):
    """Run the nonfatal calculation on most detailed demographics.

    Args:
        como_dir (str): directory of the como run to resume
        location_id (list, optional): location_ids to compute
        year_id (list, optional): year_ids to compute
        sex_id (list, optional): sex_ids to compute
        age_group_id (list, optional): age_group_ids to compute
        measure_id (list, optional): measure_ids to compute
        n_processes (int, optional): number of worker processes
        n_simulants (int, optional): number of simulants per draw

        *args and **kwargs are passed into the simulation as parameters
    """
    # resume the como version with stored parameters
    cv = ComoVersion(como_dir)
    cv.load_cache()

    # set up the nonfatal computation object for our demographic set
    cnf = ComputeNonfatal(cv, location_id=location_id, year_id=year_id,
                          sex_id=sex_id, age_group_id=age_group_id,
                          measure_id=measure_id)

    # import data
    cnf.import_data(n_processes=n_processes)

    # compute all results
    cnf.compute_results(*args, n_simulants=n_simulants,
                        n_processes=n_processes, **kwargs)

    # write results to disk
    cnf.write_results()

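# Usage sketch (hypothetical values): compute most detailed nonfatal results
# for a single location-sex slice of a cached run. Any extra keyword
# arguments are forwarded into the simulation.
#
# run_pipeline_nonfatal(
#     "FILEPATH",
#     location_id=[101],
#     sex_id=[1],
#     n_processes=23,
#     n_simulants=40000)
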
def setup_env(como_version_id):
    global cv, simdir, pooldir, sg
    cv = ComoVersion(como_version_id)
    simdir = os.path.join(cv.root_dir, "simulants")
    pooldir = os.path.join(cv.root_dir, "locsims")
    try:
        os.makedirs(pooldir)
    except OSError:
        # directory already exists
        pass
    sg = SuperGopher(
        {"file_pattern": "sims_{location_id}_{year_id}_{sex_id}.h5",
         "h5_tablename": "draws"},
        simdir)

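# Note: on Python 3.2+ the try/except above can be collapsed to
# os.makedirs(pooldir, exist_ok=True), which suppresses only the
# "directory already exists" case rather than every OSError.
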
def run_como(
        como_dir=None,
        root_dir="FILEPATH",
        gbd_round_id=5,
        location_set_id=35,
        year_id=list(range(1990, 2018)),
        measure_id=[3, 5, 6],
        n_draws=1000,
        n_simulants=20000,
        components=["cause", "sequela", "injuries", "impairment"],
        change_years=[(1990, 2007), (2007, 2017), (1990, 2017)],
        agg_loc_sets=[35, 83],
        project="proj_como"):

    special_sets = set(agg_loc_sets) - set([location_set_id])
    all_sets = set(agg_loc_sets) | set([location_set_id])

    if como_dir is not None:
        cv = ComoVersion(como_dir)
        cv.load_cache()
    else:
        cv = ComoVersion.new(
            root_dir, gbd_round_id, location_set_id, year_id, measure_id,
            n_draws, components, change_years, special_sets)

    cwf = ComoWorkFlow(cv)
    cwf.add_tasks_to_dag(n_simulants=n_simulants, agg_loc_sets=all_sets)
    if cwf.run_workflow(project=project):
        all_locs = []
        for location_set_id in all_sets:
            loc_tree = loctree(location_set_id=location_set_id,
                               gbd_round_id=cv.gbd_round_id)
            all_locs.extend(loc_tree.node_ids)
        all_locs = list(set(all_locs))
        run_upload(cv, all_locs)
    else:
        raise RuntimeError("como unsuccessful")

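# Usage sketch (illustrative arguments): start a fresh GBD round 5 run with
# the defaults above, or resume an existing run by passing como_dir. The
# cluster project name is assumed to exist.
#
# run_como(root_dir="FILEPATH", gbd_round_id=5)       # new run
# run_como(como_dir="FILEPATH", project="proj_como")  # resume existing run
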
"sex_id": dimensions.index_dim.get_level("sex_id") }, n_processes=self.chunksize[component]) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Compute nonfatal aggregate for a year-sex-measure") parser.add_argument("--como_dir", type=str, help="directory of como run") parser.add_argument("--component", type=str, help="which component to aggregate") parser.add_argument("--year_id", type=int, help="year_id to aggregate") parser.add_argument("--sex_id", type=int, help="sex_id to aggregate") parser.add_argument("--measure_id", type=int, help="measure_id to aggregate") parser.add_argument("--location_set_version_id", type=int, help="location_set_version_id to aggregate") parser.add_argument("--redis_host", type=str, help="redis_host to manage concurrent I/O") args = parser.parse_args() cv = ComoVersion(args.como_dir) cv.load_cache() task = LocationAggTask(cv, args.measure_id, args.year_id, args.sex_id, args.redis_host) task.run_task(args.location_set_version_id, args.component)
def main(root_j_dir, root_tmp_dir, date, code_dir, in_dir, out_dir, ndraws,
         demographics, task_id):
    # subset based on task id to the demographic arguments
    dems = demographics.loc[demographics["task_id"] == task_id]
    location_id = int(dems["location_id"].iloc[0])
    year_id = int(dems["year_id"].iloc[0])
    sex_id = int(dems["sex_id"].iloc[0])

    # import hierarchies from Como
    from como.version import ComoVersion
    cv = ComoVersion("FILEPATH")
    cv.load_cache()

    # get dimensions and replace with what we are parallelizing
    # in this child script
    print("Copying dimensions from cv")
    dim = deepcopy(cv.dimensions)
    dim.index_dim.replace_level("location_id", location_id)
    dim.index_dim.replace_level("year_id", year_id)
    dim.index_dim.replace_level("sex_id", sex_id)

    # add cause and rei to dimensions
    print("Adding cause to dimensions")
    dim.index_dim.add_level(
        "cause_id", cv.cause_restrictions.cause_id.unique().tolist())
    dim.index_dim.add_level(
        "rei_id", cv.ncode_hierarchy.rei_id.unique().tolist())

    # set the years so that we always have 2005 to calibrate
    years = list(
        set(
            cap_val(dim.index_dim.levels.year_id,
                    [1990, 1995, 2000, 2005, 2010, 2016]) + [2005]))
    print("Years")
    print(years)

    # get all E-N combinations to use to make square data
    codes = pd.read_csv(os.path.join(code_dir, "FILEPATH.csv"))

    # Get Incidence df
    df_inc = get_incidence(dim, cv, years, codes, ndraws=ndraws)
    df_inc_agg = compute_aggregates(df_inc, dim, cv)
    assert len(df_inc_agg.index) == 50922, \
        "The number of rows in the incidence DF is not correct."
    assert not df_inc_agg.duplicated(subset=[
        "location_id", "year_id", "age_group_id", "sex_id", "measure_id",
        "cause_id", "rei_id"
    ]).any(), "The id columns do not uniquely identify the observations!"
    assert not df_inc_agg.isnull().any().any(), \
        "You have null values in the incidence DF!"
    print("Writing results for incidence")
    write_result_draws(df_inc_agg, measure_id=6, dim=dim, out_dir=out_dir)

    # Get Prevalence df
    df_prev = get_prevalence(dim, cv, years, codes, ndraws=ndraws)
    df_prev_agg = compute_aggregates(df_prev, dim, cv)
    assert len(df_prev_agg.index) == 50922, \
        "The number of rows in the prevalence DF is not correct."
    assert not df_prev_agg.duplicated(subset=[
        "location_id", "year_id", "age_group_id", "sex_id", "measure_id",
        "cause_id", "rei_id"
    ]).any(), "The id columns do not uniquely identify the observations!"
    assert not df_prev_agg.isnull().any().any(), \
        "You have null values in the prevalence DF!"
    print("Writing results for prevalence")
    write_result_draws(df_prev_agg, measure_id=5, dim=dim, out_dir=out_dir)

    # save checkfile when all done
    checkpath = os.path.join(
        "FILEPATH",
        "finished_{}_{}_{}.txt".format(location_id, year_id, sex_id))
    open(checkpath, "w").close()

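# A self-contained sketch of the task-id fan-out used at the top of main():
# each array-job task subsets one demographic row by its task_id. The frame
# and IDs here are fabricated for illustration.
def _demo_task_subset(task_id=2):
    import pandas as pd

    demographics = pd.DataFrame({
        "task_id": [1, 2, 3],
        "location_id": [101, 102, 103],
        "year_id": [2005, 2010, 2016],
        "sex_id": [1, 2, 1],
    })
    dems = demographics.loc[demographics["task_id"] == task_id]
    # int(...) on the scalar replaces the deprecated np.asscalar pattern
    return (int(dems["location_id"].iloc[0]),
            int(dems["year_id"].iloc[0]),
            int(dems["sex_id"].iloc[0]))
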
def run_pipeline_summarize(como_dir, component, location_id):
    cv = ComoVersion(como_dir)
    cv.load_cache()
    summ(cv, location_id, component)

def run_pipeline_como(
        root_dir,
        gbd_round_id=4,
        location_id=[],
        year_id=[],
        sex_id=[],
        age_group_id=[],
        measure_id=[],
        n_draws=1000,
        n_simulants=20000,
        components=["sequela", "cause", "impairment", "injuries"]):

    cv = ComoVersion.new(root_dir, gbd_round_id, location_id, year_id,
                         sex_id, age_group_id, measure_id, n_draws,
                         components)

    cjm = None
    try:
        cjm = CentralJobMonitor(cv.como_dir, persistent=False)
        time.sleep(5)
    except Exception:
        raise
    else:
        executor_params = {"request_timeout": 10000}
        jobq = JobQueue(cv.como_dir, scheduler=RetryScheduler,
                        executor=SGEExecutor,
                        executor_params=executor_params)

        # run nonfatal pipeline by location/sex
        parallelism = ["location_id", "sex_id"]
        for slices in cv.dimensions.index_slices(parallelism):
            jobname = "como_e_sim_{location_id}_{sex_id}".format(
                location_id=slices[0], sex_id=slices[1])
            job = jobq.create_job(
                jobname=jobname,
                runfile=true_path(executable="compute_nonfatal"),
                parameters=[
                    "--como_dir", cv.como_dir,
                    "--location_id", str(slices[0]),
                    "--sex_id", str(slices[1]),
                    "--n_processes", "23",
                    "--n_simulants", str(n_simulants)
                ])
            jobq.queue_job(job, slots=50, memory=400, project="proj_como",
                           process_timeout=(60 * 180))
        jobq.block_till_done(stop_scheduler_when_done=False)

        # run aggregation by year/sex/measure
        parallelism = ["year_id", "sex_id", "measure_id"]
        for slices in cv.dimensions.index_slices(parallelism):
            for component in cv.components:
                if component != "sequela":
                    loc_sets = [35, 40]
                else:
                    loc_sets = [35]
                for location_set_id in loc_sets:
                    jobname = ("como_e_agg_{component}_{year_id}_{sex_id}"
                               "_{measure_id}_{location_set_id}").format(
                                   component=component,
                                   year_id=slices[0],
                                   sex_id=slices[1],
                                   measure_id=slices[2],
                                   location_set_id=location_set_id)
                    job = jobq.create_job(
                        jobname=jobname,
                        runfile=true_path(executable="aggregate_nonfatal"),
                        parameters=[
                            "--como_dir", cv.como_dir,
                            "--component", component,
                            "--year_id", str(slices[0]),
                            "--sex_id", str(slices[1]),
                            "--measure_id", str(slices[2]),
                            "--location_set_id", str(location_set_id)
                        ])
                    jobq.queue_job(job, slots=25, memory=200,
                                   project="proj_como",
                                   process_timeout=(60 * 600))
        jobq.block_till_done(stop_scheduler_when_done=False)

        # run summaries by component/location
        lt = dbtrees.loctree(None, 35)
        sdi_lts = dbtrees.loctree(None, 40, return_many=True)
        locs = [l.id for l in lt.nodes]
        sdi_locs = [l.root.id for l in sdi_lts]
        for component in cv.components:
            if component != "sequela":
                summ_locs = locs + sdi_locs
            else:
                summ_locs = locs[:]
            for location_id in summ_locs:
                jobname = "como_e_summ_{component}_{location_id}".format(
                    component=component, location_id=location_id)
                job = jobq.create_job(
                    jobname=jobname,
                    runfile=true_path(executable="summarize_nonfatal"),
                    parameters=[
                        "--como_dir", cv.como_dir,
                        "--component", component,
                        "--location_id", str(location_id)
                    ])
                jobq.queue_job(job, slots=48, memory=96,
                               project="proj_como",
                               process_timeout=(60 * 240))
        jobq.block_till_done(stop_scheduler_when_done=False)

        # run uploads by component
        for component in cv.components:
            jobname = "como_e_upload_{component}".format(component=component)
            job = jobq.create_job(
                jobname=jobname,
                runfile=true_path(executable="upload_nonfatal"),
                parameters=[
                    "--como_dir", cv.como_dir,
                    "--component", component,
                    "--location_id",
                    " ".join([str(l) for l in locs + sdi_locs])
                ])
            jobq.queue_job(job, slots=20, memory=40, project="proj_como",
                           process_timeout=(60 * 720))
        jobq.block_till_done()
    finally:
        # guard against the monitor failing to start before the try block
        # completes, which would otherwise raise NameError here
        if cjm is not None:
            cjm.generate_report()
            cjm.stop_responder()
            cjm.stop_publisher()
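
# Usage sketch (hypothetical arguments) for the legacy orchestrator above;
# it builds a new ComoVersion and drives sim -> aggregate -> summarize ->
# upload through the SGE job queue:
#
# run_pipeline_como(
#     "FILEPATH",
#     gbd_round_id=4,
#     n_draws=1000,
#     n_simulants=20000,
#     components=["sequela", "cause", "impairment", "injuries"])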