class ImportedCasesJobSwarm(object):
    """This class creates and submits the imported cases task dag."""

    ADDITIONAL_RESTRICTIONS = {562: 'mental_drug_opioids'}

    def __init__(
            self,
            code_dir,
            out_dir,
            version_id,
            cause_ids,
            decomp_step,
            gbd_round_id=GBD.GBD_ROUND_ID
    ):
        self.code_dir = code_dir
        self.out_dir = out_dir
        self.version_id = version_id
        self.cause_ids = cause_ids
        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id

        username = getpass.getuser()
        self.workflow = Workflow(
            workflow_args='imported_cases_v{version}_{timestamp}'.format(
                version=self.version_id,
                timestamp=datetime.datetime.now().isoformat()
            ),
            name="Imported Cases Generator",
            project='proj_codcorrect',
            stdout=f'FILEPATH',
            stderr=f'FILEPATH'
        )

    def create_imported_cases_jobs(self):
        """Generates the tasks and adds them to the task_dag."""
        # TODO: profile and revise core/mem allocation.
        for cause in self.cause_ids:
            task = PythonTask(
                script=os.path.join(self.code_dir, 'imported_cases.py'),
                args=[self.version_id,
                      '--cause_id', cause,
                      '--decomp_step', self.decomp_step,
                      '--gbd_round_id', self.gbd_round_id,
                      '--output_dir', self.out_dir],
                name='imported_cases_{}_{}'.format(self.version_id, cause),
                num_cores=42,
                m_mem_free="100.0G",
                max_attempts=3,
                tag='imported_cases',
                queue='all.q')
            self.workflow.add_task(task)

    def run(self):
        success = self.workflow.run()
        return success
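# Hypothetical usage sketch for the class above (not from the original source);
# the directories, version_id and cause_ids are illustrative placeholders.
swarm = ImportedCasesJobSwarm(
    code_dir='/path/to/code',
    out_dir='/path/to/output',
    version_id=123,
    cause_ids=[357, 562],
    decomp_step='step4')
swarm.create_imported_cases_jobs()  # one PythonTask per cause_id
success = swarm.run()               # runs the jobmon workflow and returns its status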
class SplitCoDSwarm(object):

    _CODEDIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, source_id, proportion_ids, proportion_measure_id,
                 sex_ids, gbd_round_id, decomp_step, intermediate_dir,
                 outdir, project):
        self.source_id = source_id
        self.proportion_ids = proportion_ids
        self.proportion_measure_id = proportion_measure_id
        self.sex_ids = sex_ids
        self.gbd_round_id = gbd_round_id
        self.decomp_step = decomp_step
        self.intermediate_dir = intermediate_dir
        self.outdir = outdir

        time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        self.workflow = Workflow(
            workflow_args=('split_cod_model_interpolate_{}_{}'.format(
                source_id, time)),
            name='Split CoD Model cause_id: {}'.format(source_id),
            project=project,
            stderr=outdir,
            stdout=outdir)

    def add_interpolate_tasks(self):
        for meid in self.proportion_ids:
            for sex in self.sex_ids:
                arglist = [
                    '--gbd_id', meid,
                    '--proportion_measure_id', self.proportion_measure_id,
                    '--sex_id', sex,
                    '--gbd_round_id', self.gbd_round_id,
                    '--intermediate_dir', self.intermediate_dir
                ]
                if self.decomp_step:
                    arglist.extend(['--decomp_step', self.decomp_step])

                task = PythonTask(
                    script=os.path.join(self._CODEDIR, 'split_interp.py'),
                    args=arglist,
                    name='split_model_interpolate_{}_{}'.format(meid, sex),
                    num_cores=30,
                    m_mem_free='60G',
                    max_runtime_seconds=14400,
                    max_attempts=10)
                self.workflow.add_task(task)

    def run(self):
        return self.workflow.run()
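# Hypothetical usage sketch (the IDs and paths below are placeholders, not
# values from the original source). add_interpolate_tasks() adds one task per
# (proportion me_id, sex_id) pair and only passes --decomp_step when one is
# supplied.
swarm = SplitCoDSwarm(
    source_id=294,
    proportion_ids=[1920, 1921],
    proportion_measure_id=18,
    sex_ids=[1, 2],
    gbd_round_id=6,
    decomp_step='step4',
    intermediate_dir='/path/to/interp_draws',
    outdir='/path/to/logs',
    project='proj_codcorrect')
swarm.add_interpolate_tasks()
exit_status = swarm.run()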
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_malaria_{args.decomp_step}_{today_string}',
        name=f'anemia_malaria_{args.decomp_step}_{today_string}',
        description=f'Anemia: Malaria pre-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    # first submit the subtract clinical jobs
    subtract_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for loc in demo['location_id']:
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--location_id", loc,
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--out_dir", args.out_dir
            ],
            name=f"malaria_subtract_{loc}",
            tag="malaria_subtract",
            num_cores=2,
            m_mem_free="8G",
            max_attempts=3,
            max_runtime_seconds=60 * 60 * 3,
            queue='all.q')
        subtract_tasks.append(task)
    workflow.add_tasks(subtract_tasks)

    # once the new draws exist, save results
    for modelable_entity_id in [19390, 19394]:
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--modelable_entity_id", modelable_entity_id,
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--out_dir", args.out_dir
            ],
            name=f"malaria_save_{modelable_entity_id}",
            tag="malaria_save",
            upstream_tasks=subtract_tasks,
            num_cores=8,
            m_mem_free="100G",
            max_attempts=3,
            max_runtime_seconds=60 * 60 * 24,
            queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
def build_jobmon_workflow(self, identifier=None, extra_arguments=None):
    """
    Returns jobmon workflow that represents cascade job dag.

    Args:
        identifier (str): A unique string to identify this workflow for
            JobMon. Running twice with the same string will restart a
            workflow.
        extra_arguments (List[str]): Command-line arguments to add to every
            UGE Job specified in Jobmon.

    Returns:
        jobmon.Workflow: With all Jobmon tasks created.
    """
    extra_arguments = extra_arguments if extra_arguments else list()

    cv_iters = None if not self.run_cv else list(range(11))

    demo = Demographics(self.mvid)
    lsvid = self.mvm.location_set_version_id.values[0]
    lt = loctree(
        location_set_id=demo.LOCATION_SET_ID,
        location_set_version_id=lsvid,
        gbd_round_id=demo.gbd_round_id)

    desc = self.mvm.description.values[0]

    jobdag = make_dag(
        mvid=self.mvid, loctree=lt, cv_iter=cv_iters,
        add_arguments=extra_arguments
    )

    env = settings['env_variables']['ENVIRONMENT_NAME']
    identifier = identifier if identifier else f"dismod_{self.mvid}_{env}"
    wf = Workflow(
        workflow_args=identifier,
        name=f"dismod_{self.mvid}_{env}",
        resume=True,
        description=desc,
        project=self.project,
        stderr=self.logdir,
        stdout=self.logdir,
        seconds_until_timeout=1210000)

    # since we're looping through the dict and mutating each JobNode
    # to contain a reference to a PythonTask, we require the jobdag dict
    # to be sorted such that we've already visited all upstream tasks of
    # any given node.
    for jobname, dagnode in jobdag.items():
        dagnode.add_job(wf, jobdag, self.mvm)

    return wf
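# Hypothetical caller sketch (not from the original source): `cascade` stands
# in for an instance of the class that defines build_jobmon_workflow, and the
# extra argument is illustrative. Reusing the same identifier resumes the
# previous workflow rather than starting a fresh one.
wf = cascade.build_jobmon_workflow(
    identifier=f"dismod_{cascade.mvid}_prod",
    extra_arguments=["--quiet"])  # appended to every UGE job; placeholder flag
exit_status = wf.run()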
def __init__(
        self,
        parameters: params.master.CoDCorrectParameters,
        resume: bool = False
):
    """
    Creates an instance of a FauxCorrect JobSwarm.

    Arguments:
        parameters (parameters.master.FauxCorrectParameters): instance of
            the FauxCorrect parameters that this job swarm will execute.
    """
    self.parameters: params.master.CoDCorrectParameters = parameters

    # Intuit the root code directory
    self.code_dir: str = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )

    # Create workflow object
    self.workflow: Workflow = Workflow(
        workflow_args=DAG.Workflow.CODCORRECT_WORKFLOW_ARGS.format(
            version_id=self.parameters.version_id,
            timestamp=datetime.datetime.now().isoformat()
        ),
        name=DAG.Workflow.CODCORRECT_NAME.format(
            version_id=self.parameters.version_id
        ),
        project=DAG.Workflow.PROJECT,
        stdout=os.path.join(
            self.parameters.parent_dir, FilePaths.LOG_DIR, FilePaths.STDOUT
        ),
        stderr=os.path.join(
            self.parameters.parent_dir, FilePaths.LOG_DIR, FilePaths.STDERR
        ),
        resume=resume,
        # 2 weeks (14*24*60*60) seconds
        seconds_until_timeout=(1209600)
    )

    self.task_map: Dict[str, Dict[str, PythonTask]] = defaultdict(dict)
class Swarm_binning(object):

    fips_dict = {
        'al': 1, 'ak': 2, 'ar': 5, 'az': 4, 'ca': 6, 'co': 8, 'ct': 9,
        'de': 10, 'dc': 11, 'fl': 12, 'ga': 13, 'hi': 15, 'id': 16,
        'il': 17, 'in': 18, 'ia': 19, 'ks': 20, 'ky': 21, 'la': 22,
        'me': 23, 'md': 24, 'ma': 25, 'mi': 26, 'mn': 27, 'ms': 28,
        'mo': 29, 'mt': 30, 'ne': 31, 'nv': 32, 'nh': 33, 'nj': 34,
        'nm': 35, 'ny': 36, 'nc': 37, 'nd': 38, 'oh': 39, 'ok': 40,
        'or': 41, 'pa': 42, 'ri': 44, 'sc': 45, 'sd': 46, 'tn': 47,
        'tx': 48, 'ut': 49, 'vt': 50, 'va': 51, 'wa': 53, 'wv': 54,
        'wi': 55, 'wy': 56, 'pr': 72
    }

    def __init__(self, states):
        self.states = states
        self.county_dict = load_county_dicts(states)
        self.tract_dict = load_tract_dicts(states)

        self.wflow = Workflow(
            workflow_args='{}_bin_co_04'.format(states[0][:2]),
            name='count_zero_pop_tracts',
            project='proj_cost_effect',
            stderr='/ihme/scratch/users/{}/sgeoutput'.format(user),
            stdout='/ihme/scratch/users/{}/sgeoutput'.format(user),
            working_dir='/homes/{}'.format(user),
            seconds_until_timeout=len(self.states) * 60 * 10)

    def add_state_counties(self):
        interpreter = '/ihme/code/beatrixh/miniconda/envs/pyomo/bin/python'
        script = '/ihme/code/beatrixh/microsim_2020/census_2020/synthetic_pop/binning/count_bins_head.py'
        for state in self.states:
            counties = self.county_dict[state]  # grab counties per state
            for county in counties:
                tracts = self.tract_dict[(state, county)]  # grab tracts per county
                rtime = len(tracts) * 2  # expected runtime
                rtime = rtime + 600  # buffer

                args = state + ' ' + str(county)
                cmd = interpreter + ' ' + script + ' ' + args

                task = BashTask(cmd,
                                name='bin_and_calc_n_hi_{}_{}'.format(
                                    state, county),
                                num_cores=1,
                                m_mem_free=10,
                                max_attempts=3,
                                max_runtime_seconds=rtime,
                                resource_scales={
                                    'm_mem_free': 0.3,
                                    'max_runtime_seconds': 2.0
                                },
                                queue='long.q')
                self.wflow.add_task(task)
                print("added {}".format(task.name))
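# Hypothetical driver sketch for the binning swarm above; the state list is a
# placeholder and `user` is assumed to be the module-level username referenced
# in __init__.
swarm = Swarm_binning(states=['al', 'ak'])
swarm.add_state_counties()  # one BashTask per (state, county)
status = swarm.wflow.run()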
class Find_pop_zero_tracts(object):

    fips_dict = {
        'al': 1, 'ak': 2, 'ar': 5, 'az': 4, 'ca': 6, 'co': 8, 'ct': 9,
        'de': 10, 'dc': 11, 'fl': 12, 'ga': 13, 'hi': 15, 'id': 16,
        'il': 17, 'in': 18, 'ia': 19, 'ks': 20, 'ky': 21, 'la': 22,
        'me': 23, 'md': 24, 'ma': 25, 'mi': 26, 'mn': 27, 'ms': 28,
        'mo': 29, 'mt': 30, 'ne': 31, 'nv': 32, 'nh': 33, 'nj': 34,
        'nm': 35, 'ny': 36, 'nc': 37, 'nd': 38, 'oh': 39, 'ok': 40,
        'or': 41, 'pa': 42, 'ri': 44, 'sc': 45, 'sd': 46, 'tn': 47,
        'tx': 48, 'ut': 49, 'vt': 50, 'va': 51, 'wa': 53, 'wv': 54,
        'wi': 55, 'wy': 56, 'pr': 72
    }

    def __init__(self, states):
        self.states = states

        self.wflow = Workflow(
            workflow_args='{}_find_zeros_04'.format(states[0][:2]),
            name='count_zero_pop_tracts',
            project='proj_cost_effect',
            stderr='/ihme/scratch/users/{}/sgeoutput'.format(user),
            stdout='/ihme/scratch/users/{}/sgeoutput'.format(user),
            working_dir='/homes/{}'.format(user),
            seconds_until_timeout=len(self.states) * 60 * 10)

    def add_states(self):
        interpreter = '/ihme/code/beatrixh/miniconda/envs/pyomo/bin/python'
        script = '/ihme/code/beatrixh/microsim_2020/census_2020/synthetic_pop/gen_synth_pop/identify_pop_zero_tracts.py'
        for state in self.states:
            args = state
            cmd = interpreter + ' ' + script + ' ' + args

            task = BashTask(cmd,
                            name='find_pop_zero_tracts_{}'.format(state),
                            num_cores=1,
                            m_mem_free=10,
                            max_attempts=3,
                            max_runtime_seconds=60 * 10,
                            resource_scales={
                                'm_mem_free': 0.3,
                                'max_runtime_seconds': 2.0
                            },
                            queue='all.q')
            self.wflow.add_task(task)
            print("added {}".format(task.name))
def generate_workflow(self, wf_name):
    wf = Workflow(workflow_args=wf_name,
                  project="proj_mortenvelope",
                  stdout=self.stdout,
                  stderr=self.stderr,
                  seconds_until_timeout=777600,
                  resume=True)

    self.create_directories(master_dir=self.master_dir,
                            subdirs=[
                                'inputs', 'shock_numbers', 'hiv_adjust',
                                'logs', 'upload', 'abridged_lt', 'full_lt'
                            ])
    self.create_directories(master_dir=self.reckoning_output_dir,
                            subdirs=[
                                'lt_whiv', 'lt_hivdel', 'envelope_whiv',
                                'envelope_hivdel'
                            ])
    self.create_directories(master_dir=self.full_lt_dir,
                            subdirs=['no_hiv', 'with_hiv', 'with_shock'])
    self.create_directories(master_dir=self.abridged_lt_dir,
                            subdirs=['no_hiv', 'with_hiv', 'with_shock'])
    self.create_directories(master_dir=self.upload_dir)
    self.create_directories(
        master_dir=self.log_dir,
        subdirs=[
            'full_with_hiv_mx_vs_no_hiv', 'full_shock_mx_vs_with_hiv',
            'abridged_with_hiv_mx_vs_no_hiv', 'abridged_shock_mx_vs_with_hiv',
            'abridged_no_hiv_qx_1_5', 'abridged_with_hiv_qx_1_5',
            'abridged_shock_qx_1_5', 'shock_rate_compare', 'ax_compare'
        ])

    # Get locations
    most_detail_locations = call_mort_function("get_locations", {
        "level": "lowest",
        "gbd_year": self.gbd_year
    })
    most_detail_loc_ids = most_detail_locations.location_id.tolist()

    # Generate save inputs task
    # job name: agg_save_inputs_{}
    # script being run: save_inputs.R
    save_inputs_task = self.generate_save_inputs_task(upstream_tasks=[])
    wf.add_task(save_inputs_task)

    # Generate full lt tasks
    # job name: gen_full_{loc}_{version}
    # script being run: full_lt.R
    full_lt_tasks = {}
    for loc in most_detail_loc_ids:
        full_lt_tasks[loc] = self.generate_full_lt_task(
            upstream_tasks=[save_inputs_task], loc=loc)
        wf.add_task(full_lt_tasks[loc])

    # Run finalizer
    if self.run_finalizer:
        finalizer_run_task = self.generate_finalizer_task(
            upstream_tasks=full_lt_tasks.values())
        wf.add_task(finalizer_run_task)

    # Generate rest of full_lt tasks and add to the workflow
    # job names: "agg_full_{loc}_{lt_type}_{version}"
    # script being run: aggregate_lts.R
    if self.aggregate_full_lts:
        # Get aggregate locations
        locations = call_mort_function("get_locations", {
            "level": "all",
            "gbd_year": self.gbd_year
        })
        agg_locations = locations[(locations.level == 3) & (
            ~locations.location_id.isin(most_detail_loc_ids))]
        agg_loc_ids = agg_locations.location_id.tolist()

        # Generate agg tasks
        agg_tasks = {}
        for loc in agg_loc_ids:
            num_children = len(most_detail_locations.loc[
                most_detail_locations.path_to_top_parent.str.contains(
                    "," + str(loc) + ",")])
            for lt_type in ['with_shock', 'with_hiv', 'no_hiv']:
                agg_task_key = str(loc) + "_" + lt_type
                agg_tasks[agg_task_key] = self.generate_aggregate_lt_task(
                    upstream_tasks=full_lt_tasks.values(),
                    loc=loc,
                    lt_type=lt_type,
                    num_children=num_children)
                wf.add_task(agg_tasks[agg_task_key])

    # Generate upload task
    # job name: full_life_table_upload_{}
    # script name: compile_upload_results.R
    if self.upload:
        upload_task = self.generate_full_upload_task(
            upstream_tasks=agg_tasks.values())
        wf.add_task(upload_task)

    return wf
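# Hypothetical sketch of calling the builder above (the owning class is not
# shown in this snippet, so `runner` is a stand-in for an already-constructed
# instance and the workflow name is a placeholder). generate_workflow() only
# assembles the DAG; the caller is responsible for running it.
wf = runner.generate_workflow(wf_name='full_life_tables_v99')
status = wf.run()
print(f'Full life table workflow finished with status {status}')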
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_causal_attribution_new_{args.decomp_step}_{today_string}',
        name=f'anemia_causal_attribution_{args.decomp_step}_{today_string}',
        description=f'Anemia: Causal attribution for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    causal_attribution_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for location_id in demo['location_id']:
        prev_year_task = None
        for year in args.year_id:
            cmd = (
                f'FILEPATH '
                f'FILEPATH '
                f'FILEPATH '
                f'{location_id} {year} {args.gbd_round_id} {args.decomp_step} '
                f'{path_to_directory}/ {args.out_dir}'
            )
            if prev_year_task:
                task = BashTask(
                    command=cmd,
                    name=f'causal_attribution_{location_id}_{year}',
                    tag='causal_attribution',
                    upstream_tasks=[prev_year_task],
                    num_cores=1,
                    m_mem_free='4G',
                    max_attempts=3,
                    max_runtime_seconds=60 * 60 * 2,
                    queue='all.q')
            else:
                task = BashTask(
                    command=cmd,
                    name=f'causal_attribution_{location_id}_{year}',
                    tag='causal_attribution',
                    num_cores=1,
                    m_mem_free='4G',
                    max_attempts=3,
                    max_runtime_seconds=60 * 60 * 2,
                    queue='all.q')
            causal_attribution_tasks.append(task)
            prev_year_task = task
    workflow.add_tasks(causal_attribution_tasks)

    # once the draws exist, save results
    meids = pd.read_excel("FILEPATH")
    meids = meids.filter(like='modelable_entity').values.flatten()
    for modelable_entity_id in meids.tolist():
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--modelable_entity_id", modelable_entity_id,
                "--year_id", " ".join([str(yr) for yr in args.year_id]),
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--save_dir", "FILEPATH"
            ],
            name=f"save_{modelable_entity_id}",
            tag="save",
            upstream_tasks=causal_attribution_tasks,
            num_cores=8,
            m_mem_free="100G",
            max_attempts=3,
            max_runtime_seconds=60 * 60 * 24,
            queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
class STGPRJobSwarm:

    def __init__(self, run_id, run_type, holdouts, draws, nparallel,
                 n_parameter_sets, cluster_project, error_log_path,
                 output_log_path, location_set_id, gbd_round_id,
                 custom_stage1, rake_logit, code_version, decomp_step,
                 modelable_entity_id, output_path):
        self.run_id = run_id
        self.run_type = run_type
        self.holdouts = holdouts
        self.draws = draws
        self.nparallel = nparallel
        self.n_parameter_sets = n_parameter_sets
        self.cluster_project = cluster_project
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.custom_stage1 = custom_stage1
        self.rake_logit = rake_logit
        self.code_version = code_version
        self.decomp_step = decomp_step
        self.output_path = output_path
        self.is_diet_model = modelable_entity_id in [
            2430, 2442, 2431, 2434, 2433, 2437, 2435, 2428, 2429, 2436,
            2427, 2432, 2440, 9804, 2441, 2544, 23766, 2438, 23604, 23683
        ]

        # set some stuff
        self.max_attempts = 3

        # create workflow
        self.workflow = Workflow(workflow_args=f'stgpr_{self.run_id}',
                                 project=self.cluster_project,
                                 stderr=error_log_path,
                                 stdout=output_log_path,
                                 resume=True)

        # set up job lists
        self.stage1_jobs = {}
        self.st_jobs = {}
        self.descanso_jobs = {}
        self.gpr_jobs = {}
        self.post_jobs = {}
        self.rake_jobs = {}
        self.cleanup_jobs = {}
        self.eval_jobs = {}

        # set up conditionals
        if self.run_type == 'in_sample_selection':
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), MAX_SUBMISSIONS)
        elif self.run_type == 'oos_selection':
            split = math.floor(float(MAX_SUBMISSIONS) / float(self.holdouts))
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), split)
        else:
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), 1)

    def prep_parallelization_groups(self):
        """Parallelize by splitting locations to be modelled into
        *nparallel* groups. This function grabs the location hierarchy,
        identifies needed locations, and assigns each one to a
        parallelization group."""
        session = ezfuncs.get_session(conn_def='epi')
        locs = query.get_locations(self.location_set_id, self.gbd_round_id,
                                   self.decomp_step, session)[0]
        self.locs = locs.sort_values(by=['level_{}'.format(NATIONAL_LEVEL)])
        self.parallel_groups = np.array_split(
            self.locs.loc[self.locs.level >= NATIONAL_LEVEL][SPACEVAR].values,
            self.nparallel)

        # prep raking upstreams and submission locations
        lvl = 'level_{}'.format(NATIONAL_LEVEL)
        self.subnat_locations = (locs.loc[locs.level > NATIONAL_LEVEL,
                                          lvl].unique().astype(int))

    def assign_rake_runtimes(self):
        """
        Rake jobs have immensely different times based almost exclusively
        on the number of subnationals nested within the national location.

        Assign memory based on the number of subnationals for each national
        location with subnationals, using (slightly modified) intercept and
        beta values from a super simple linear regression,
        'memory ~ n_subnats', for one very data-dense model
        (ie a good upper bound for all st-gpr models).

        Memory Intercept: .75
        Memory Beta_N_subnats: .5

        (More conservative for runtime because not as wasteful to go high)
        Runtime Intercept: 5 (min), so 300 sec
        Runtime Beta_N_subnats: .35 (min), so 21 sec
        """
        natcol = f'level_{NATIONAL_LEVEL}'
        self.locs['subnat'] = \
            (self.locs['level'] > NATIONAL_LEVEL).astype(int)
        n_subnats = self.locs.groupby(natcol)['subnat'].sum()
        n_subnats = n_subnats.loc[n_subnats > 0].reset_index(name='N')
        n_subnats[natcol] = n_subnats[natcol].astype(int)

        # assign memory and runtime
        n_subnats['mem'] = .75 + .5 * n_subnats['N']
        n_subnats['runtime'] = 300 + 21 * n_subnats['N']
        n_subnats = n_subnats.rename(columns={natcol: 'location'})

        self.rake_memory_df = n_subnats.copy()

    def create_stage1_jobs(self):
        """First set of tasks, thus no upstream tasks.
        Only run stage1 if no custom stage1 (custom_stage1) estimates."""
        for ko in list(range(0, self.holdouts + 1)):
            # ie shell, script, and args pasted together
            model_root = os.path.join(paths.CODE_ROOT, 'model')
            cmd = (f'{RSHELL} -s {STAGE1_SCRIPT} '
                   f'{self.output_path} {model_root} {ko}')

            task = BashTask(command=cmd,
                            name=f'stage1_{self.run_id}_{ko}',
                            num_cores=1,
                            m_mem_free='3G',
                            max_attempts=2,
                            max_runtime_seconds=300,
                            tag='stgpr_stage1',
                            queue='all.q',
                            resource_scales=RESOURCE_SCALES,
                            hard_limits=True)
            self.workflow.add_task(task)
            self.stage1_jobs[task.name] = task

    def create_st_jobs(self):
        for ko in range(0, self.holdouts + 1):
            upstream_job = self.stage1_jobs['stage1_{}_{}'.format(
                self.run_id, ko)]
            for param_group in range(0, len(self.param_groups)):
                for loc_group in range(0, self.nparallel):
                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])
                    jname = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group, loc_group)

                    memory = 50
                    runtime = 1500
                    if self.is_diet_model:
                        memory = 120
                        runtime = 28800  # 8 hours

                    task = PythonTask(script=ST_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.run_type, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=6,
                                      m_mem_free=f'{memory}G',
                                      max_attempts=3,
                                      max_runtime_seconds=runtime,
                                      tag='stgpr_spacetime',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)
                    task.add_upstream(upstream_job)
                    self.workflow.add_task(task)
                    self.st_jobs[task.name] = task

    def create_descanso_jobs(self):
        """Depends on aggregate locations coming out of loc agg jobs"""
        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])
                runtime = 3600 if self.is_diet_model else 300
                task = PythonTask(
                    script=IM_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.draws,
                        self.nparallel, submit_params
                    ],
                    name=f'descanso_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    m_mem_free='20G',  # upped from 5 to 20 for variance simulation
                    max_runtime_seconds=runtime,
                    max_attempts=2,
                    tag='stgpr_amp_nsv',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add ST upstreams
                for loc_group in list(range(0, self.nparallel)):
                    st_label = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                       param_group, loc_group)
                    upstream_job = self.st_jobs[st_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.descanso_jobs[task.name] = task

    def create_gpr_jobs(self):
        # set runtime and memory based on draws
        gpr_runtime = 1200
        gpr_memory = 4
        if self.draws == 100:
            gpr_runtime = 1500
            gpr_memory = 7
        elif self.draws == 1000:
            gpr_runtime = 1800
            gpr_memory = 10
        if self.is_diet_model:
            gpr_runtime *= 3
            gpr_memory *= 3

        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                upstream_job = self.descanso_jobs['descanso_{}_{}_{}'.format(
                    self.run_id, ko, param_group)]
                for loc_group in list(range(0, self.nparallel)):
                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])
                    jname = 'gpr_{}_{}_{}_{}'.format(self.run_id, ko,
                                                     param_group, loc_group)
                    task = PythonTask(script=GPR_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.draws, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=1,
                                      m_mem_free=f'{gpr_memory}G',
                                      max_runtime_seconds=gpr_runtime,
                                      max_attempts=2,
                                      tag='stgpr_gpr',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)
                    task.add_upstream(upstream_job)
                    self.workflow.add_task(task)
                    self.gpr_jobs[task.name] = task

    def create_rake_jobs(self):
        """Depends on GPR jobs including all the subnationals and national
        locations for each rake job, parallelized out by parent_id.
        Raking only done on the first KO (KO 0), which does not hold out
        any data from the dataset."""
        for loc in self.subnat_locations:
            mem = int(
                np.ceil(
                    self.rake_memory_df.query(f'location == {loc}')
                    ['mem'].iat[0]))
            rt = int(
                np.ceil(
                    self.rake_memory_df.query(f'location == {loc}')
                    ['runtime'].iat[0]))

            if self.draws == 1000:
                mem *= 2
                rt *= 3
                rt = max(rt, 7200)

            if self.is_diet_model:
                mem *= 2
                rt *= 3
                rt = max(rt, 14400)

            task = PythonTask(script=RAKE_SCRIPT,
                              args=[
                                  self.run_id, self.output_path, 0,
                                  self.draws, self.run_type,
                                  self.rake_logit, loc
                              ],
                              name=f'rake_{self.run_id}_{loc}',
                              num_cores=1,
                              m_mem_free=f'{mem}G',
                              max_runtime_seconds=rt,
                              max_attempts=2,
                              tag='stgpr_rake',
                              queue='all.q',
                              resource_scales=RESOURCE_SCALES,
                              hard_limits=True)

            # grab all subnationals and country location_ids associated
            # with a country
            lvl = 'level_{}'.format(NATIONAL_LEVEL)
            all_needed_locs = self.locs.loc[self.locs[lvl] == loc,
                                            'location_id'].unique()

            # add each gpr job containing a needed national/subnational
            # for raking to upstreams
            if self.holdouts == 0:
                for param_group in list(range(0, len(self.param_groups))):
                    for loc_group in list(range(0, self.nparallel)):
                        loc_group_vals = self.parallel_groups[loc_group]
                        common_elements = len(
                            intersection(all_needed_locs.tolist(),
                                         loc_group_vals.tolist()))
                        if common_elements > 0:
                            task.add_upstream(
                                self.gpr_jobs['gpr_{}_0_{}_{}'.format(
                                    self.run_id, param_group, loc_group)])
            else:
                task.add_upstream(self.eval_jobs['eval_{}'.format(
                    self.run_id)])

            self.workflow.add_task(task)
            self.rake_jobs[task.name] = task

    def create_post_jobs(self):
        """Depends on rake jobs. Calculates fit stats and cleans up
        file folders, no mas."""
        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])
                task = PythonTask(
                    script=POST_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.run_type,
                        self.holdouts, submit_params
                    ],
                    name=f'post_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    m_mem_free='2G',
                    max_runtime_seconds=300,
                    max_attempts=2,
                    tag='stgpr_post',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add GPR upstreams
                for loc_group in list(range(0, self.nparallel)):
                    gp_label = 'gpr_{}_{}_{}_{}'.format(
                        self.run_id, ko, param_group, loc_group)
                    upstream_job = self.gpr_jobs[gp_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.post_jobs[task.name] = task

    def create_cleanup_jobs(self):
        """Saves rake summaries and removes tempfiles no longer needed"""
        runtime = 600 if self.draws == 0 else 7200
        for ko in list(range(0, self.holdouts + 1)):
            task = PythonTask(script=CLEANUP_SCRIPT,
                              args=[
                                  self.run_id, self.output_path,
                                  self.run_type, ko, self.draws
                              ],
                              name=f'clean_{self.run_id}_{ko}',
                              num_cores=1,
                              m_mem_free='1G',
                              max_runtime_seconds=runtime,
                              max_attempts=1,
                              tag='stgpr_clean',
                              queue='all.q')
            if ko == 0:
                for loc in self.subnat_locations:
                    task.add_upstream(self.rake_jobs['rake_{}_{}'.format(
                        self.run_id, loc)])
            else:
                task.add_upstream(self.eval_jobs['eval_{}'.format(
                    self.run_id)])
            self.workflow.add_task(task)
            self.cleanup_jobs[task.name] = task

    def create_eval_jobs(self):
        """
        For hyperparameter selection runs only, determine best
        hyperparameter set based on in-sample or out-of-sample RMSE.
        - run_type = "in_sample_selection"
        - run_type = "oos_selection"

        For runs with only one set of parameters, set the best_param_set
        to the *only* param_set (param_set 0) for consistency in rake
        inputs.

        Lastly, just collect the disparate fit_stats files and combine
        into a single file, saved as fit_stats.csv for all run types.
        """
        task = PythonTask(script=EVAL_SCRIPT,
                          args=[
                              self.run_id, self.output_path, self.run_type,
                              self.holdouts, self.n_parameter_sets
                          ],
                          name=f'eval_{self.run_id}',
                          num_cores=1,
                          m_mem_free='500M',
                          max_runtime_seconds=180,
                          max_attempts=2,
                          tag='stgpr_eval',
                          queue='all.q',
                          resource_scales=RESOURCE_SCALES,
                          hard_limits=True)

        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                post_label = 'post_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group)
                task.add_upstream(self.post_jobs[post_label])

        self.workflow.add_task(task)
        self.eval_jobs[task.name] = task

    def run(self):
        # run main model estimation pipeline
        self.prep_parallelization_groups()
        self.create_stage1_jobs()
        self.create_st_jobs()
        self.create_descanso_jobs()
        self.create_gpr_jobs()
        self.create_post_jobs()

        # choose best parameter set to run rake for
        self.create_eval_jobs()

        # run rake/aggregation step and clean outputs
        self.assign_rake_runtimes()
        self.create_rake_jobs()
        self.create_cleanup_jobs()

        status = self.workflow.run()
        print(f'Workflow finished with status {status}')
        return status
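# A small worked example (not part of the original class) of the resource
# regression encoded in assign_rake_runtimes() above: memory ~ 0.75 + 0.5 * N
# GB and runtime ~ 300 + 21 * N seconds for a national location with N
# subnationals.
def rake_resources(n_subnats: int):
    """Illustrative helper mirroring assign_rake_runtimes()."""
    mem_gb = 0.75 + 0.5 * n_subnats
    runtime_seconds = 300 + 21 * n_subnats
    return mem_gb, runtime_seconds

# e.g. rake_resources(30) -> (15.75, 930): roughly 16 GB and 15.5 minutes,
# before create_rake_jobs() applies the draw-count and diet-model multipliers
# and runtime floors (7200s / 14400s).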
class EpicWorkFlow(object):

    CODE_DIR = os.path.dirname(os.path.realpath(__file__))
    USERNAME = getpass.getuser()
    DATA_DIR = "FILEPATH"
    LOG_DIR = os.path.join('PATH', USERNAME)
    YEAR_IDS = [1990, 1995, 2000, 2005, 2010, 2015, 2017, 2019]
    N_DRAWS = 1000

    def __init__(self, version, mapbuilder, decomp_step, gbd_round_id, resume):

        # validate decomp_step
        validate_decomp_step("EPIC", decomp_step, gbd_round_id)

        self.DATA_DIR = os.path.join(self.DATA_DIR, str(version))

        if not os.path.exists(self.DATA_DIR):
            os.makedirs(self.DATA_DIR)
            os.makedirs(os.path.join(self.DATA_DIR, FilePaths.INPUT_FILES_DIR))

        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id
        self.resume = resume

        # create epic json map
        self.emap = mapbuilder.downstream_only("como")

        # instantiate the factories
        self._task_registry = {}
        self._sev_split_fac = SevSplitTaskFactory(self._task_registry)
        self._ex_adjust_fac = ExAdjustFactory(self._task_registry)
        self._super_squeeze_fac = SuperSqueezeFactory(self._task_registry)
        self._save_fac = SaveFactory()

        self.workflow = Workflow(
            workflow_args="epic_v{version}".format(version=version),
            name="EPIC Central Machinery",
            project=DAG.Tasks.PROJECT,
            stdout=os.path.join(self.LOG_DIR, "output"),
            stderr=os.path.join(self.LOG_DIR, "errors"),
            resume=resume,
            seconds_until_timeout=435600)

        if not resume:
            # Save best input models as csv for posting to EPIC tracker HUB
            # page, then separate into individual json files for use in
            # downstream scripts. Take care that downstream processes do not
            # pick up a model_version_id from a previous run. Only collect
            # the best models once per run so we know exactly what was
            # available at the start of the run and what was consequently
            # used in the rest of the workflow.
            best_models = mapbuilder.best_models
            inputs = [int(x) for x in mapbuilder.inputs]
            best_models = best_models.loc[best_models[
                Params.MODELABLE_ENTITY_ID].isin(inputs)]
            best_models.to_csv(
                os.path.join(self.DATA_DIR,
                             FilePaths.INPUT_FILES_DIR,
                             FilePaths.BEST_MODELS_FILE_PATTERN),
                index=False,
                encoding="utf8")
            for index, row in best_models.iterrows():
                SaveFactory.save_model_metadata(self.DATA_DIR,
                                                row.modelable_entity_id,
                                                row.model_version_id,
                                                row.decomp_step)

        self._task_map = {
            DAG.Tasks.SPLIT: self._add_sev_split_task,
            DAG.Tasks.SUPER_SQUEEZE: self._add_super_squeeze_task,
            DAG.Tasks.EX_ADJUST: self._add_ex_adjust_task
        }

        # run every process in the pipeline regardless of whether or not
        # there is already a model saved
        self.pgraph = mapbuilder.P

        # get process nodes and build out jobmon workflow
        # create a subgraph from the process nodes
        top_sort = nx.topological_sort(self.pgraph)
        for node in top_sort:
            if node == mapbuilder.start_node:
                pass
            elif DAG.Tasks.SPLIT in node:
                self._task_map[DAG.Tasks.SPLIT](node)
            elif DAG.Tasks.SUPER_SQUEEZE in node:
                self._task_map[DAG.Tasks.SUPER_SQUEEZE](node)
            else:
                self._task_map[DAG.Tasks.EX_ADJUST](node)

    def _create_output_directories(self, meid_list):
        for meid in meid_list:
            directory = os.path.join(self.DATA_DIR, str(meid))

            if os.path.exists(directory) and not self.resume:
                shutil.rmtree(directory)
                os.makedirs(directory)
            elif os.path.exists(directory) and self.resume:
                logging.info(
                    f"Directory exists for modelable_entity_id {meid} "
                    f"and resume is {self.resume}. Do not delete anything. "
                    f"Continue workflow.")
            else:
                os.makedirs(directory)

    def _add_sev_split_task(self, node):
        logging.info(f"Adding {node} task")
        split_map = self.emap[node]
        split_id = int(split_map["kwargs"]["split_id"])
        split_meta = SeverityPropMetadata(split_id=split_id,
                                          decomp_step=self.decomp_step,
                                          gbd_round_id=self.gbd_round_id)
        split_version_id = split_meta.best_version
        meta_version = split_meta.get_metadata_version(split_version_id)
        parent_meid = int(meta_version.parent_meid())
        children_meids = [int(x) for x in meta_version.child_meid().split(",")]

        # make output directories
        self._create_output_directories(children_meids)

        split_task = self._sev_split_fac.get_task(
            node=node,
            process_graph=self.pgraph,
            split_version_id=split_version_id,
            output_dir=self.DATA_DIR,
            decomp_step=self.decomp_step,
            year_id=self.YEAR_IDS,
            n_draws=self.N_DRAWS)
        self.workflow.add_task(split_task)
        self._task_registry[SevSplitTaskFactory.get_task_name(
            node)] = split_task

        description = (
            f"Central_severity_split_{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        for meid in children_meids:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]
            self._add_save_task(meid, "{location_id}.h5", description,
                                measure_id, self.YEAR_IDS, self.N_DRAWS,
                                split_task)

    def _add_save_task(self, meid, input_file_pattern, description,
                       measure_id, year_id, n_draws, upstream_task):
        logging.info(f"Adding {meid} save task")
        args = {
            Params.PARENT_DIR: self.DATA_DIR,
            Params.INPUT_DIR: os.path.join(self.DATA_DIR, str(meid)),
            Params.INPUT_FILE_PATTERN: input_file_pattern,
            Params.MODELABLE_ENTITY_ID: meid,
            Params.DESCRIPTION: description,
            Params.MEASURE_ID: measure_id,
            Params.YEAR_ID: year_id,
            Params.DECOMP_STEP: self.decomp_step,
            Params.N_DRAWS: n_draws
        }
        save_task = self._save_fac.get_task(**args)
        for upt in list(np.atleast_1d(upstream_task)):
            save_task.add_upstream(upt)
        self.workflow.add_task(save_task)
        self._task_registry[SaveFactory.get_task_name(meid)] = save_task

    def _add_ex_adjust_task(self, node):
        logging.info(f"Adding {node} task")
        # compile submission arguments
        kwargs = self.emap[node]["kwargs"]
        try:
            copy_env_inc = kwargs.pop("copy_env_inc")
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]
        except KeyError:
            copy_env_inc = False
            measure_id = [gbd.measures.PREVALENCE]

        # make output directories
        self._create_output_directories(self.pgraph.nodes[node]["outs"])

        ex_adj_task = self._ex_adjust_fac.get_task(
            node=node,
            process_graph=self.pgraph,
            output_dir=self.DATA_DIR,
            decomp_step=self.decomp_step,
            year_id=self.YEAR_IDS,
            n_draws=self.N_DRAWS)
        self.workflow.add_task(ex_adj_task)
        self._task_registry[ExAdjustFactory.get_task_name(node)] = ex_adj_task

        description = (f"Exclusivity_adjustment_auto_mark_"
                       f"{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        for meid in self.pgraph.nodes[node]["outs"]:
            self._add_save_task(meid, "{location_id}.h5", description,
                                measure_id, self.YEAR_IDS, self.N_DRAWS,
                                ex_adj_task)

    def _add_super_squeeze_task(self, node):
        logging.info(f"Adding {node} task")

        # make output directories
        self._create_output_directories(self.pgraph.nodes[node]["outs"])

        # get dependency_list before parallelizing since the
        # dependencies are the same for each parallelized demographic
        dep_list = get_dependencies(node, self.pgraph, self._task_registry)

        epi_demo = get_demographics("epi", gbd_round_id=self.gbd_round_id)
        for location_id in epi_demo[Params.LOCATION_ID]:
            for year_id in self.YEAR_IDS:
                for sex_id in epi_demo[Params.SEX_ID]:
                    ss_task = self._super_squeeze_fac.get_task(
                        node=node,
                        output_dir=self.DATA_DIR,
                        location_id=location_id,
                        year_id=year_id,
                        sex_id=sex_id,
                        decomp_step=self.decomp_step,
                        n_draws=self.N_DRAWS,
                        dependency_list=dep_list)
                    self.workflow.add_task(ss_task)
                    self._task_registry[SuperSqueezeFactory.get_task_name(
                        node, location_id, year_id, sex_id)] = ss_task

        ss_upstream = [
            self._task_registry[t] for t in list(self._task_registry.keys())
            if DAG.Tasks.SUPER_SQUEEZE in t
        ]
        description = (
            f"Super_Squeeze_auto_mark_{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        measure_id = [gbd.measures.PREVALENCE]
        for meid in self.pgraph.nodes[node]["outs"]:
            self._add_save_task(
                meid, "{location_id}/{measure_id}_{year_id}_{sex_id}.h5",
                description, measure_id, self.YEAR_IDS, self.N_DRAWS,
                ss_upstream)
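# Hypothetical launch sketch for EpicWorkFlow above (not from the original
# source). `mapbuilder` stands in for an object exposing downstream_only(),
# best_models, inputs, P and start_node, as used in __init__; the version,
# decomp_step and gbd_round_id values are placeholders.
epic = EpicWorkFlow(
    version=10,
    mapbuilder=mapbuilder,
    decomp_step='iterative',
    gbd_round_id=6,
    resume=False)
status = epic.workflow.run()
print(f'EPIC workflow finished with status {status}')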
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_post_interp_temp_{args.decomp_step}_{today_string}',
        name=f'anemia_post_{args.decomp_step}_{today_string}',
        description=f'Anemia: Post-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    anemia_causes = ('hiv', 'pud', 'gastritis', 'esrd_dialysis', 'ckd3',
                     'ckd4', 'ckd5', 'cirrhosis')

    for anemia_cause in anemia_causes:

        # load in the info table
        info_df = pd.read_csv("FILEPATH")
        new_me_id_list = info_df['proportion_me'].tolist()

        # submit compute job for each me_id
        compute_prop_tasks = []
        for year in args.year_id:
            task = PythonTask(
                script="FILEPATH",
                args=[
                    "--year_id", year,
                    "--anemia_cause", anemia_cause,
                    "--gbd_round_id", args.gbd_round_id,
                    "--decomp_step", args.decomp_step,
                    "--out_dir", args.out_dir
                ],
                name=f"make_{anemia_cause}_props_{year}",
                tag="compute_props",
                num_cores=1,
                m_mem_free="12G",
                max_attempts=3,
                max_runtime_seconds=60 * 60 * 2,
                queue='all.q')
            compute_prop_tasks.append(task)
        workflow.add_tasks(compute_prop_tasks)

        # submit save result jobs after compute jobs finish
        for new_me_id in new_me_id_list:
            task = PythonTask(
                script="FILEPATH",
                args=[
                    "--modelable_entity_id", new_me_id,
                    "--year_ids", " ".join([str(yr) for yr in args.year_id]),
                    "--gbd_round_id", args.gbd_round_id,
                    "--decomp_step", args.decomp_step,
                    "--save_dir", 'FILEPATH'
                ],
                name=f"save_props_{new_me_id}",
                tag="save_props",
                upstream_tasks=compute_prop_tasks,
                num_cores=8,
                m_mem_free="90G",
                max_attempts=3,
                max_runtime_seconds=60 * 60 * 8,
                queue='long.q')
            workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
def generate_workflow(self, wf_name, run_mv):
    wf = Workflow(workflow_args=wf_name,
                  project="proj_mortenvelope",
                  stdout=self.stdout,
                  stderr=self.stderr,
                  resume=True,
                  seconds_until_timeout=174000)

    model_locations = call_mort_function("get_locations", {
        "gbd_type": "ap_old",
        "level": "estimate",
        "gbd_year": self.gbd_year
    })
    model_locations = model_locations["ihme_loc_id"].tolist()

    lt_task = self.generate_empirical_lt_prep_task(upstream_tasks=[])
    wf.add_task(lt_task)

    # optional machine vision prediction tasks
    if run_mv:
        mv_plot_task = {}
        for ihme_loc_id in model_locations:
            mv_plot_task[ihme_loc_id] = self.generate_mv_plots_task(
                [lt_task], ihme_loc_id)
            wf.add_task(mv_plot_task[ihme_loc_id])

        mv_run_task = {}
        for ihme_loc_id in model_locations:
            mv_run_task[ihme_loc_id] = self.generate_run_mv_task(
                mv_plot_task.values(), ihme_loc_id)
            wf.add_task(mv_run_task[ihme_loc_id])

        select_lts_task = self.generate_select_lts_task(
            mv_run_task.values(), run_mv)
        wf.add_task(select_lts_task)
    else:
        select_lts_task = self.generate_select_lts_task([lt_task], run_mv)
        wf.add_task(select_lts_task)

    return wf
def generate_workflow(self, wf_name):
    wf = Workflow(workflow_args=wf_name,
                  project="proj_mortenvelope",
                  stdout=self.stdout,
                  stderr=self.stderr,
                  resume=True,
                  seconds_until_timeout=864000)

    # Create folder structure
    self.create_directories()

    # Save location metadata if they haven't been saved yet
    if (not os.path.isfile(self.input_dir + "/lt_match_map.csv")
            or not os.path.isfile(self.input_dir + "/lt_env_locations.csv")):
        self.save_location_metadata()

    # Step 1: prep input data
    prep_task = self.generate_prep_task([])
    wf.add_task(prep_task)

    run_countries = pd.read_csv(
        self.input_dir + "/lt_match_map.csv")["ihme_loc_id"].tolist()

    # Step 2: generate life tables
    gen_lt_tasks = []
    for country in run_countries:
        country_task = self.generate_gen_lt_task([prep_task], country)
        wf.add_task(country_task)
        gen_lt_tasks.append(country_task)

    # Step 3: scale results
    scaling_tasks = []
    for year in self.run_years:
        year_task = self.generate_scaling_task(gen_lt_tasks, year)
        wf.add_task(year_task)
        scaling_tasks.append(year_task)

    # Step 4: Compile upload
    compile_task = self.generate_compile_upload_task(scaling_tasks)
    wf.add_task(compile_task)

    if self.send_slack:
        notify_task = self.generate_notify_task([compile_task])
        wf.add_task(notify_task)

    return wf
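# Hypothetical sketch of driving the builder above; `runner` stands in for the
# owning class instance (with input_dir, run_years, send_slack, etc. already
# set) and the workflow name is a placeholder. The resulting DAG runs prep,
# then one gen_lt task per country, then one scaling task per year, then
# compile/upload and an optional notification.
wf = runner.generate_workflow(wf_name='empirical_lt_run')
status = wf.run()
print(f'Workflow finished with status {status}')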