Example no. 1
    def __init__(
            self,
            code_dir,
            out_dir,
            version_id,
            cause_ids,
            decomp_step,
            gbd_round_id=GBD.GBD_ROUND_ID
    ):
        self.code_dir = code_dir
        self.out_dir = out_dir
        self.version_id = version_id
        self.cause_ids = cause_ids
        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id

        username = getpass.getuser()
        self.workflow = Workflow(
            workflow_args='imported_cases_v{version}_{timestamp}'.format(
                version=self.version_id,
                timestamp=datetime.datetime.now().isoformat()
            ),
            name="Imported Cases Generator",
            project='proj_codcorrect',
            stdout=f'FILEPATH',
            stderr=f'FILEPATH'
        )
Example no. 2
    def __init__(self, run_id, run_type, holdouts, draws, nparallel,
                 n_parameter_sets, cluster_project, error_log_path,
                 output_log_path, location_set_id, gbd_round_id, custom_stage1,
                 rake_logit, code_version, decomp_step, modelable_entity_id,
                 output_path):

        self.run_id = run_id
        self.run_type = run_type
        self.holdouts = holdouts
        self.draws = draws
        self.nparallel = nparallel
        self.n_parameter_sets = n_parameter_sets
        self.cluster_project = cluster_project
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.custom_stage1 = custom_stage1
        self.rake_logit = rake_logit
        self.code_version = code_version
        self.decomp_step = decomp_step
        self.output_path = output_path
        self.is_diet_model = modelable_entity_id in [
            2430, 2442, 2431, 2434, 2433, 2437, 2435, 2428, 2429, 2436, 2427,
            2432, 2440, 9804, 2441, 2544, 23766, 2438, 23604, 23683
        ]

        # default maximum number of task attempts
        self.max_attempts = 3

        # create workflow
        self.workflow = Workflow(workflow_args=f'stgpr_{self.run_id}',
                                 project=self.cluster_project,
                                 stderr=error_log_path,
                                 stdout=output_log_path,
                                 resume=True)

        # set up job lists
        self.stage1_jobs = {}
        self.st_jobs = {}
        self.descanso_jobs = {}
        self.gpr_jobs = {}
        self.post_jobs = {}
        self.rake_jobs = {}
        self.cleanup_jobs = {}
        self.eval_jobs = {}

        # set up conditionals
        if self.run_type == 'in_sample_selection':
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), MAX_SUBMISSIONS)
        elif self.run_type == 'oos_selection':
            split = math.floor(float(MAX_SUBMISSIONS) / float(self.holdouts))
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), split)
        else:
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), 1)
Example no. 3
    def __init__(self, states):
        self.states = states

        self.wflow = Workflow(
            workflow_args='{}_find_zeros_04'.format(states[0][:2]),
            name='count_zero_pop_tracts',
            project='proj_cost_effect',
            stderr='/ihme/scratch/users/{}/sgeoutput'.format(user),
            stdout='/ihme/scratch/users/{}/sgeoutput'.format(user),
            working_dir='/homes/{}'.format(user),
            seconds_until_timeout=len(self.states) * 60 * 10)
Example no. 4
class ImportedCasesJobSwarm(object):
    """This class creates and submits the imported cases task dag."""
    ADDITIONAL_RESTRICTIONS = {562: 'mental_drug_opioids'}
    def __init__(
            self,
            code_dir,
            out_dir,
            version_id,
            cause_ids,
            decomp_step,
            gbd_round_id=GBD.GBD_ROUND_ID
    ):
        self.code_dir = code_dir
        self.out_dir = out_dir
        self.version_id = version_id
        self.cause_ids = cause_ids
        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id

        username = getpass.getuser()
        self.workflow = Workflow(
            workflow_args='imported_cases_v{version}_{timestamp}'.format(
                version=self.version_id,
                timestamp=datetime.datetime.now().isoformat()
            ),
            name="Imported Cases Generator",
            project='proj_codcorrect',
            stdout=f'FILEPATH',
            stderr=f'FILEPATH'
        )

    def create_imported_cases_jobs(self):
        """Generates the tasks and adds them to the task_dag."""
        # TODO: profile and revise core/mem allocation.
        for cause in self.cause_ids:
            task = PythonTask(
                script=os.path.join(self.code_dir, 'imported_cases.py'),
                args=[self.version_id,
                      '--cause_id', cause,
                      '--decomp_step', self.decomp_step,
                      '--gbd_round_id', self.gbd_round_id,
                      '--output_dir', self.out_dir],
                name='imported_cases_{}_{}'.format(self.version_id, cause),
                num_cores=42,
                m_mem_free="100.0G",
                max_attempts=3,
                tag='imported_cases',
                queue='all.q')
            self.workflow.add_task(task)

    def run(self):
        success = self.workflow.run()
        return success
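A minimal driver sketch for the class above; every argument value below is an assumption for illustration, not taken from the source:

# Hypothetical usage of ImportedCasesJobSwarm (illustrative values only)
swarm = ImportedCasesJobSwarm(
    code_dir='FILEPATH',
    out_dir='FILEPATH',
    version_id=100,
    cause_ids=[562],
    decomp_step='step4')
swarm.create_imported_cases_jobs()
success = swarm.run()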
Example no. 5
class SplitCoDSwarm(object):

    _CODEDIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, source_id, proportion_ids, proportion_measure_id,
                 sex_ids, gbd_round_id, decomp_step, intermediate_dir, outdir,
                 project):
        self.source_id = source_id
        self.proportion_ids = proportion_ids
        self.proportion_measure_id = proportion_measure_id
        self.sex_ids = sex_ids
        self.gbd_round_id = gbd_round_id
        self.decomp_step = decomp_step
        self.intermediate_dir = intermediate_dir
        self.outdir = outdir

        time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

        self.workflow = Workflow(
            workflow_args=('split_cod_model_interpolate_{}_{}'.format(
                source_id, time)),
            name='Split CoD Model cause_id: {}'.format(source_id),
            project=project,
            stderr=outdir,
            stdout=outdir)

    def add_interpolate_tasks(self):
        for meid in self.proportion_ids:
            for sex in self.sex_ids:
                arglist = [
                    '--gbd_id', meid, '--proportion_measure_id',
                    self.proportion_measure_id, '--sex_id', sex,
                    '--gbd_round_id', self.gbd_round_id, '--intermediate_dir',
                    self.intermediate_dir
                ]
                if self.decomp_step:
                    arglist.extend(['--decomp_step', self.decomp_step])

                task = PythonTask(script=os.path.join(self._CODEDIR,
                                                      'split_interp.py'),
                                  args=arglist,
                                  name='split_model_interpolate_{}_{}'.format(
                                      meid, sex),
                                  num_cores=30,
                                  m_mem_free='60G',
                                  max_runtime_seconds=14400,
                                  max_attempts=10)
                self.workflow.add_task(task)

    def run(self):
        return self.workflow.run()
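A hedged usage sketch for SplitCoDSwarm; the argument values and project name are assumptions chosen only to show the call pattern:

# Hypothetical driver for SplitCoDSwarm (illustrative values only)
swarm = SplitCoDSwarm(
    source_id=294, proportion_ids=[1802, 1803], proportion_measure_id=18,
    sex_ids=[1, 2], gbd_round_id=6, decomp_step='step4',
    intermediate_dir='FILEPATH', outdir='FILEPATH',
    project='proj_codcorrect')
swarm.add_interpolate_tasks()
status = swarm.run()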
Example no. 6
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_malaria_{args.decomp_step}_{today_string}',
        name=f'anemia_malaria_{args.decomp_step}_{today_string}',
        description=
        f'Anemia: Malaria pre-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    # first submit the subtract clinical jobs
    subtract_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for loc in demo['location_id']:
        task = PythonTask(script="FILEPATH",
                          args=[
                              "--location_id", loc, "--gbd_round_id",
                              args.gbd_round_id, "--decomp_step",
                              args.decomp_step, "--out_dir", args.out_dir
                          ],
                          name=f"malaria_subtract_{loc}",
                          tag="malaria_subtract",
                          num_cores=2,
                          m_mem_free="8G",
                          max_attempts=3,
                          max_runtime_seconds=60 * 60 * 3,
                          queue='all.q')
        subtract_tasks.append(task)
    workflow.add_tasks(subtract_tasks)

    # once the new draws exist, save results
    for modelable_entity_id in [19390, 19394]:
        task = PythonTask(script="FILEPATH",
                          args=[
                              "--modelable_entity_id", modelable_entity_id,
                              "--gbd_round_id", args.gbd_round_id,
                              "--decomp_step", args.decomp_step, "--out_dir",
                              args.out_dir
                          ],
                          name=f"malaria_save_{modelable_entity_id}",
                          tag="malaria_save",
                          upstream_tasks=subtract_tasks,
                          num_cores=8,
                          m_mem_free="100G",
                          max_attempts=3,
                          max_runtime_seconds=60 * 60 * 24,
                          queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
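The snippet defines main() but no entry point; a conventional guard (an assumption, not shown in the source) would be:

if __name__ == '__main__':
    main()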
Example no. 7
    def __init__(self, source_id, proportion_ids, proportion_measure_id,
                 sex_ids, gbd_round_id, decomp_step, intermediate_dir, outdir,
                 project):
        self.source_id = source_id
        self.proportion_ids = proportion_ids
        self.proportion_measure_id = proportion_measure_id
        self.sex_ids = sex_ids
        self.gbd_round_id = gbd_round_id
        self.decomp_step = decomp_step
        self.intermediate_dir = intermediate_dir
        self.outdir = outdir

        time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

        self.workflow = Workflow(
            workflow_args=('split_cod_model_interpolate_{}_{}'.format(
                source_id, time)),
            name='Split CoD Model cause_id: {}'.format(source_id),
            project=project,
            stderr=outdir,
            stdout=outdir)
Example no. 8
    def build_jobmon_workflow(self, identifier=None, extra_arguments=None):
        """
        Returns jobmon workflow that represents cascade job dag.

        Args:
            identifier (str): A unique string to identify this workflow
                for JobMon. Running twice with the same string will restart
                a workflow.
            extra_arguments (List[str]): Command-line arguments to add to
                every UGE Job specified in Jobmon.
        Returns:
            jobmon.Workflow: With all Jobmon tasks created.
        """
        extra_arguments = extra_arguments if extra_arguments else list()
        cv_iters = None if not self.run_cv else list(range(11))

        demo = Demographics(self.mvid)
        lsvid = self.mvm.location_set_version_id.values[0]
        lt = loctree(
            location_set_id=demo.LOCATION_SET_ID,
            location_set_version_id=lsvid,
            gbd_round_id=demo.gbd_round_id)

        desc = self.mvm.description.values[0]

        jobdag = make_dag(
            mvid=self.mvid, loctree=lt, cv_iter=cv_iters,
            add_arguments=extra_arguments
        )

        env = settings['env_variables']['ENVIRONMENT_NAME']
        identifier = identifier if identifier else f"dismod_{self.mvid}_{env}"
        wf = Workflow(
            workflow_args=identifier,
            name=f"dismod_{self.mvid}_{env}",
            resume=True,
            description=desc,
            project=self.project,
            stderr=self.logdir,
            stdout=self.logdir,
            seconds_until_timeout=1210000)

        # since we're looping through the dict and mutating each JobNode
        # to contain a reference to a PythonTask, we require the jobdag dict
        # to be sorted such that we've already visited all upstream tasks of
        # any given node.
        for jobname, dagnode in jobdag.items():
            dagnode.add_job(wf, jobdag, self.mvm)

        return wf
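A hedged sketch of a call site for the method above; the variable names cascade and mvid are assumptions standing in for the owning object and its model version id:

# Hypothetical call site for build_jobmon_workflow (names assumed)
wf = cascade.build_jobmon_workflow(identifier=f'dismod_{mvid}_attempt_2')
exit_status = wf.run()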
Example no. 9
    def __init__(
            self,
            parameters: params.master.CoDCorrectParameters,
            resume: bool = False
    ):
        """
        Creates an instance of a FauxCorrect JobSwarm.

        Arguments:
            parameters (parameters.master.FauxCorrectParameters): instance of
                the FauxCorrect parameters that this job swarm will execute.
        """
        self.parameters: params.master.CoDCorrectParameters = parameters

        # Intuit the root code directory
        self.code_dir: str = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        # Create workflow object
        self.workflow: Workflow = Workflow(
            workflow_args=DAG.Workflow.CODCORRECT_WORKFLOW_ARGS.format(
                version_id=self.parameters.version_id,
                timestamp=datetime.datetime.now().isoformat()
            ),
            name=DAG.Workflow.CODCORRECT_NAME.format(
                version_id=self.parameters.version_id
            ),
            project=DAG.Workflow.PROJECT,
            stdout=os.path.join(
                self.parameters.parent_dir, FilePaths.LOG_DIR, FilePaths.STDOUT
            ),
            stderr=os.path.join(
                self.parameters.parent_dir, FilePaths.LOG_DIR, FilePaths.STDERR
            ),
            resume=resume,
            # 2 weeks (14*24*60*60)
            seconds_until_timeout=(1209600)
        )

        self.task_map: Dict[str, Dict[str, PythonTask]] = defaultdict(dict)
Example no. 10
class Swarm_binning(object):

    fips_dict = {
        'al': 1,
        'ak': 2,
        'ar': 5,
        'az': 4,
        'ca': 6,
        'co': 8,
        'ct': 9,
        'de': 10,
        'dc': 11,
        'fl': 12,
        'ga': 13,
        'hi': 15,
        'id': 16,
        'il': 17,
        'in': 18,
        'ia': 19,
        'ks': 20,
        'ky': 21,
        'la': 22,
        'me': 23,
        'md': 24,
        'ma': 25,
        'mi': 26,
        'mn': 27,
        'ms': 28,
        'mo': 29,
        'mt': 30,
        'ne': 31,
        'nv': 32,
        'nh': 33,
        'nj': 34,
        'nm': 35,
        'ny': 36,
        'nc': 37,
        'nd': 38,
        'oh': 39,
        'ok': 40,
        'or': 41,
        'pa': 42,
        'ri': 44,
        'sc': 45,
        'sd': 46,
        'tn': 47,
        'tx': 48,
        'ut': 49,
        'vt': 50,
        'va': 51,
        'wa': 53,
        'wv': 54,
        'wi': 55,
        'wy': 56,
        'pr': 72
    }

    def __init__(self, states):
        self.states = states
        self.county_dict = load_county_dicts(states)
        self.tract_dict = load_tract_dicts(states)
        self.wflow = Workflow(
            workflow_args='{}_bin_co_04'.format(states[0][:2]),
            name='count_zero_pop_tracts',
            project='proj_cost_effect',
            stderr='/ihme/scratch/users/{}/sgeoutput'.format(user),
            stdout='/ihme/scratch/users/{}/sgeoutput'.format(user),
            working_dir='/homes/{}'.format(user),
            seconds_until_timeout=len(self.states) * 60 * 10)

    def add_state_counties(self):

        interpreter = '/ihme/code/beatrixh/miniconda/envs/pyomo/bin/python'
        script = '/ihme/code/beatrixh/microsim_2020/census_2020/synthetic_pop/binning/count_bins_head.py'
        for state in self.states:
            counties = self.county_dict[state]  # grab counties per state
            for county in counties:
                # grab tracts per county
                tracts = self.tract_dict[(state, county)]
                rtime = len(tracts) * 2  # expected runtime
                rtime = rtime + 600  # buffer
                args = state + ' ' + str(county)
                cmd = interpreter + ' ' + script + ' ' + args
                task = BashTask(cmd,
                                name='bin_and_calc_n_hi_{}_{}'.format(
                                    state, county),
                                num_cores=1,
                                m_mem_free=10,
                                max_attempts=3,
                                max_runtime_seconds=rtime,
                                resource_scales={
                                    'm_mem_free': 0.3,
                                    'max_runtime_seconds': 2.0
                                },
                                queue='long.q')
                self.wflow.add_task(task)
                print("added {}".format(task.name))
Example no. 11
class Find_pop_zero_tracts(object):

    fips_dict = {
        'al': 1,
        'ak': 2,
        'ar': 5,
        'az': 4,
        'ca': 6,
        'co': 8,
        'ct': 9,
        'de': 10,
        'dc': 11,
        'fl': 12,
        'ga': 13,
        'hi': 15,
        'id': 16,
        'il': 17,
        'in': 18,
        'ia': 19,
        'ks': 20,
        'ky': 21,
        'la': 22,
        'me': 23,
        'md': 24,
        'ma': 25,
        'mi': 26,
        'mn': 27,
        'ms': 28,
        'mo': 29,
        'mt': 30,
        'ne': 31,
        'nv': 32,
        'nh': 33,
        'nj': 34,
        'nm': 35,
        'ny': 36,
        'nc': 37,
        'nd': 38,
        'oh': 39,
        'ok': 40,
        'or': 41,
        'pa': 42,
        'ri': 44,
        'sc': 45,
        'sd': 46,
        'tn': 47,
        'tx': 48,
        'ut': 49,
        'vt': 50,
        'va': 51,
        'wa': 53,
        'wv': 54,
        'wi': 55,
        'wy': 56,
        'pr': 72
    }

    def __init__(self, states):
        self.states = states

        self.wflow = Workflow(
            workflow_args='{}_find_zeros_04'.format(states[0][:2]),
            name='count_zero_pop_tracts',
            project='proj_cost_effect',
            stderr='/ihme/scratch/users/{}/sgeoutput'.format(user),
            stdout='/ihme/scratch/users/{}/sgeoutput'.format(user),
            working_dir='/homes/{}'.format(user),
            seconds_until_timeout=len(self.states) * 60 * 10)

    def add_states(self):

        interpreter = '/ihme/code/beatrixh/miniconda/envs/pyomo/bin/python'
        script = '/ihme/code/beatrixh/microsim_2020/census_2020/synthetic_pop/gen_synth_pop/identify_pop_zero_tracts.py'
        for state in self.states:
            args = state
            cmd = interpreter + ' ' + script + ' ' + args
            task = BashTask(cmd,
                            name='find_pop_zero_tracts_{}'.format(state),
                            num_cores=1,
                            m_mem_free=10,
                            max_attempts=3,
                            max_runtime_seconds=60 * 10,
                            resource_scales={
                                'm_mem_free': 0.3,
                                'max_runtime_seconds': 2.0
                            },
                            queue='all.q')
            self.wflow.add_task(task)
            print("added {}".format(task.name))
Example no. 12
    def generate_workflow(self, wf_name):
        wf = Workflow(workflow_args=wf_name,
                      project="proj_mortenvelope",
                      stdout=self.stdout,
                      stderr=self.stderr,
                      seconds_until_timeout=777600,
                      resume=True)

        self.create_directories(master_dir=self.master_dir,
                                subdirs=[
                                    'inputs', 'shock_numbers', 'hiv_adjust',
                                    'logs', 'upload', 'abridged_lt', 'full_lt'
                                ])
        self.create_directories(master_dir=self.reckoning_output_dir,
                                subdirs=[
                                    'lt_whiv', 'lt_hivdel', 'envelope_whiv',
                                    'envelope_hivdel'
                                ])
        self.create_directories(master_dir=self.full_lt_dir,
                                subdirs=['no_hiv', 'with_hiv', 'with_shock'])
        self.create_directories(master_dir=self.abridged_lt_dir,
                                subdirs=['no_hiv', 'with_hiv', 'with_shock'])
        self.create_directories(master_dir=self.upload_dir)
        self.create_directories(
            master_dir=self.log_dir,
            subdirs=[
                'full_with_hiv_mx_vs_no_hiv', 'full_shock_mx_vs_with_hiv',
                'abridged_with_hiv_mx_vs_no_hiv',
                'abridged_shock_mx_vs_with_hiv', 'abridged_no_hiv_qx_1_5',
                'abridged_with_hiv_qx_1_5', 'abridged_shock_qx_1_5',
                'shock_rate_compare', 'ax_compare'
            ])

        # Get locations
        most_detail_locations = call_mort_function("get_locations", {
            "level": "lowest",
            "gbd_year": self.gbd_year
        })
        most_detail_loc_ids = most_detail_locations.location_id.tolist()

        # Generate save inputs task
        # job name: agg_save_inputs_{}
        # script being ran: save_inputs.R
        save_inputs_task = self.generate_save_inputs_task(upstream_tasks=[])
        wf.add_task(save_inputs_task)

        # Generate full lt tasks
        # job name: gen_full_{loc}_{version}
        # script being ran: full_lt.R

        full_lt_tasks = {}
        for loc in most_detail_loc_ids:
            full_lt_tasks[loc] = self.generate_full_lt_task(
                upstream_tasks=[save_inputs_task], loc=loc)
            wf.add_task(full_lt_tasks[loc])

        # Run finalizer
        if self.run_finalizer:
            finalizer_run_task = self.generate_finalizer_task(
                upstream_tasks=full_lt_tasks.values())
            wf.add_task(finalizer_run_task)

        # Generate rest of full_lt tasks and add to the workflow
        # job names: "agg_full_{loc}_{lt_type}_{version}"
        # script being ran: aggregate_lts.R
        if self.aggregate_full_lts:
            # Get aggregate locations
            locations = call_mort_function("get_locations", {
                "level": "all",
                "gbd_year": self.gbd_year
            })
            agg_locations = locations[(locations.level == 3) & (
                ~locations.location_id.isin(most_detail_loc_ids))]
            agg_loc_ids = agg_locations.location_id.tolist()

            # Generate agg tasks
            agg_tasks = {}
            for loc in agg_loc_ids:
                num_children = len(most_detail_locations.loc[
                    most_detail_locations.path_to_top_parent.str.contains(
                        "," + str(loc) + ",")])
                for lt_type in ['with_shock', 'with_hiv', 'no_hiv']:
                    agg_task_key = str(loc) + "_" + lt_type
                    agg_tasks[agg_task_key] = self.generate_aggregate_lt_task(
                        upstream_tasks=full_lt_tasks.values(),
                        loc=loc,
                        lt_type=lt_type,
                        num_children=num_children)
                    wf.add_task(agg_tasks[agg_task_key])

            # Generate upload task
            # job name: full_life_table_upload_{}
            # script name: compile_upload_results.R
            if self.upload:
                upload_task = self.generate_full_upload_task(
                    upstream_tasks=agg_tasks.values())
                wf.add_task(upload_task)

        return wf
Example no. 13
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=f'anemia_causal_attribution_new_{args.decomp_step}_{today_string}',
        name=f'anemia_causal_attribution_{args.decomp_step}_{today_string}',
        description=f'Anemia: Causal attribution for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    causal_attribution_tasks = []
    demo = get_demographics("epi", gbd_round_id=args.gbd_round_id)
    for location_id in demo['location_id']:
        prev_year_task = None
        for year in args.year_id:
            cmd = (
                f'FILEPATH '
                f'FILEPATH '
                f'FILEPATH '
                f'{location_id} {year} {args.gbd_round_id} {args.decomp_step} '
                f'{path_to_directory}/ {args.out_dir}'
            )
            if prev_year_task:
                task = BashTask(
                    command=cmd,
                    name=f'causal_attribution_{location_id}_{year}',
                    tag='causal_attribution',
                    upstream_tasks=[prev_year_task],
                    num_cores=1,
                    m_mem_free='4G',
                    max_attempts=3,
                    max_runtime_seconds=60*60*2,
                    queue='all.q')
            else:
                task = BashTask(
                    command=cmd,
                    name=f'causal_attribution_{location_id}_{year}',
                    tag='causal_attribution',
                    num_cores=1,
                    m_mem_free='4G',
                    max_attempts=3,
                    max_runtime_seconds=60*60*2,
                    queue='all.q')
            causal_attribution_tasks.append(task)
            prev_year_task = task
    workflow.add_tasks(causal_attribution_tasks)

    # once the draws exist, save results
    meids = pd.read_excel("FILEPATH")
    meids = meids.filter(like='modelable_entity').values.flatten()
    for modelable_entity_id in meids.tolist():
        task = PythonTask(
            script="FILEPATH",
            args=[
                "--modelable_entity_id", modelable_entity_id,
                "--year_id", " ".join([str(yr) for yr in args.year_id]),
                "--gbd_round_id", args.gbd_round_id,
                "--decomp_step", args.decomp_step,
                "--save_dir", "FILEPATH"
            ],
            name=f"save_{modelable_entity_id}",
            tag="save",
            upstream_tasks=causal_attribution_tasks,
            num_cores=8,
            m_mem_free="100G",
            max_attempts=3,
            max_runtime_seconds=60*60*24,
            queue='all.q')
        workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
Example no. 14
class STGPRJobSwarm:
    def __init__(self, run_id, run_type, holdouts, draws, nparallel,
                 n_parameter_sets, cluster_project, error_log_path,
                 output_log_path, location_set_id, gbd_round_id, custom_stage1,
                 rake_logit, code_version, decomp_step, modelable_entity_id,
                 output_path):

        self.run_id = run_id
        self.run_type = run_type
        self.holdouts = holdouts
        self.draws = draws
        self.nparallel = nparallel
        self.n_parameter_sets = n_parameter_sets
        self.cluster_project = cluster_project
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.custom_stage1 = custom_stage1
        self.rake_logit = rake_logit
        self.code_version = code_version
        self.decomp_step = decomp_step
        self.output_path = output_path
        self.is_diet_model = modelable_entity_id in [
            2430, 2442, 2431, 2434, 2433, 2437, 2435, 2428, 2429, 2436, 2427,
            2432, 2440, 9804, 2441, 2544, 23766, 2438, 23604, 23683
        ]

        # default maximum number of task attempts
        self.max_attempts = 3

        # create workflow
        self.workflow = Workflow(workflow_args=f'stgpr_{self.run_id}',
                                 project=self.cluster_project,
                                 stderr=error_log_path,
                                 stdout=output_log_path,
                                 resume=True)

        # set up job lists
        self.stage1_jobs = {}
        self.st_jobs = {}
        self.descanso_jobs = {}
        self.gpr_jobs = {}
        self.post_jobs = {}
        self.rake_jobs = {}
        self.cleanup_jobs = {}
        self.eval_jobs = {}

        # set up conditionals
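        # Illustrative note (values assumed): with n_parameter_sets=40,
        # holdouts=5 and MAX_SUBMISSIONS=10, 'oos_selection' computes
        # split = floor(10 / 5) = 2, so np.array_split divides the 40
        # parameter sets into 2 groups of 20.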
        if self.run_type == 'in_sample_selection':
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), MAX_SUBMISSIONS)
        elif self.run_type == 'oos_selection':
            split = math.floor(float(MAX_SUBMISSIONS) / float(self.holdouts))
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), split)
        else:
            self.param_groups = np.array_split(
                list(range(0, self.n_parameter_sets)), 1)

    def prep_parallelization_groups(self):
        """Parallize by splitting locations
        to be modelled into *nparallel* groups.
        This function grabs the location hierarchy,
        identifies needed locations, and assigns
        each one to a parallelization group."""
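        # Illustrative note (assumption): with 12 modelling locations and
        # nparallel=5, np.array_split yields groups of sizes 3, 3, 2, 2, 2;
        # group sizes never differ by more than one.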
        session = ezfuncs.get_session(conn_def='epi')
        locs = query.get_locations(self.location_set_id, self.gbd_round_id,
                                   self.decomp_step, session)[0]
        self.locs = locs.sort_values(by=['level_{}'.format(NATIONAL_LEVEL)])
        self.parallel_groups = np.array_split(
            self.locs.loc[self.locs.level >= NATIONAL_LEVEL][SPACEVAR].values,
            self.nparallel)

        # prep raking upstreams and submission locations
        lvl = 'level_{}'.format(NATIONAL_LEVEL)
        self.subnat_locations = (locs.loc[locs.level > NATIONAL_LEVEL,
                                          lvl].unique().astype(int))

    def assign_rake_runtimes(self):
        """
        Rake jobs have immensely different times based
        almost exclusively on the number of subnationals nested
        within the national location. Assign memory based on the
        number of subnationals for each national location with
        subnationals, using (slightly modified)
        intercept and beta values from a  super simple
        linear regression,

        'memory ~ n_subnats'

        for one very data-dense model
        (ie a good upper bound for all st-gpr models)

        Memory Intercept: .75
        Memory Beta_N_subnats: .5

        (More conservative for runtime because not as
        wasteful to go high)
        Runtime Intercept: 5 (min), so 300 sec
        Runtime Beta_N_subnats: .35 (min), so 21 sec
        """

        natcol = f'level_{NATIONAL_LEVEL}'
        self.locs['subnat'] = \
            (self.locs['level'] > NATIONAL_LEVEL).astype(int)
        n_subnats = self.locs.groupby(natcol)['subnat'].sum()
        n_subnats = n_subnats.loc[n_subnats > 0].reset_index(name='N')
        n_subnats[natcol] = n_subnats[natcol].astype(int)

        # assign memory
        n_subnats['mem'] = .75 + .5 * n_subnats['N']
        n_subnats['runtime'] = 300 + 21 * n_subnats['N']
        n_subnats = n_subnats.rename(columns={natcol: 'location'})
        self.rake_memory_df = n_subnats.copy()

    def create_stage1_jobs(self):
        """First set of tasks, thus no upstream tasks.
        Only run stage1 if no custom stage1 (custom_stage1)
        estimates. """
        for ko in list(range(0, self.holdouts + 1)):

            # ie shell, script, and args pasted together
            model_root = os.path.join(paths.CODE_ROOT, 'model')
            cmd = (f'{RSHELL} -s {STAGE1_SCRIPT} '
                   f'{self.output_path} {model_root} {ko}')

            task = BashTask(command=cmd,
                            name=f'stage1_{self.run_id}_{ko}',
                            num_cores=1,
                            m_mem_free='3G',
                            max_attempts=2,
                            max_runtime_seconds=300,
                            tag='stgpr_stage1',
                            queue='all.q',
                            resource_scales=RESOURCE_SCALES,
                            hard_limits=True)

            self.workflow.add_task(task)
            self.stage1_jobs[task.name] = task

    def create_st_jobs(self):
        for ko in range(0, self.holdouts + 1):
            upstream_job = self.stage1_jobs['stage1_{}_{}'.format(
                self.run_id, ko)]
            for param_group in range(0, len(self.param_groups)):
                for loc_group in range(0, self.nparallel):

                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])
                    jname = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group, loc_group)

                    memory = 50
                    runtime = 1500
                    if self.is_diet_model:
                        memory = 120
                        runtime = 28800  # 8 hours

                    task = PythonTask(script=ST_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.run_type, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=6,
                                      m_mem_free=f'{memory}G',
                                      max_attempts=3,
                                      max_runtime_seconds=runtime,
                                      tag='stgpr_spacetime',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.st_jobs[task.name] = task

    def create_descanso_jobs(self):
        """Depends on aggregate locations coming out of loc agg jobs"""
        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):

                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                runtime = 3600 if self.is_diet_model else 300
                task = PythonTask(
                    script=IM_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.draws,
                        self.nparallel, submit_params
                    ],
                    name=f'descanso_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    # memory upped from 5G to 20G for variance simulation
                    m_mem_free='20G',
                    max_runtime_seconds=runtime,
                    max_attempts=2,
                    tag='stgpr_amp_nsv',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add ST upstreams
                for loc_group in list(range(0, self.nparallel)):
                    st_label = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                       param_group, loc_group)
                    upstream_job = self.st_jobs[st_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.descanso_jobs[task.name] = task

    def create_gpr_jobs(self):
        # set runtime and memory based on draws
        gpr_runtime = 1200
        gpr_memory = 4
        if self.draws == 100:
            gpr_runtime = 1500
            gpr_memory = 7
        elif self.draws == 1000:
            gpr_runtime = 1800
            gpr_memory = 10

        if self.is_diet_model:
            gpr_runtime *= 3
            gpr_memory *= 3

        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                upstream_job = self.descanso_jobs['descanso_{}_{}_{}'.format(
                    self.run_id, ko, param_group)]
                for loc_group in list(range(0, self.nparallel)):

                    submit_params = ','.join(
                        [str(x) for x in self.param_groups[param_group]])

                    jname = 'gpr_{}_{}_{}_{}'.format(self.run_id, ko,
                                                     param_group, loc_group)

                    task = PythonTask(script=GPR_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.draws, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=1,
                                      m_mem_free=f'{gpr_memory}G',
                                      max_runtime_seconds=gpr_runtime,
                                      max_attempts=2,
                                      tag='stgpr_gpr',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.gpr_jobs[task.name] = task

    def create_rake_jobs(self):
        """Depends on GPR jobs including all the subnationals
        and national locations for each
        rake job, parallelized out by parent_id.
        Raking only done on the first KO (KO 0),
        which does not hold out any data from the dataset."""
        for loc in self.subnat_locations:
            mem = int(
                np.ceil(
                    self.rake_memory_df.query(f'location == {loc}')
                    ['mem'].iat[0]))

            rt = int(
                np.ceil(
                    self.rake_memory_df.query(f'location == {loc}')
                    ['runtime'].iat[0]))

            if self.draws == 1000:
                mem *= 2
                rt *= 3
                rt = max(rt, 7200)

            if self.is_diet_model:
                mem *= 2
                rt *= 3
                rt = max(rt, 14400)

            task = PythonTask(script=RAKE_SCRIPT,
                              args=[
                                  self.run_id, self.output_path, 0, self.draws,
                                  self.run_type, self.rake_logit, loc
                              ],
                              name=f'rake_{self.run_id}_{loc}',
                              num_cores=1,
                              m_mem_free=f'{mem}G',
                              max_runtime_seconds=rt,
                              max_attempts=2,
                              tag='stgpr_rake',
                              queue='all.q',
                              resource_scales=RESOURCE_SCALES,
                              hard_limits=True)

            # grab all subnationals and country location_ids associated with a country
            lvl = 'level_{}'.format(NATIONAL_LEVEL)
            all_needed_locs = self.locs.loc[self.locs[lvl] == loc,
                                            'location_id'].unique()

            # add each gpr job containing a needed national/subnational
            # for raking to upstreams
            if self.holdouts == 0:
                for param_group in list(range(0, len(self.param_groups))):
                    for loc_group in list(range(0, self.nparallel)):
                        loc_group_vals = self.parallel_groups[loc_group]

                        common_elements = len(
                            intersection(all_needed_locs.tolist(),
                                         loc_group_vals.tolist()))
                        if common_elements > 0:
                            task.add_upstream(
                                self.gpr_jobs['gpr_{}_0_{}_{}'.format(
                                    self.run_id, param_group, loc_group)])
            else:
                task.add_upstream(self.eval_jobs['eval_{}'.format(
                    self.run_id)])

            self.workflow.add_task(task)
            self.rake_jobs[task.name] = task

    def create_post_jobs(self):
        """Depends on rake jobs. Calculates fit stats
        and cleans up file folders, no mas."""

        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):

                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                task = PythonTask(
                    script=POST_SCRIPT,
                    args=[
                        self.run_id, self.output_path, ko, self.run_type,
                        self.holdouts, submit_params
                    ],
                    name=f'post_{self.run_id}_{ko}_{param_group}',
                    num_cores=1,
                    m_mem_free='2G',
                    max_runtime_seconds=300,
                    max_attempts=2,
                    tag='stgpr_post',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # add ST upstreams
                for loc_group in list(range(0, self.nparallel)):
                    gp_label = 'gpr_{}_{}_{}_{}'.format(
                        self.run_id, ko, param_group, loc_group)
                    upstream_job = self.gpr_jobs[gp_label]
                    task.add_upstream(upstream_job)

                self.workflow.add_task(task)
                self.post_jobs[task.name] = task

    def create_cleanup_jobs(self):
        """Saves rake summaries and removes
        tempfiles no longer needed"""
        runtime = 600 if self.draws == 0 else 7200

        for ko in list(range(0, self.holdouts + 1)):
            task = PythonTask(script=CLEANUP_SCRIPT,
                              args=[
                                  self.run_id, self.output_path, self.run_type,
                                  ko, self.draws
                              ],
                              name=f'clean_{self.run_id}_{ko}',
                              num_cores=1,
                              m_mem_free='1G',
                              max_runtime_seconds=runtime,
                              max_attempts=1,
                              tag='stgpr_clean',
                              queue='all.q')

            if ko == 0:
                for loc in self.subnat_locations:
                    task.add_upstream(self.rake_jobs['rake_{}_{}'.format(
                        self.run_id, loc)])
            else:
                task.add_upstream(self.eval_jobs['eval_{}'.format(
                    self.run_id)])

            self.workflow.add_task(task)
            self.cleanup_jobs[task.name] = task

    def create_eval_jobs(self):
        """
        For hyperparameter selection runs only, determine best hyperparameter
        set based on in-sample or out-of-sample RMSE.
        - run_type = "in_sample_selection"
        - run_type = "oos_selection"

        For runs with only one set of parameters, set the best_param_set
        to the *only* param_set (param_set 0) for consistency in rake inputs.

        Lastly, just collect the disparate fit_stats files and combine into
        a single file, saved as fit_stats.csv for all run types
        """

        task = PythonTask(script=EVAL_SCRIPT,
                          args=[
                              self.run_id, self.output_path, self.run_type,
                              self.holdouts, self.n_parameter_sets
                          ],
                          name=f'eval_{self.run_id}',
                          num_cores=1,
                          m_mem_free='500M',
                          max_runtime_seconds=180,
                          max_attempts=2,
                          tag='stgpr_eval',
                          queue='all.q',
                          resource_scales=RESOURCE_SCALES,
                          hard_limits=True)

        for ko in list(range(0, self.holdouts + 1)):
            for param_group in list(range(0, len(self.param_groups))):
                post_label = 'post_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group)
                task.add_upstream(self.post_jobs[post_label])

        self.workflow.add_task(task)
        self.eval_jobs[task.name] = task

    def run(self):

        # run main model estimation pipeline
        self.prep_parallelization_groups()
        self.create_stage1_jobs()
        self.create_st_jobs()
        self.create_descanso_jobs()
        self.create_gpr_jobs()
        self.create_post_jobs()

        # choose best parameter set to run rake for
        self.create_eval_jobs()

        # run rake/aggregation step and clean outputs
        self.assign_rake_runtimes()
        self.create_rake_jobs()
        self.create_cleanup_jobs()

        status = self.workflow.run()
        print(f'Workflow finished with status {status}')
        return status
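A hedged end-to-end sketch for STGPRJobSwarm; every argument value below is an assumption chosen only to show the expected shapes of the inputs:

# Hypothetical instantiation of STGPRJobSwarm (illustrative values only)
swarm = STGPRJobSwarm(
    run_id=1234, run_type='oos_selection', holdouts=5, draws=100,
    nparallel=50, n_parameter_sets=40, cluster_project='proj_stgpr',
    error_log_path='FILEPATH', output_log_path='FILEPATH',
    location_set_id=22, gbd_round_id=6, custom_stage1=None,
    rake_logit=0, code_version='v1', decomp_step='step4',
    modelable_entity_id=2548, output_path='FILEPATH')
status = swarm.run()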
Example no. 15
class EpicWorkFlow(object):

    CODE_DIR = os.path.dirname(os.path.realpath(__file__))
    USERNAME = getpass.getuser()
    DATA_DIR = "FILEPATH"
    LOG_DIR = os.path.join('PATH', USERNAME)
    YEAR_IDS = [1990, 1995, 2000, 2005, 2010, 2015, 2017, 2019]
    N_DRAWS = 1000

    def __init__(self, version, mapbuilder, decomp_step, gbd_round_id, resume):

        # validate decomp_step
        validate_decomp_step("EPIC", decomp_step, gbd_round_id)

        self.DATA_DIR = os.path.join(self.DATA_DIR, str(version))
        if not os.path.exists(self.DATA_DIR):
            os.makedirs(self.DATA_DIR)
            os.makedirs(os.path.join(self.DATA_DIR, FilePaths.INPUT_FILES_DIR))

        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id
        self.resume = resume

        # create epic json map
        self.emap = mapbuilder.downstream_only("como")

        # instantiate the factories
        self._task_registry = {}
        self._sev_split_fac = SevSplitTaskFactory(self._task_registry)
        self._ex_adjust_fac = ExAdjustFactory(self._task_registry)
        self._super_squeeze_fac = SuperSqueezeFactory(self._task_registry)
        self._save_fac = SaveFactory()

        self.workflow = Workflow(
            workflow_args="epic_v{version}".format(version=version),
            name="EPIC Central Machinery",
            project=DAG.Tasks.PROJECT,
            stdout=os.path.join(self.LOG_DIR, "output"),
            stderr=os.path.join(self.LOG_DIR, "errors"),
            resume=resume,
            seconds_until_timeout=435600)

        if not resume:
            # Save best input models as csv for posting to EPIC tracker HUB
            # page then separate into individual json files for use in
            # downstream scripts. Take care that downstream processes do not
            # pick up a model_version_id from a previous run. Only
            # collect the best models once per run so we know exactly what
            # was a available at the start of the run and what was
            # consequently used in the rest of the workflow
            best_models = mapbuilder.best_models
            inputs = [int(x) for x in mapbuilder.inputs]
            best_models = best_models.loc[best_models[
                Params.MODELABLE_ENTITY_ID].isin(inputs)]
            best_models.to_csv(
                os.path.join(self.DATA_DIR, FilePaths.INPUT_FILES_DIR,
                             FilePaths.BEST_MODELS_FILE_PATTERN),
                index=False,
                encoding="utf8")
            for index, row in best_models.iterrows():
                SaveFactory.save_model_metadata(self.DATA_DIR,
                                                row.modelable_entity_id,
                                                row.model_version_id,
                                                row.decomp_step)

        self._task_map = {
            DAG.Tasks.SPLIT: self._add_sev_split_task,
            DAG.Tasks.SUPER_SQUEEZE: self._add_super_squeeze_task,
            DAG.Tasks.EX_ADJUST: self._add_ex_adjust_task
        }

        # run every process in the pipeline regardless of whether or not
        # there is already a model saved
        self.pgraph = mapbuilder.P

        # get process nodes and build out jobmon workflow
        # create a subgraph from the process nodes

        top_sort = nx.topological_sort(self.pgraph)

        for node in top_sort:
            if node == mapbuilder.start_node:
                pass
            elif DAG.Tasks.SPLIT in node:
                self._task_map[DAG.Tasks.SPLIT](node)
            elif DAG.Tasks.SUPER_SQUEEZE in node:
                self._task_map[DAG.Tasks.SUPER_SQUEEZE](node)
            else:
                self._task_map[DAG.Tasks.EX_ADJUST](node)

    def _create_output_directories(self, meid_list):
        for meid in meid_list:
            directory = os.path.join(self.DATA_DIR, str(meid))

            if os.path.exists(directory) and not self.resume:
                shutil.rmtree(directory)
                os.makedirs(directory)
            elif os.path.exists(directory) and self.resume:
                logging.info(
                    f"Directory exists for modelable_entity_id {meid} "
                    f"and resume is {self.resume}. Do not delete anything. "
                    f"Continue workflow.")
            else:
                os.makedirs(directory)

    def _add_sev_split_task(self, node):
        logging.info(f"Adding {node} task")
        split_map = self.emap[node]
        split_id = int(split_map["kwargs"]["split_id"])
        split_meta = SeverityPropMetadata(split_id=split_id,
                                          decomp_step=self.decomp_step,
                                          gbd_round_id=self.gbd_round_id)
        split_version_id = split_meta.best_version
        meta_version = split_meta.get_metadata_version(split_version_id)
        parent_meid = int(meta_version.parent_meid())
        children_meids = [int(x) for x in meta_version.child_meid().split(",")]

        # make output directories
        self._create_output_directories(children_meids)

        split_task = self._sev_split_fac.get_task(
            node=node,
            process_graph=self.pgraph,
            split_version_id=split_version_id,
            output_dir=self.DATA_DIR,
            decomp_step=self.decomp_step,
            year_id=self.YEAR_IDS,
            n_draws=self.N_DRAWS)
        self.workflow.add_task(split_task)
        self._task_registry[SevSplitTaskFactory.get_task_name(
            node)] = split_task

        description = (
            f"Central_severity_split_{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        for meid in children_meids:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]
            self._add_save_task(meid, "{location_id}.h5", description,
                                measure_id, self.YEAR_IDS, self.N_DRAWS,
                                split_task)

    def _add_save_task(self, meid, input_file_pattern, description, measure_id,
                       year_id, n_draws, upstream_task):
        logging.info(f"Adding {meid} save task")
        args = {
            Params.PARENT_DIR: self.DATA_DIR,
            Params.INPUT_DIR: os.path.join(self.DATA_DIR, str(meid)),
            Params.INPUT_FILE_PATTERN: input_file_pattern,
            Params.MODELABLE_ENTITY_ID: meid,
            Params.DESCRIPTION: description,
            Params.MEASURE_ID: measure_id,
            Params.YEAR_ID: year_id,
            Params.DECOMP_STEP: self.decomp_step,
            Params.N_DRAWS: n_draws
        }
        save_task = self._save_fac.get_task(**args)

        for upt in list(np.atleast_1d(upstream_task)):
            save_task.add_upstream(upt)

        self.workflow.add_task(save_task)
        self._task_registry[SaveFactory.get_task_name(meid)] = save_task

    def _add_ex_adjust_task(self, node):
        logging.info(f"Adding {node} task")
        # compile submission arguments
        kwargs = self.emap[node]["kwargs"]
        try:
            copy_env_inc = kwargs.pop("copy_env_inc")
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]
        except KeyError:
            copy_env_inc = False
            measure_id = [gbd.measures.PREVALENCE]

        # make output directories
        self._create_output_directories(self.pgraph.nodes[node]["outs"])

        ex_adj_task = self._ex_adjust_fac.get_task(
            node=node,
            process_graph=self.pgraph,
            output_dir=self.DATA_DIR,
            decomp_step=self.decomp_step,
            year_id=self.YEAR_IDS,
            n_draws=self.N_DRAWS)
        self.workflow.add_task(ex_adj_task)
        self._task_registry[ExAdjustFactory.get_task_name(node)] = ex_adj_task

        description = (f"Exclusivity_adjustment_auto_mark_"
                       f"{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        for meid in self.pgraph.nodes[node]["outs"]:
            self._add_save_task(meid, "{location_id}.h5", description,
                                measure_id, self.YEAR_IDS, self.N_DRAWS,
                                ex_adj_task)

    def _add_super_squeeze_task(self, node):
        logging.info(f"Adding {node} task")

        # make output directories
        self._create_output_directories(self.pgraph.nodes[node]["outs"])

        # get dependency_list before parallelizing since the
        # dependencies are the same for each parallelized demographic
        dep_list = get_dependencies(node, self.pgraph, self._task_registry)

        epi_demo = get_demographics("epi", gbd_round_id=self.gbd_round_id)
        for location_id in epi_demo[Params.LOCATION_ID]:
            for year_id in self.YEAR_IDS:
                for sex_id in epi_demo[Params.SEX_ID]:
                    ss_task = self._super_squeeze_fac.get_task(
                        node=node,
                        output_dir=self.DATA_DIR,
                        location_id=location_id,
                        year_id=year_id,
                        sex_id=sex_id,
                        decomp_step=self.decomp_step,
                        n_draws=self.N_DRAWS,
                        dependency_list=dep_list)
                    self.workflow.add_task(ss_task)
                    self._task_registry[SuperSqueezeFactory.get_task_name(
                        node, location_id, year_id, sex_id)] = ss_task

        ss_upstream = [
            self._task_registry[t] for t in list(self._task_registry.keys())
            if DAG.Tasks.SUPER_SQUEEZE in t
        ]
        description = (
            f"Super_Squeeze_auto_mark_{Params.DESCRIPTION_MAP[self.N_DRAWS]}")
        measure_id = [gbd.measures.PREVALENCE]
        for meid in self.pgraph.nodes[node]["outs"]:
            self._add_save_task(
                meid, "{location_id}/{measure_id}_{year_id}_{sex_id}.h5",
                description, measure_id, self.YEAR_IDS, self.N_DRAWS,
                ss_upstream)
Example no. 16
def main() -> None:
    args = parse_args()
    user = getpass.getuser()
    today_string = datetime.date.today().strftime('%m%d%y')
    workflow = Workflow(
        workflow_args=
        f'anemia_post_interp_temp_{args.decomp_step}_{today_string}',
        name=f'anemia_post_{args.decomp_step}_{today_string}',
        description=f'Anemia: Post-processing for decomp {args.decomp_step}',
        project="proj_anemia",
        stderr="FILEPATH",
        stdout="FILEPATH",
        working_dir=path_to_directory,
        resume=True)

    anemia_causes = ('hiv', 'pud', 'gastritis', 'esrd_dialysis', 'ckd3',
                     'ckd4', 'ckd5', 'cirrhosis')

    for anemia_cause in anemia_causes:

        # load in the info table
        info_df = pd.read_csv("FILEPATH")
        new_me_id_list = info_df['proportion_me'].tolist()

        # submit compute job for each me_id
        compute_prop_tasks = []
        for year in args.year_id:
            task = PythonTask(script="FILEPATH",
                              args=[
                                  "--year_id", year,
                                  "--anemia_cause", anemia_cause,
                                  "--gbd_round_id", args.gbd_round_id,
                                  "--decomp_step", args.decomp_step,
                                  "--out_dir", args.out_dir
                              ],
                              name=f"make_{anemia_cause}_props_{year}",
                              tag="compute_props",
                              num_cores=1,
                              m_mem_free="12G",
                              max_attempts=3,
                              max_runtime_seconds=60 * 60 * 2,
                              queue='all.q')
            compute_prop_tasks.append(task)
        workflow.add_tasks(compute_prop_tasks)

        # submit save result jobs after compute jobs finish
        for new_me_id in new_me_id_list:
            task = PythonTask(script="FILEPATH",
                              args=[
                                  "--modelable_entity_id", new_me_id,
                                  "--year_ids",
                                  " ".join(str(yr) for yr in args.year_id),
                                  "--gbd_round_id", args.gbd_round_id,
                                  "--decomp_step", args.decomp_step,
                                  "--save_dir", 'FILEPATH'
                              ],
                              name=f"save_props_{new_me_id}",
                              tag="save_props",
                              upstream_tasks=compute_prop_tasks,
                              num_cores=8,
                              m_mem_free="90G",
                              max_attempts=3,
                              max_runtime_seconds=60 * 60 * 8,
                              queue='long.q')
            workflow.add_task(task)

    status = workflow.run()
    print(f'Workflow finished with status {status}')
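
parse_args() is referenced by main() but not shown in this example. Below is a hypothetical sketch of a parser that would supply the attributes main() reads (decomp_step, gbd_round_id, year_id, out_dir); the argument names are inferred from the usage above, not taken from the original script.

# Hypothetical parser; argument names are inferred from how args is used in
# main() and may differ from the original script.
import argparse

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Anemia post-processing")
    parser.add_argument("--decomp_step", type=str, required=True)
    parser.add_argument("--gbd_round_id", type=int, required=True)
    parser.add_argument("--year_id", type=int, nargs="+", required=True,
                        help="one or more GBD year_ids")
    parser.add_argument("--out_dir", type=str, required=True)
    return parser.parse_args()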
Example no. 17
    def __init__(self, version, mapbuilder, decomp_step, gbd_round_id, resume):

        # validate decomp_step
        validate_decomp_step("EPIC", decomp_step, gbd_round_id)

        self.DATA_DIR = os.path.join(self.DATA_DIR, str(version))
        if not os.path.exists(self.DATA_DIR):
            os.makedirs(self.DATA_DIR)
            os.makedirs(os.path.join(self.DATA_DIR, FilePaths.INPUT_FILES_DIR))

        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id
        self.resume = resume

        # create epic json map
        self.emap = mapbuilder.downstream_only("como")

        # instantiate the factories
        self._task_registry = {}
        self._sev_split_fac = SevSplitTaskFactory(self._task_registry)
        self._ex_adjust_fac = ExAdjustFactory(self._task_registry)
        self._super_squeeze_fac = SuperSqueezeFactory(self._task_registry)
        self._save_fac = SaveFactory()

        self.workflow = Workflow(
            workflow_args="epic_v{version}".format(version=version),
            name="EPIC Central Machinery",
            project=DAG.Tasks.PROJECT,
            stdout=os.path.join(self.LOG_DIR, "output"),
            stderr=os.path.join(self.LOG_DIR, "errors"),
            resume=resume,
            seconds_until_timeout=435600)

        if not resume:
            # Save best input models as csv for posting to EPIC tracker HUB
            # page then separate into individual json files for use in
            # downstream scripts. Take care that downstream processes do not
            # pick up a model_version_id from a previous run. Only
            # collect the best models once per run so we know exactly what
            # was available at the start of the run and what was consequently
            # used in the rest of the workflow.
            best_models = mapbuilder.best_models
            inputs = [int(x) for x in mapbuilder.inputs]
            best_models = best_models.loc[best_models[
                Params.MODELABLE_ENTITY_ID].isin(inputs)]
            best_models.to_csv(
                os.path.join(self.DATA_DIR, FilePaths.INPUT_FILES_DIR,
                             FilePaths.BEST_MODELS_FILE_PATTERN),
                index=False,
                encoding="utf8")
            for index, row in best_models.iterrows():
                SaveFactory.save_model_metadata(self.DATA_DIR,
                                                row.modelable_entity_id,
                                                row.model_version_id,
                                                row.decomp_step)

        self._task_map = {
            DAG.Tasks.SPLIT: self._add_sev_split_task,
            DAG.Tasks.SUPER_SQUEEZE: self._add_super_squeeze_task,
            DAG.Tasks.EX_ADJUST: self._add_ex_adjust_task
        }

        # run every process in the pipeline regardless of whether or not
        # there is already a model saved
        self.pgraph = mapbuilder.P

        # walk the process nodes in topological order and build out the
        # jobmon workflow, routing each node to the matching task factory

        top_sort = nx.topological_sort(self.pgraph)

        for node in top_sort:
            if node == mapbuilder.start_node:
                pass
            elif DAG.Tasks.SPLIT in node:
                self._task_map[DAG.Tasks.SPLIT](node)
            elif DAG.Tasks.SUPER_SQUEEZE in node:
                self._task_map[DAG.Tasks.SUPER_SQUEEZE](node)
            else:
                self._task_map[DAG.Tasks.EX_ADJUST](node)
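
The constructor above walks the process graph in topological order and routes each node to a task factory based on a substring of the node name. Below is a self-contained sketch of that dispatch pattern with networkx; the node names and print-based builders are illustrative only.

# Illustrative only: toy node names and print-based builders stand in for the
# EPIC task factories.
import networkx as nx

process_graph = nx.DiGraph()
process_graph.add_edges_from([
    ("start", "split_1234"),
    ("split_1234", "ex_adjust_5678"),
    ("ex_adjust_5678", "super_squeeze_9012"),
])

task_map = {
    "split": lambda n: print(f"adding severity split task for {n}"),
    "super_squeeze": lambda n: print(f"adding super squeeze task for {n}"),
    "ex_adjust": lambda n: print(f"adding exclusivity adjust task for {n}"),
}

for node in nx.topological_sort(process_graph):
    if node == "start":
        continue
    elif "split" in node:
        task_map["split"](node)
    elif "super_squeeze" in node:
        task_map["super_squeeze"](node)
    else:
        task_map["ex_adjust"](node)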
Example no. 18
    def generate_workflow(self, wf_name, run_mv):
        wf = Workflow(workflow_args=wf_name,
                      project="proj_mortenvelope",
                      stdout=self.stdout,
                      stderr=self.stderr,
                      resume=True,
                      seconds_until_timeout=174000)

        model_locations = call_mort_function("get_locations", {
            "gbd_type": "ap_old",
            "level": "estimate",
            "gbd_year": self.gbd_year
        })
        model_locations = model_locations["ihme_loc_id"].tolist()

        lt_task = self.generate_empirical_lt_prep_task(upstream_tasks=[])
        wf.add_task(lt_task)

        # optional machine vision prediction tasks
        if run_mv:

            mv_plot_task = {}
            for ihme_loc_id in model_locations:
                mv_plot_task[ihme_loc_id] = self.generate_mv_plots_task(
                    [lt_task], ihme_loc_id)
                wf.add_task(mv_plot_task[ihme_loc_id])

            mv_run_task = {}
            for ihme_loc_id in model_locations:
                mv_run_task[ihme_loc_id] = self.generate_run_mv_task(
                    mv_plot_task.values(), ihme_loc_id)
                wf.add_task(mv_run_task[ihme_loc_id])

            select_lts_task = self.generate_select_lts_task(
                mv_run_task.values(), run_mv)
            wf.add_task(select_lts_task)

        else:

            select_lts_task = self.generate_select_lts_task([lt_task], run_mv)
            wf.add_task(select_lts_task)

        return wf
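
When run_mv is enabled, the life-table selection step waits on every machine-vision run; otherwise it waits only on the prep task. Below is a minimal sketch of that conditional wiring, with a stand-in Task class instead of the jobmon one; the location codes are placeholders.

# Stand-in Task class that only records its upstream dependencies.
class Task:
    def __init__(self, name, upstream_tasks=()):
        self.name = name
        self.upstream_tasks = list(upstream_tasks)

lt_task = Task("empirical_lt_prep")
run_mv = True
model_locations = ["USA", "KEN"]

if run_mv:
    mv_run_task = {loc: Task(f"mv_run_{loc}", [lt_task])
                   for loc in model_locations}
    select_lts_task = Task("select_lts", mv_run_task.values())
else:
    select_lts_task = Task("select_lts", [lt_task])

print([t.name for t in select_lts_task.upstream_tasks])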
Example no. 19
    def generate_workflow(self, wf_name):
        wf = Workflow(workflow_args=wf_name,
                      project="proj_mortenvelope",
                      stdout=self.stdout,
                      stderr=self.stderr,
                      resume=True,
                      seconds_until_timeout=864000)

        # Create folder structure
        self.create_directories()

        # Save location metadata files if they haven't been saved yet
        if (not os.path.isfile(os.path.join(self.input_dir, "lt_match_map.csv"))
                or not os.path.isfile(
                    os.path.join(self.input_dir, "lt_env_locations.csv"))):
            self.save_location_metadata()

        # Step 1: prep input data
        prep_task = self.generate_prep_task([])
        wf.add_task(prep_task)

        run_countries = pd.read_csv(
            self.input_dir + "/lt_match_map.csv")["ihme_loc_id"].tolist()

        # Step 2: generate life tables
        gen_lt_tasks = []
        for country in run_countries:
            country_task = self.generate_gen_lt_task([prep_task], country)
            wf.add_task(country_task)
            gen_lt_tasks.append(country_task)

        # Step 3: scale results
        scaling_tasks = []
        for year in self.run_years:
            year_task = self.generate_scaling_task(gen_lt_tasks, year)
            wf.add_task(year_task)
            scaling_tasks.append(year_task)

        # Step 4: Compile upload
        compile_task = self.generate_compile_upload_task(scaling_tasks)
        wf.add_task(compile_task)

        if self.send_slack:
            notify_task = self.generate_notify_task([compile_task])
            wf.add_task(notify_task)

        return wf
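
The dependency shape in this last example is a fan-out over countries followed by a per-year fan-in and a final compile step. Below is a compact sketch of that wiring using the same kind of stand-in Task class as in the previous sketch (not the jobmon API); the countries and years are placeholders.

# Stand-in Task class that only records its upstream dependencies.
class Task:
    def __init__(self, name, upstream_tasks=()):
        self.name = name
        self.upstream_tasks = list(upstream_tasks)

prep_task = Task("prep_input_data")

# Step 2: one life-table task per country, all downstream of prep.
gen_lt_tasks = [Task(f"gen_lt_{c}", [prep_task]) for c in ("USA", "KEN", "BRA")]

# Step 3: one scaling task per year, each waiting on every country task.
scaling_tasks = [Task(f"scale_{y}", gen_lt_tasks) for y in (2019, 2020)]

# Step 4: compile/upload waits on all scaling tasks.
compile_task = Task("compile_upload", scaling_tasks)

print(len(compile_task.upstream_tasks))  # 2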