Esempio n. 1
0
    def create_cleanup_jobs(self):
        """Register one cleanup task per holdout.

        Every task runs CLEANUP_SCRIPT. The task for holdout 0 waits on the
        rake job of each subnational location; the tasks for all other
        holdouts wait on the eval job. Per the original description, the
        script saves rake summaries and removes temporary files that are no
        longer needed. Created tasks are added to the workflow and stored in
        ``self.cleanup_jobs`` under their task name.
        """
        # Runs with zero draws get a much shorter runtime limit.
        if self.draws == 0:
            max_seconds = 600
        else:
            max_seconds = 7200

        for holdout in range(self.holdouts + 1):
            script_args = [
                self.run_id, self.output_path, self.run_type, holdout,
                self.draws,
            ]
            cleanup_task = PythonTask(
                script=CLEANUP_SCRIPT,
                args=script_args,
                name=f'clean_{self.run_id}_{holdout}',
                num_cores=1,
                m_mem_free='1G',
                max_runtime_seconds=max_seconds,
                max_attempts=1,
                tag='stgpr_clean',
                queue='all.q')

            if ko_is_holdout := (holdout != 0):
                # Holdout runs only need the eval job to have finished.
                eval_name = f'eval_{self.run_id}'
                cleanup_task.add_upstream(self.eval_jobs[eval_name])
            else:
                # The full-data run waits on every subnational rake job.
                for location in self.subnat_locations:
                    rake_name = f'rake_{self.run_id}_{location}'
                    cleanup_task.add_upstream(self.rake_jobs[rake_name])

            self.workflow.add_task(cleanup_task)
            self.cleanup_jobs[cleanup_task.name] = cleanup_task
Esempio n. 2
0
    def create_post_jobs(self):
        """Register one post task per holdout and parameter group.

        Every task runs POST_SCRIPT and waits on all GPR jobs (one per
        location group) that share its holdout and parameter group. Per the
        original description, the script calculates fit statistics and
        cleans up file folders. Created tasks are added to the workflow and
        stored in ``self.post_jobs`` under their task name.
        """
        for holdout in range(self.holdouts + 1):
            for group_idx in range(len(self.param_groups)):
                # Comma-separated members of this parameter group.
                group_members = self.param_groups[group_idx]
                submit_params = ','.join(map(str, group_members))

                post_task = PythonTask(
                    script=POST_SCRIPT,
                    args=[
                        self.run_id, self.output_path, holdout,
                        self.run_type, self.holdouts, submit_params,
                    ],
                    name=f'post_{self.run_id}_{holdout}_{group_idx}',
                    num_cores=1,
                    m_mem_free='2G',
                    max_runtime_seconds=300,
                    max_attempts=2,
                    tag='stgpr_post',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # Wait on the GPR job of every location group.
                for loc_group in range(self.nparallel):
                    gpr_name = (f'gpr_{self.run_id}_{holdout}_'
                                f'{group_idx}_{loc_group}')
                    post_task.add_upstream(self.gpr_jobs[gpr_name])

                self.workflow.add_task(post_task)
                self.post_jobs[post_task.name] = post_task
Esempio n. 3
0
    def create_descanso_jobs(self):
        """Register one descanso task per holdout and parameter group.

        Every task runs IM_SCRIPT and waits on all spacetime jobs (one per
        location group) that share its holdout and parameter group. Created
        tasks are added to the workflow and stored in ``self.descanso_jobs``
        under their task name.
        """
        for holdout in range(self.holdouts + 1):
            for group_idx in range(len(self.param_groups)):
                # Comma-separated members of this parameter group.
                group_members = self.param_groups[group_idx]
                submit_params = ','.join(map(str, group_members))

                # Diet models get a much longer runtime limit.
                if self.is_diet_model:
                    max_seconds = 3600
                else:
                    max_seconds = 300

                descanso_task = PythonTask(
                    script=IM_SCRIPT,
                    args=[
                        self.run_id, self.output_path, holdout, self.draws,
                        self.nparallel, submit_params,
                    ],
                    name=f'descanso_{self.run_id}_{holdout}_{group_idx}',
                    num_cores=1,
                    # upped from 5 to 20 for variance simulation
                    m_mem_free='20G',
                    max_runtime_seconds=max_seconds,
                    max_attempts=2,
                    tag='stgpr_amp_nsv',
                    queue='all.q',
                    resource_scales=RESOURCE_SCALES,
                    hard_limits=True)

                # Wait on the spacetime job of every location group.
                for loc_group in range(self.nparallel):
                    st_name = (f'st_{self.run_id}_{holdout}_'
                               f'{group_idx}_{loc_group}')
                    descanso_task.add_upstream(self.st_jobs[st_name])

                self.workflow.add_task(descanso_task)
                self.descanso_jobs[descanso_task.name] = descanso_task
Esempio n. 4
0
    def create_apply_correction_tasks(self) -> None:
        """
        Adds one correction task per sex and most detailed location.

        Dependent on the envelope draws and envelope summary mortality
        caching tasks, and on the draw validation task of every best model
        version. Each task is recorded in the task map under the CORRECT
        type and added to the workflow.
        """
        params = self.parameters
        for sex_id in params.sex_ids:
            for location_id in params.most_detailed_location_ids:
                correct_script = os.path.join(
                    self.code_dir, DAG.Executables.CORRECT
                )
                correct_args = [
                    '--action', DAG.Tasks.Type.CORRECT,
                    '--parent_dir', params.parent_dir,
                    '--gbd_round_id', params.gbd_round_id,
                    '--version_id', params.version_id,
                    '--location_id', location_id,
                    '--sex_id', sex_id,
                    '--env_version_id', params.envelope_version_id
                ]
                correct_name = DAG.Tasks.Name.APPLY_CORRECTION.format(
                    location=location_id,
                    sex=sex_id
                )
                # Both envelope caches must exist before correcting.
                envelope_draws_task = self.task_map[DAG.Tasks.Type.CACHE][
                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                        mort_process=MortalityInputs.ENVELOPE_DRAWS
                    )
                ]
                envelope_summary_task = self.task_map[DAG.Tasks.Type.CACHE][
                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                        mort_process=MortalityInputs.ENVELOPE_SUMMARY
                    )
                ]
                correct_task = PythonTask(
                    script=correct_script,
                    args=correct_args,
                    name=correct_name,
                    num_cores=DAG.Tasks.Cores.APPLY_CORRECTION,
                    m_mem_free=DAG.Tasks.Memory.APPLY_CORRECTION,
                    upstream_tasks=[
                        envelope_draws_task,
                        envelope_summary_task
                    ],
                    max_runtime_seconds=DAG.Tasks.Runtime.APPLY_CORRECTION,
                    tag=DAG.Tasks.Type.CORRECT
                )
                # Every best model's draws must be validated first.
                for model_version_id in params.best_model_version_ids:
                    validate_name = DAG.Tasks.Name.VALIDATE_DRAWS.format(
                        model_version_id=model_version_id
                    )
                    validate_task = self.task_map[DAG.Tasks.Type.VALIDATE][
                        validate_name
                    ]
                    correct_task.add_upstream(validate_task)

                correct_tasks = self.task_map[DAG.Tasks.Type.CORRECT]
                correct_tasks[correct_task.name] = correct_task

                self.workflow.add_task(correct_task)
Esempio n. 5
0
    def create_gpr_jobs(self):
        """Register one GPR task per holdout, parameter group and location group.

        Runtime and memory requests are derived from the number of draws and
        tripled for diet models. Every task runs GPR_SCRIPT and waits on the
        descanso job that shares its holdout and parameter group. Created
        tasks are added to the workflow and stored in ``self.gpr_jobs``
        under their task name.
        """
        # set runtime and memory based on draws
        gpr_runtime = 1200
        gpr_memory = 4
        if self.draws == 100:
            gpr_runtime = 1500
            gpr_memory = 7
        elif self.draws == 1000:
            gpr_runtime = 1800
            gpr_memory = 10

        if self.is_diet_model:
            gpr_runtime *= 3
            gpr_memory *= 3

        memory_request = f'{gpr_memory}G'

        for ko in range(self.holdouts + 1):
            for param_group in range(len(self.param_groups)):
                upstream_job = self.descanso_jobs['descanso_{}_{}_{}'.format(
                    self.run_id, ko, param_group)]

                # Identical for every location group, so build it once per
                # parameter group instead of once per task.
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                for loc_group in range(self.nparallel):
                    jname = 'gpr_{}_{}_{}_{}'.format(self.run_id, ko,
                                                     param_group, loc_group)

                    task = PythonTask(script=GPR_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.draws, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=1,
                                      m_mem_free=memory_request,
                                      max_runtime_seconds=gpr_runtime,
                                      max_attempts=2,
                                      tag='stgpr_gpr',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.gpr_jobs[task.name] = task
Esempio n. 6
0
    def create_st_jobs(self):
        """Register one spacetime task per holdout, parameter group and
        location group.

        Every task runs ST_SCRIPT and waits on the stage1 job of its
        holdout. Diet models request more memory and a longer runtime.
        Created tasks are added to the workflow and stored in
        ``self.st_jobs`` under their task name.
        """
        # Resource requests do not vary per task, so set them once up front.
        memory = 50
        runtime = 1500
        if self.is_diet_model:
            memory = 120
            runtime = 28800  # 8 hours
        memory_request = f'{memory}G'

        for ko in range(0, self.holdouts + 1):
            upstream_job = self.stage1_jobs['stage1_{}_{}'.format(
                self.run_id, ko)]
            for param_group in range(0, len(self.param_groups)):
                # Identical for every location group, so build it once per
                # parameter group instead of once per task.
                submit_params = ','.join(
                    [str(x) for x in self.param_groups[param_group]])

                for loc_group in range(0, self.nparallel):
                    jname = 'st_{}_{}_{}_{}'.format(self.run_id, ko,
                                                    param_group, loc_group)

                    task = PythonTask(script=ST_SCRIPT,
                                      args=[
                                          self.run_id, self.output_path, ko,
                                          self.run_type, submit_params,
                                          self.nparallel, loc_group
                                      ],
                                      name=jname,
                                      num_cores=6,
                                      m_mem_free=memory_request,
                                      max_attempts=3,
                                      max_runtime_seconds=runtime,
                                      tag='stgpr_spacetime',
                                      queue='all.q',
                                      resource_scales=RESOURCE_SCALES,
                                      hard_limits=True)

                    task.add_upstream(upstream_job)

                    self.workflow.add_task(task)
                    self.st_jobs[task.name] = task
Esempio n. 7
0
    def create_eval_jobs(self):
        """
        Register the single eval task, which waits on every post job.

        Per the original description of the eval script:

        - For hyperparameter selection runs only (run_type
          "in_sample_selection" or "oos_selection"), it determines the best
          hyperparameter set based on in-sample or out-of-sample RMSE.
        - For runs with only one set of parameters, it sets the
          best_param_set to the *only* param_set (param_set 0) for
          consistency in rake inputs.
        - For all run types, it collects the disparate fit_stats files and
          combines them into a single file, saved as fit_stats.csv.
        """
        eval_args = [
            self.run_id, self.output_path, self.run_type, self.holdouts,
            self.n_parameter_sets,
        ]
        eval_task = PythonTask(
            script=EVAL_SCRIPT,
            args=eval_args,
            name=f'eval_{self.run_id}',
            num_cores=1,
            m_mem_free='500M',
            max_runtime_seconds=180,
            max_attempts=2,
            tag='stgpr_eval',
            queue='all.q',
            resource_scales=RESOURCE_SCALES,
            hard_limits=True)

        # Depend on the post job of every holdout / parameter group pair.
        for holdout in range(self.holdouts + 1):
            for group_idx in range(len(self.param_groups)):
                post_name = f'post_{self.run_id}_{holdout}_{group_idx}'
                eval_task.add_upstream(self.post_jobs[post_name])

        self.workflow.add_task(eval_task)
        self.eval_jobs[eval_task.name] = eval_task
Esempio n. 8
0
    def create_upload_tasks(self) -> None:
        """
        Adds tasks to upload summaries to gbd and / or cod databases.

        One gbd upload task is created per measure and upload type:
        'single' uploads wait on the gbd summarize task of every year and
        location, 'multi' (percent change) uploads wait on the percent
        change summarize task of every year pair and location. When the cod
        database is requested, one additional deaths-only upload task is
        created that waits on the cod summarize tasks.

        Every task is recorded in the task map under the UPLOAD type and
        added to the workflow.
        """
        # Determine if percent change is part of workflow. Truthiness is
        # used (rather than ``is not None``) so that an empty list of start
        # years is treated like None: percent change summarize tasks are
        # only created for a non-empty list, and a 'multi' upload without
        # them would have no upstream dependencies at all.
        if self.parameters.year_start_ids:
            upload_types = ['single', 'multi']
        else:
            upload_types = ['single']

        # gbd upload tasks
        for measure in self.parameters.measure_ids:
            for upload_type in upload_types:
                upload_gbd_task = PythonTask(
                    script=os.path.join(
                        self.code_dir,
                        DAG.Executables.UPLOAD
                    ),
                    args=[
                        '--machine_process', self.parameters.process,
                        '--version_id', self.parameters.version_id,
                        '--database', DataBases.GBD,
                        '--measure_id', measure,
                        '--upload_type', upload_type
                    ],
                    name=DAG.Tasks.Name.UPLOAD.format(
                        database=DataBases.GBD,
                        uploadtype=upload_type,
                        measure=measure
                    ),
                    num_cores=DAG.Tasks.Cores.UPLOAD,
                    m_mem_free=DAG.Tasks.Memory.UPLOAD,
                    max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
                    queue=DAG.UPLOAD_QUEUE,
                    tag=DAG.Tasks.Type.UPLOAD
                )
                # collect the names of the gbd summarization tasks this
                # upload depends on
                if upload_type == 'single':
                    upstream_names = [
                        DAG.Tasks.Name.SUMMARIZE_GBD.format(
                            measure=measure,
                            location=location,
                            year=year
                        )
                        for year in self.parameters.year_ids
                        for location in self.parameters.location_ids
                    ]
                else:
                    # start and end years are paired by position
                    upstream_names = [
                        DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                            measure=measure,
                            location=location,
                            year_start=year_start,
                            year_end=self.parameters.year_end_ids[
                                year_index]
                        )
                        for year_index, year_start in enumerate(
                            self.parameters.year_start_ids)
                        for location in self.parameters.location_ids
                    ]
                for upstream_name in upstream_names:
                    upload_gbd_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.SUMMARIZE][
                            upstream_name
                        ]
                    )
                self.task_map[DAG.Tasks.Type.UPLOAD][
                    upload_gbd_task.name
                ] = upload_gbd_task

                self.workflow.add_task(upload_gbd_task)

        # cod upload tasks - DEATHS only
        if DataBases.COD in self.parameters.databases:
            # NOTE(review): this task runs on DAG.QUEUE while the gbd
            # uploads use DAG.UPLOAD_QUEUE, and it passes no --upload_type;
            # confirm both are intentional.
            upload_cod_task = PythonTask(
                script=os.path.join(
                    self.code_dir,
                    DAG.Executables.UPLOAD),
                args=[
                    '--machine_process', self.parameters.process,
                    '--version_id', self.parameters.version_id,
                    '--database', DataBases.COD,
                    '--measure_id', Measures.Ids.DEATHS,
                ],
                name=DAG.Tasks.Name.UPLOAD.format(
                    database=DataBases.COD,
                    uploadtype='single',
                    measure=Measures.Ids.DEATHS
                ),
                num_cores=DAG.Tasks.Cores.UPLOAD,
                m_mem_free=DAG.Tasks.Memory.UPLOAD,
                max_runtime_seconds=DAG.Tasks.Runtime.UPLOAD,
                queue=DAG.QUEUE,
                tag=DAG.Tasks.Type.UPLOAD
            )
            # add cod summarization tasks as upstream dependencies
            for year in self.parameters.year_ids:
                for location in self.parameters.location_ids:
                    upload_cod_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.SUMMARIZE][
                            DAG.Tasks.Name.SUMMARIZE_COD.format(
                                measure=Measures.Ids.DEATHS,
                                location=location,
                                year=year
                            )
                        ]
                    )
            self.task_map[DAG.Tasks.Type.UPLOAD][
                upload_cod_task.name
            ] = upload_cod_task

            self.workflow.add_task(upload_cod_task)
Esempio n. 9
0
    def create_summarize_tasks(self) -> None:
        """
        Adds tasks to summarize draw level estimates.

        For every measure, year and location a gbd schema summarize task is
        created; a cod schema summarize task is added as well unless the
        measure is YLLs or the cod database was not requested. When start
        years are configured, a percent change summarize task is created
        per measure, start/end year pair and location.

        Every task depends on the envelope draws, envelope summary and
        population mortality caching tasks, as well as on the append shocks
        task of each sex for its location. Tasks are recorded in the task
        map under the SUMMARIZE type and added to the workflow.
        """
        def mortality_cache_upstreams():
            """Return the three mortality cache tasks, in a new list."""
            cache_tasks = self.task_map[DAG.Tasks.Type.CACHE]
            return [
                cache_tasks[
                    DAG.Tasks.Name.CACHE_MORTALITY.format(
                        mort_process=mort_input
                    )
                ]
                for mort_input in (
                    MortalityInputs.ENVELOPE_DRAWS,
                    MortalityInputs.ENVELOPE_SUMMARY,
                    MortalityInputs.POPULATION,
                )
            ]

        def attach_and_register(task, task_location):
            """Add append shocks upstreams, then record and schedule task."""
            for append_sex in self.parameters.sex_ids:
                append_name = DAG.Tasks.Name.APPEND_SHOCKS.format(
                    location=task_location,
                    sex=append_sex
                )
                task.add_upstream(
                    self.task_map[DAG.Tasks.Type.APPEND][append_name]
                )
            self.task_map[DAG.Tasks.Type.SUMMARIZE][task.name] = task
            self.workflow.add_task(task)

        params = self.parameters
        for measure in params.measure_ids:
            for year in params.year_ids:
                for location in params.location_ids:
                    # Create summarize tasks for gbd schema.
                    gbd_script = os.path.join(
                        self.code_dir,
                        DAG.Executables.SUMMARIZE_GBD
                    )
                    gbd_args = [
                        '--parent_dir', params.parent_dir,
                        '--gbd_round_id', params.gbd_round_id,
                        '--location_id', location,
                        '--year_id', year,
                        '--measure_id', measure,
                        '--machine_process', params.process
                    ]
                    summarize_gbd_task = PythonTask(
                        script=gbd_script,
                        args=gbd_args,
                        name=DAG.Tasks.Name.SUMMARIZE_GBD.format(
                            measure=measure, location=location, year=year
                        ),
                        num_cores=DAG.Tasks.Cores.SUMMARIZE,
                        m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=mortality_cache_upstreams(),
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
                    attach_and_register(summarize_gbd_task, location)

                    # The cod schema summary is skipped for YLLs and when
                    # the cod database is not part of this run.
                    needs_cod_summary = (
                        measure != Measures.Ids.YLLS
                        and
                        DataBases.COD in params.databases
                    )
                    if not needs_cod_summary:
                        continue

                    # Create summarize tasks for deaths and cod schema.
                    cod_script = os.path.join(
                        self.code_dir,
                        DAG.Executables.SUMMARIZE_COD
                    )
                    cod_args = [
                        '--version_id', params.version_id,
                        '--parent_dir', params.parent_dir,
                        '--gbd_round_id', params.gbd_round_id,
                        '--location_id', location,
                        '--year_id', year
                    ]
                    summarize_cod_task = PythonTask(
                        script=cod_script,
                        args=cod_args,
                        name=DAG.Tasks.Name.SUMMARIZE_COD.format(
                            measure=Measures.Ids.DEATHS,
                            location=location,
                            year=year
                        ),
                        num_cores=DAG.Tasks.Cores.SUMMARIZE,
                        m_mem_free=DAG.Tasks.Memory.SUMMARIZE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=mortality_cache_upstreams(),
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
                    attach_and_register(summarize_cod_task, location)

            # pct_change summarization tasks
            if not params.year_start_ids:
                continue
            for year_index, year_start in enumerate(params.year_start_ids):
                for pctc_location in params.location_ids:
                    # start and end years are paired by position
                    year_end = params.year_end_ids[year_index]
                    pct_change_script = os.path.join(
                        self.code_dir,
                        DAG.Executables.SUMMARIZE_PCT_CHANGE
                    )
                    pct_change_args = [
                        '--parent_dir', params.parent_dir,
                        '--gbd_round_id', params.gbd_round_id,
                        '--location_id', pctc_location,
                        '--year_start_id', year_start,
                        '--year_end_id', year_end,
                        '--measure_id', measure,
                        '--machine_process', params.process
                    ]
                    summarize_pct_change_task = PythonTask(
                        script=pct_change_script,
                        args=pct_change_args,
                        name=DAG.Tasks.Name.SUMMARIZE_PCT_CHANGE.format(
                            measure=measure, location=pctc_location,
                            year_start=year_start,
                            year_end=year_end
                        ),
                        num_cores=DAG.Tasks.Cores.PCT_CHANGE,
                        m_mem_free=DAG.Tasks.Memory.PCT_CHANGE,
                        max_runtime_seconds=DAG.Tasks.Runtime.SUMMARIZE,
                        queue=DAG.QUEUE,
                        upstream_tasks=mortality_cache_upstreams(),
                        tag=DAG.Tasks.Type.SUMMARIZE
                    )
                    attach_and_register(
                        summarize_pct_change_task, pctc_location
                    )
Esempio n. 10
0
    def create_append_shocks_tasks(self) -> None:
        """
        Adds one append shocks task per sex and location.

        Upstream dependencies depend on the kind of location:

        - most detailed location, YLLs requested: the calc ylls task for
          that location and sex;
        - most detailed location, YLLs not requested: the cause aggregation
          task for that location and sex;
        - any other location: every location aggregation task (all
          location sets, years, measures and aggregation types, except
          unscaled aggregation for YLLs).

        Each task is recorded in the task map under the APPEND type and
        added to the workflow.
        """
        params = self.parameters
        for sex_id in params.sex_ids:
            for location_id in params.location_ids:
                is_most_detailed = (
                    location_id in params.most_detailed_location_ids
                )
                measure_arg = " ".join(str(x) for x in params.measure_ids)
                append_task = PythonTask(
                    script=os.path.join(
                        self.code_dir, DAG.Executables.APPEND_SHOCKS
                    ),
                    args=[
                        '--parent_dir', params.parent_dir,
                        '--machine_process', params.process,
                        '--measure_ids', measure_arg,
                        '--location_id', location_id,
                        '--most_detailed_location', is_most_detailed,
                        '--sex_id', sex_id
                    ],
                    name=DAG.Tasks.Name.APPEND_SHOCKS.format(
                        location=location_id, sex=sex_id
                    ),
                    num_cores=DAG.Tasks.Cores.APPEND_SHOCKS,
                    m_mem_free=DAG.Tasks.Memory.APPEND_SHOCKS,
                    max_runtime_seconds=DAG.Tasks.Runtime.APPEND,
                    tag=DAG.Tasks.Type.APPEND
                )

                if not is_most_detailed:
                    # Add location aggregation tasks as upstream dependency.
                    for location_set_id in params.location_set_ids:
                        for agg_year in params.year_ids:
                            for agg_measure in params.measure_ids:
                                for agg_type in (
                                    LocationAggregation.Type.CODCORRECT
                                ):
                                    # No unscaled aggregation task is
                                    # depended on for YLLs.
                                    skip_unscaled_ylls = (
                                        FilePaths.UNSCALED_DIR in agg_type
                                        and
                                        agg_measure == Measures.Ids.YLLS
                                    )
                                    if skip_unscaled_ylls:
                                        continue
                                    loc_agg_name = (
                                        DAG.Tasks.Name.LOCATION_AGGREGATION
                                        .format(
                                            aggregation_type=(
                                                agg_type.replace("/", "_")
                                            ),
                                            location_set=location_set_id,
                                            measure=agg_measure,
                                            year=agg_year,
                                        )
                                    )
                                    append_task.add_upstream(
                                        self.task_map[
                                            DAG.Tasks.Type.LOC_AGG
                                        ][loc_agg_name]
                                    )
                elif Measures.Ids.YLLS in params.measure_ids:
                    # attach calc ylls jobs as upstream
                    calc_ylls_name = DAG.Tasks.Name.CALC_YLLS.format(
                        location=location_id,
                        sex=sex_id
                    )
                    append_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.CALCULATE][
                            calc_ylls_name
                        ]
                    )
                else:
                    # add cause agg jobs as upstream.
                    cause_agg_name = DAG.Tasks.Name.CAUSE_AGGREGATION.format(
                        location=location_id,
                        sex=sex_id
                    )
                    append_task.add_upstream(
                        self.task_map[DAG.Tasks.Type.CAUSE_AGG][
                            cause_agg_name
                        ]
                    )

                self.task_map[DAG.Tasks.Type.APPEND][
                    append_task.name
                ] = append_task
                self.workflow.add_task(append_task)
Esempio n. 11
0
    def create_location_aggregation_tasks(self) -> None:
        """
        Create and register the location aggregation tasks.

        One task is built per combination of location set, year, measure
        and CODCORRECT aggregation type. Combinations that pair the YLLs
        measure with an aggregation type containing FilePaths.UNSCALED_DIR
        are skipped.

        Upstream dependencies attached to every task, in this order:
            1. the regional scalars cache task;
            2. for each sex and most detailed location, the calculate-YLLs
               task when the measure is YLLs, otherwise the cause
               aggregation task;
            3. for location sets other than OUTPUTS, SDI and STANDARD, the
               equivalent aggregation task of the OUTPUTS location set.
               That task is looked up in the task map here, so it is
               expected to have been created by an earlier iteration.

        Each task is stored in the LOC_AGG section of the task map under its
        own name and then added to the workflow.
        """
        for location_set_id in self.parameters.location_set_ids:
            # Special location sets wait on the OUTPUTS location set so it
            # finishes first.
            waits_on_outputs = location_set_id not in [
                LocationSetId.OUTPUTS, LocationSetId.SDI,
                LocationSetId.STANDARD
            ]
            for year in self.parameters.year_ids:
                for measure in self.parameters.measure_ids:
                    is_ylls = measure == Measures.Ids.YLLS
                    for loc_agg_type in LocationAggregation.Type.CODCORRECT:
                        if is_ylls and FilePaths.UNSCALED_DIR in loc_agg_type:
                            continue

                        # Task names use the aggregation type with path
                        # separators replaced.
                        agg_label = loc_agg_type.replace("/","_")
                        executable = os.path.join(
                            self.code_dir, DAG.Executables.LOC_AGG
                        )
                        cache_task = self.task_map[DAG.Tasks.Type.CACHE][
                            DAG.Tasks.Name.CACHE_REGIONAL_SCALARS
                        ]
                        cli_args = [
                            '--action', DAG.Tasks.Type.LOC_AGG,
                            '--parent_dir', self.parameters.parent_dir,
                            '--gbd_round_id', self.parameters.gbd_round_id,
                            '--aggregation_type', loc_agg_type,
                            '--location_set_id', location_set_id,
                            '--year_id', year,
                            '--measure_id', measure
                        ]
                        task_name = DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                            aggregation_type=agg_label,
                            location_set=location_set_id,
                            measure=measure,
                            year=year
                        )
                        agg_task = PythonTask(
                            script=executable,
                            args=cli_args,
                            name=task_name,
                            num_cores=DAG.Tasks.Cores.LOCATION_AGGREGATION,
                            m_mem_free=DAG.Tasks.Memory.LOCATION_AGGREGATION,
                            max_runtime_seconds=(
                                DAG.Tasks.Runtime.LOCATION_AGGREGATION
                            ),
                            queue=DAG.QUEUE,
                            upstream_tasks=[cache_task],
                            tag=DAG.Tasks.Type.LOC_AGG
                        )

                        # YLLs (measure 4) aggregate the calc ylls jobs; any
                        # other measure aggregates the cause agg jobs.
                        if is_ylls:
                            upstream_type = DAG.Tasks.Type.CALCULATE
                            upstream_name = DAG.Tasks.Name.CALC_YLLS
                        else:
                            upstream_type = DAG.Tasks.Type.CAUSE_AGG
                            upstream_name = DAG.Tasks.Name.CAUSE_AGGREGATION

                        # Every sex / most detailed location pair is an
                        # upstream dependency.
                        for sex in self.parameters.sex_ids:
                            for loc in (
                                self.parameters.most_detailed_location_ids
                            ):
                                upstream_key = upstream_name.format(
                                    location=loc,
                                    sex=sex
                                )
                                agg_task.add_upstream(
                                    self.task_map[upstream_type][upstream_key]
                                )

                        if waits_on_outputs:
                            outputs_key = (
                                DAG.Tasks.Name.LOCATION_AGGREGATION.format(
                                    aggregation_type=agg_label,
                                    location_set=LocationSetId.OUTPUTS,
                                    measure=measure,
                                    year=year
                                )
                            )
                            agg_task.add_upstream(
                                self.task_map[DAG.Tasks.Type.LOC_AGG][
                                    outputs_key
                                ]
                            )

                        loc_agg_tasks = self.task_map[DAG.Tasks.Type.LOC_AGG]
                        loc_agg_tasks[agg_task.name] = agg_task

                        self.workflow.add_task(agg_task)
Esempio n. 12
0
    def create_rake_jobs(self):
        """Create one rake task per entry in self.subnat_locations.

        Raking is only run on KO 0 (the holdout argument passed to the
        script is always 0).

        Memory and runtime are read from self.rake_memory_df for the
        location and rounded up to whole numbers. When self.draws is 1000,
        memory is doubled and runtime tripled with a 7200 second floor; when
        self.is_diet_model is set the same scaling is applied again with a
        14400 second floor.

        Upstreams: with no holdouts, every GPR job (KO 0) whose location
        group shares at least one location with the rows of self.locs
        whose national-level column equals this location; with holdouts,
        the single eval job for the run.
        """
        for loc in self.subnat_locations:
            # Base resource request for this location.
            resources = self.rake_memory_df.query(f'location == {loc}')
            mem = int(np.ceil(resources['mem'].iat[0]))
            rt = int(np.ceil(resources['runtime'].iat[0]))

            # Each applicable condition doubles memory and triples runtime,
            # subject to its own minimum runtime.
            scalings = (
                (self.draws == 1000, 7200),
                (self.is_diet_model, 14400),
            )
            for applies, min_runtime in scalings:
                if applies:
                    mem *= 2
                    rt = max(rt * 3, min_runtime)

            script_args = [
                self.run_id, self.output_path, 0, self.draws,
                self.run_type, self.rake_logit, loc
            ]
            task = PythonTask(
                script=RAKE_SCRIPT,
                args=script_args,
                name=f'rake_{self.run_id}_{loc}',
                num_cores=1,
                m_mem_free=f'{mem}G',
                max_runtime_seconds=rt,
                max_attempts=2,
                tag='stgpr_rake',
                queue='all.q',
                resource_scales=RESOURCE_SCALES,
                hard_limits=True,
            )

            # All location_ids (country and subnationals) whose
            # national-level column matches this location.
            lvl = 'level_{}'.format(NATIONAL_LEVEL)
            in_country = self.locs[lvl] == loc
            all_needed_locs = self.locs.loc[in_country, 'location_id'].unique()

            if self.holdouts != 0:
                eval_key = 'eval_{}'.format(self.run_id)
                task.add_upstream(self.eval_jobs[eval_key])
            else:
                # Depend on each GPR job that contains a national or
                # subnational location needed for raking.
                for param_group in range(len(self.param_groups)):
                    for loc_group in range(self.nparallel):
                        loc_group_vals = self.parallel_groups[loc_group]
                        shared = intersection(all_needed_locs.tolist(),
                                              loc_group_vals.tolist())
                        if len(shared) > 0:
                            gpr_key = 'gpr_{}_0_{}_{}'.format(
                                self.run_id, param_group, loc_group)
                            task.add_upstream(self.gpr_jobs[gpr_key])

            self.workflow.add_task(task)
            self.rake_jobs[task.name] = task