Example #1
    def create_shock_and_correct_jobs(self):
        """First set of tasks, no upstream tasks."""
        slots, mem = self.calculate_slots_and_memory(self.year_ids, 3)
        for loc in self.most_detailed_locations:
            for sex in self.sex_ids:
                shock_task = PythonTask(
                    script=os.path.join(self.code_dir, 'shocks.py'),
                    args=[
                        '--output_version_id', self.version_id,
                        '--location_id', loc, '--sex_id', sex
                    ],
                    name='shocks_{version}_{loc}_{sex}'.format(
                        version=self.version_id, loc=loc, sex=sex),
                    slots=slots,
                    mem_free=mem,
                    max_attempts=3,
                    tag='shock')
                self.task_dag.add_task(shock_task)
                self.shock_jobs_by_command[shock_task.name] = shock_task

                correct_task = PythonTask(
                    script=os.path.join(self.code_dir, 'correct.py'),
                    args=[
                        '--output_version_id', self.version_id,
                        '--location_id', loc, '--sex_id', sex
                    ],
                    name='correct_{version}_{loc}_{sex}'.format(
                        version=self.version_id, loc=loc, sex=sex),
                    slots=slots,
                    mem_free=mem,
                    max_attempts=3,
                    tag='correct')
                self.task_dag.add_task(correct_task)
                self.correct_jobs_by_command[correct_task.name] = correct_task
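Several of these examples size their jobs with a calculate_slots_and_memory helper that is not shown here. A minimal sketch of what such a helper could look like, assuming slots scale with the number of years and memory is twice the slot count (mirroring the memory = slots * 2 convention in Example #11); the real implementation may differ:

def calculate_slots_and_memory(self, year_ids, base):
    """Hypothetical sizing rule; the actual heuristic is not shown
    in the source."""
    slots = base * len(year_ids)
    return slots, slots * 2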
Example #2
 def create_append_diagnostic_jobs(self):
     """Create the append-diagnostics task, downstream of append-shock jobs."""
     slots, mem = (18, 36)
     task = PythonTask(script=os.path.join(self.code_dir,
                                           'append_diagnostics.py'),
                       args=['--output_version_id', self.version_id],
                       name='append_diagnostics_{version}'.format(
                           version=self.version_id),
                       slots=slots,
                       mem_free=mem,
                       max_attempts=3,
                       tag='append_diag')
     for job in self.append_shock_jobs_by_command.values():
         task.add_upstream(job)
     self.task_dag.add_task(task)
     self.append_diag_jobs_by_command[task.name] = task
Example #3
 def create_yll_jobs(self):
     """Create one YLL task per most-detailed location."""
     slots, mem = self.calculate_slots_and_memory(self.year_ids, 3)
     for loc in self.most_detailed_locations:
         task = PythonTask(script=os.path.join(self.code_dir, 'ylls.py'),
                           args=[
                               '--output_version_id', self.version_id,
                               '--location_id', loc
                           ],
                           name='ylls_{version}_{loc}'.format(
                               version=self.version_id, loc=loc),
                           slots=slots,
                           mem_free=mem,
                           max_attempts=3,
                           tag='ylls')
         # add cause_agg upstream dependencies
         task.add_upstream(self.agg_cause_jobs_by_command[
             'agg_cause_{version}_{loc}'.format(version=self.version_id,
                                                loc=loc)])
         self.task_dag.add_task(task)
         self.ylls_jobs_by_command[task.name] = task
Example #4
 def create_post_scriptum_upload(self):
     """Create post-scriptum upload tasks for the cod and gbd databases."""
     slots, mem = (1, 2)
     for db in self.databases:
         if db in ['cod', 'gbd']:
             task = PythonTask(
                 script=os.path.join(self.code_dir,
                                     'post_scriptum_upload.py'),
                 args=[
                     '--output_version_id', self.version_id, '--db', db,
                     '{}'.format('--test' if self.db_env == 'dev' else '')
                 ],
                 name=('post_scriptum_upload_{version}_{db}'.format(
                     version=self.version_id, db=db)),
                 slots=slots,
                 mem_free=mem,
                 max_attempts=1,
                 tag='post_scriptum_upload')
             upload_jobs = list(self.upload_jobs_by_command.values())
             for job in upload_jobs:
                 task.add_upstream(job)
             self.task_dag.add_task(task)
Example #5
 def create_summary_jobs(self):
     """Create per-location summary tasks for the gbd and cod databases."""
     for loc in self.all_locations:
         for db in ['gbd', 'cod']:
             slots, mem = (15, 30) if db == 'cod' else (26, 52)
             task = PythonTask(script=os.path.join(self.code_dir,
                                                   'summary.py'),
                               args=[
                                   '--output_version_id', self.version_id,
                                   '--location_id', loc, '--db', db
                               ],
                               name='summary_{version}_{loc}_{db}'.format(
                                   version=self.version_id, loc=loc, db=db),
                               slots=slots,
                               mem_free=mem,
                               max_attempts=3,
                               tag='summary')
             task.add_upstream(self.append_shock_jobs_by_command[
                 'append_shocks_{version}_{loc}'.format(
                     version=self.version_id, loc=loc)])
             self.task_dag.add_task(task)
             self.summarize_jobs_by_command[task.name] = task
Example #6
 def create_agg_cause_jobs(self):
     """Create cause-aggregation tasks, downstream of shock and correct jobs."""
     slots, mem = self.calculate_slots_and_memory(self.year_ids, 7)
     for loc in self.most_detailed_locations:
         task = PythonTask(script=os.path.join(self.code_dir,
                                               'aggregate_causes.py'),
                           args=[
                               '--output_version_id', self.version_id,
                               '--location_id', loc
                           ],
                           name='agg_cause_{version}_{loc}'.format(
                               version=self.version_id, loc=loc),
                           slots=slots,
                           mem_free=mem,
                           max_attempts=3,
                           tag='agg_cause')
         # add shock/correct upstream dependencies
         for sex in self.sex_ids:
             task.add_upstream(self.shock_jobs_by_command[
                 'shocks_{version}_{loc}_{sex}'.format(
                     version=self.version_id, loc=loc, sex=sex)])
             task.add_upstream(self.correct_jobs_by_command[
                 'correct_{version}_{loc}_{sex}'.format(
                     version=self.version_id, loc=loc, sex=sex)])
         self.task_dag.add_task(task)
         self.agg_cause_jobs_by_command[task.name] = task
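A recurring design across these examples: each task is stored in a dict keyed by its formatted name, so later builders can reconstruct exactly the same name to look up an upstream task. A stripped-down illustration of the pattern (the dict and function names here are illustrative, not from the source):

shock_jobs_by_command = {}

def register(task):
    # key the task by the name it was constructed with
    shock_jobs_by_command[task.name] = task

def wire_shock_upstream(task, version, loc, sex):
    # rebuild the upstream name with the same format string used at creation
    upstream = shock_jobs_by_command['shocks_{}_{}_{}'.format(version, loc, sex)]
    task.add_upstream(upstream)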
Example #7
    def get_task(self, measure_id, location_id):
        """Build a summary task whose upstreams depend on whether the
        location is most detailed or an aggregate.
        """
        upstream_tasks = []
        d = self.como_version.nonfatal_dimensions.get_simulation_dimensions(
            measure_id)

        # if the location is not most detailed, it depends on location aggregation
        if location_id not in d.index_dim.get_level("location_id"):
            location_set_version_id = self.agg_loc_set_map[location_id]
            for component in self.como_version.components:
                for year_id in self._year_set:
                    for sex_id in [1, 2]:
                        if not (component == "impairment" and measure_id == 6):
                            task_name = LocationAggTaskFactory.get_task_name(
                                component=component,
                                year_id=year_id,
                                sex_id=sex_id,
                                measure_id=measure_id,
                                location_set_version_id=location_set_version_id
                            )
                            upstream_tasks.append(self.task_registry[task_name])
        # otherwise it is dependent on simulation tasks or incidence
        else:
            if measure_id == 6:
                for sex_id in [1, 2]:
                    task_name = IncidenceTaskFactory.get_task_name(
                        location_id=location_id, sex_id=sex_id)
                    upstream_tasks.append(self.task_registry[task_name])
            else:
                for year_id in self._year_set:
                    for sex_id in [1, 2]:
                        task_name = SimulationTaskFactory.get_task_name(
                            location_id=location_id,
                            sex_id=sex_id,
                            year_id=year_id)
                        upstream_tasks.append(self.task_registry[task_name])

        name = self.get_task_name(measure_id, location_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--measure_id", measure_id, "--location_id",
                              location_id
                          ],
                          name=name,
                          upstream_tasks=upstream_tasks,
                          slots=25,
                          mem_free=300,
                          max_attempts=5,
                          max_runtime=(60 * 60 * 6),
                          tag="summary")
        self.task_registry[name] = task
        return task
Example #8
 def create_agg_location_jobs(self):
     """Create location-aggregation tasks per location set, data type,
     measure, and year.
     """
     slots, mem = 10, 100
     for loc_set in self.location_set_ids:
         for measure in self.measure_ids:
             for data_type in ['shocks', 'unscaled', 'rescaled']:
                 if data_type == 'unscaled' and measure == 4:
                     continue
                 for year_id in self.year_ids:
                     task = PythonTask(
                         script=os.path.join(self.code_dir,
                                             'aggregate_locations.py'),
                         args=[
                             '--output_version_id', self.version_id,
                             '--df_type', data_type, '--measure_id',
                             measure, '--location_set_id', loc_set,
                             '--year_id', year_id
                         ],
                         name=('agg_locations_{}_{}_{}_{}_{}'.format(
                             self.version_id, data_type, measure, loc_set,
                             year_id)),
                         slots=slots,
                         mem_free=mem,
                         max_attempts=5,
                         tag='agg_location')
                     for loc in self.most_detailed_locations:
                         if measure == 4:
                             task.add_upstream(self.ylls_jobs_by_command[
                                 'ylls_{}_{}'.format(self.version_id, loc)])
                         else:
                             task.add_upstream(
                                 self.agg_cause_jobs_by_command[
                                     'agg_cause_{}_{}'.format(
                                         self.version_id, loc)])
                     # Some of our special locations for final round
                     # estimates treat otherwise aggregated locations as
                     # most-detailed locations. This will throw an
                     # AssertionError in the aggregator if it cannot find
                     # the aggregate location's file. This if block ensures
                     # that the primary estimation location set (35) is run
                     # first before these special location aggregation jobs
                     # are run. This will slow down CoDCorrect overall.
                     if loc_set in SPECIAL_LOCATIONS:
                         task.add_upstream(self.agg_loc_jobs_by_command[
                             'agg_locations_{}_{}_{}_{}_{}'.format(
                                 self.version_id, data_type, measure, 35,
                                 year_id)])
                     self.task_dag.add_task(task)
                     self.agg_loc_jobs_by_command[task.name] = task
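SPECIAL_LOCATIONS is referenced but not defined in this snippet; from the membership test above it is some module-level collection of location set IDs whose aggregation must wait on the primary estimation set (35). A hypothetical placeholder:

# Hypothetical definition: location set IDs that treat aggregates of the
# primary estimation set as most-detailed locations, so set 35 must run first.
SPECIAL_LOCATIONS = frozenset()  # populate with the special location set IDs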
Example #9
    def get_task(self, component, year_id, sex_id, measure_id,
                 location_set_version_id, redis_host):
        """Build a location-aggregation task over one location set."""
        loc_tree = dbtrees.loctree(
            location_set_version_id=location_set_version_id)
        # put all aggregate locations in a mapping dict for summary dependency
        agg_locs = [
            node for node in loc_tree.node_ids
            if node not in [x.id for x in loc_tree.leaves()]
        ]
        for location_id in agg_locs:
            self.agg_loc_set_map[location_id] = location_set_version_id

        # hunt down upstream task list
        upstream_tasks = []
        for location_id in [node.id for node in loc_tree.leaves()]:
            if measure_id == 6:
                upstream_name = IncidenceTaskFactory.get_task_name(
                    location_id, sex_id)
            elif measure_id == 5 and component in [
                    "sequela", "injuries", "impairment"
            ]:
                upstream_name = SimulationInputTaskFactory.get_task_name(
                    location_id, sex_id)
            else:
                upstream_name = SimulationTaskFactory.get_task_name(
                    location_id, sex_id, year_id)
            upstream_tasks.append(self.task_registry[upstream_name])

        name = self.get_task_name(component, year_id, sex_id, measure_id,
                                  location_set_version_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--component", component, "--year_id", year_id,
                              "--sex_id", sex_id, "--measure_id", measure_id,
                              "--location_set_version_id",
                              location_set_version_id, "--redis_host",
                              redis_host
                          ],
                          name=name,
                          upstream_tasks=upstream_tasks,
                          slots=25,
                          mem_free=300,
                          max_attempts=5,
                          max_runtime=(60 * 60 * 10),
                          tag="loc_agg")
        self.task_registry[name] = task
        return task
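Each factory pairs get_task with a get_task_name method, and Example #7 rebuilds those names to find its upstreams, so the naming scheme must be deterministic. A plausible sketch, assuming underscore-joined fields (the actual scheme is not shown in the source):

class LocationAggTaskFactory:
    @staticmethod
    def get_task_name(component, year_id, sex_id, measure_id,
                      location_set_version_id):
        # any stable, collision-free encoding works, as long as
        # producers and consumers build names identically
        return 'loc_agg_{}_{}_{}_{}_{}'.format(
            component, year_id, sex_id, measure_id, location_set_version_id)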
Example #10
 def get_task(self, location_id, sex_id, n_processes):
     """Build an incidence task and register it by name."""
     name = self.get_task_name(location_id, sex_id)
     task = PythonTask(script=this_file,
                       args=[
                           "--como_dir", self.como_version.como_dir,
                           "--location_id", location_id, "--sex_id", sex_id,
                           "--n_processes", n_processes
                       ],
                       name=name,
                       slots=25,
                       mem_free=100,
                       max_attempts=5,
                       max_runtime=(60 * 60 * 6),
                       tag="inci")
     self.task_registry[name] = task
     return task
Example #11
 def create_imported_cases_jobs(self):
     """Generates the tasks and adds them to the task_dag."""
     slots = 38
     memory = slots * 2
     for cause in self.cause_ids:
         task = PythonTask(
             script=os.path.join(self.code_dir, 'imported_cases.py'),
             args=[
                 self.version_id, '--cause_id', cause, '--gbd_round_id',
                 self.gbd_round_id, '--output_dir', self.out_dir
             ],
             name='imported_cases_{}_{}'.format(self.version_id, cause),
             slots=slots,
             mem_free=memory,
             max_attempts=3,
             tag='imported_cases')
         self.task_dag.add_task(task)
Example #12
 def create_append_shock_jobs(self):
     """Create an append-shocks task for every location."""
     slots, mem = self.calculate_slots_and_memory(self.year_ids, 7)
     for loc in self.all_locations:
         task = PythonTask(script=os.path.join(self.code_dir,
                                               'append_shocks.py'),
                           args=[
                               '--output_version_id', self.version_id,
                               '--location_id', loc
                           ],
                           name='append_shocks_{version}_{loc}'.format(
                               version=self.version_id, loc=loc),
                           slots=slots,
                           mem_free=mem,
                           max_attempts=3,
                           tag='append_shock')
         # for job in self.agg_loc_jobs_by_command.values():
         #     task.add_upstream(job)
         self.task_dag.add_task(task)
         self.append_shock_jobs_by_command[task.name] = task
Example #13
    def get_task(self, location_id, sex_id, year_id, n_simulants, n_processes):
        """Build a simulation task downstream of its simulation-input task."""
        # get upstream
        dep_name = SimulationInputTaskFactory.get_task_name(
            location_id, sex_id)
        dep = self.task_registry[dep_name]

        # make task
        name = self.get_task_name(location_id, sex_id, year_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--location_id", location_id, "--sex_id", sex_id,
                              "--year_id", year_id, "--n_simulants",
                              n_simulants, "--n_processes", n_processes
                          ],
                          name=name,
                          upstream_tasks=[dep],
                          slots=25,
                          mem_free=100,
                          max_attempts=5,
                          max_runtime=(60 * 60 * 3),
                          tag="sim")
        self.task_registry[name] = task
        return task
Example #14
 def create_upload_jobs(self):
     """Create upload tasks per measure, database, and change option."""
     slots, mem = (10, 20)
     for measure in self.measure_ids:
         for db in self.databases:
             # cod and codcorrect databases only upload measure 1: deaths.
             if measure == 4 and db in ['cod', 'codcorrect']:
                 continue
             # cod and gbd db have separate test and production servers to
             # choose from. The codcorrect db doesn't have a test server
             if db in ['cod', 'gbd']:
                 conn_def = self.CONN_DEF_MAP[db][self.db_env]
             else:
                 conn_def = 'codcorrect'
             for change in self.pct_change:
                 # the codcorrect and cod databases do not upload percent change.
                 if change and db in ['codcorrect', 'cod']:
                     continue
                 task = PythonTask(
                     script=os.path.join(self.code_dir, 'upload.py'),
                     args=[
                         '--output_version_id', self.version_id, '--db', db,
                         '--measure_id', measure, '--conn_def', conn_def,
                         '{}'.format('--change' if change else '')
                     ],
                     name='upload_{version}_{db}_{meas}_{change}'.format(
                         version=self.version_id,
                         db=db,
                         meas=measure,
                         change=change),
                     slots=slots,
                     mem_free=mem,
                     max_attempts=3,
                     tag='upload')
                 if db in ['cod', 'gbd']:
                     for loc in self.all_locations:
                         task.add_upstream(self.summarize_jobs_by_command[
                             'summary_{version}_{loc}_{db}'.format(
                                 version=self.version_id, loc=loc, db=db)])
                 else:
                     for job in self.append_diag_jobs_by_command.values():
                         task.add_upstream(job)
                 self.task_dag.add_task(task)
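CONN_DEF_MAP is not defined in this snippet; the lookup self.CONN_DEF_MAP[db][self.db_env] implies a nested mapping from database name to environment to connection definition. A hypothetical shape (the real conn_def strings depend on the site's database configuration):

# Hypothetical shape only; actual connection definitions are site-specific
CONN_DEF_MAP = {
    'cod': {'dev': 'cod-test', 'prod': 'cod'},
    'gbd': {'dev': 'gbd-test', 'prod': 'gbd'},
}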
Example #15
def run_master(root_dir,
               envr,
               sweep_lt,
               sweep_yld,
               sweep_hale,
               prep_lt,
               prep_yld,
               calc_hale,
               summarize,
               upload_hale,
               n_draws,
               loc_set_id,
               year_id,
               yld_version,
               local,
               test_location,
               custom_lt,
               log_dir='DIRECTORY'):
    """Start jobmon and launch the different jobs. Also set up
    directories, and run get_population to cache population for the
    compile_yld step.
    """
    if not os.path.isdir(log_dir):
        os.mkdir(log_dir)

    if local:
        out_dir = root_dir
    else:
        out_dir = 'DIRECTORY'

    parameter_csv.run_param(envr,
                            yld_version,
                            loc_set_id,
                            year_id,
                            gbd_round_id=GBD_ROUND_ID)

    param_sheet = pd.read_csv('%s/inputs/parameters.csv' % root_dir)
    param_sheet = param_sheet.loc[param_sheet['status'] == 'best']

    hale_version = param_sheet['hale_version'].item()
    mort_version = param_sheet['mort_run'].item()
    print('HALE VERSION IS {}'.format(hale_version))
    print('MORT VERSION IS {}'.format(mort_version))
    print('YLD VERSION IS {}'.format(yld_version))

    prog_dir = '%s/v%s' % (out_dir, hale_version)
    draw_dir = '%s/draws' % prog_dir
    summ_dir = '%s/summaries' % prog_dir

    for direc in [prog_dir, draw_dir, summ_dir]:
        if not os.path.isdir(direc):
            os.mkdir(direc)
        os.chmod(direc, 0o777)

    if custom_lt is not None:
        lt_in = custom_lt
    else:
        lt_in = ("DIRECTORY")

    lt_tmp = '%s/lt' % draw_dir
    lt_dir = '%s/lt' % summ_dir
    yld_tmp = '%s/yld' % draw_dir
    yld_dir = '%s/yld' % summ_dir
    hale_tmp = '%s/results' % draw_dir
    hale_dir = '%s/results' % summ_dir

    sweep([lt_tmp, lt_dir], sweep_lt)
    sweep([yld_tmp, yld_dir], sweep_yld)
    sweep([hale_tmp, hale_dir], sweep_hale)

    err = glob('{}/*.e*'.format(log_dir))
    out = glob('{}/*.o*'.format(log_dir))
    ps = glob('{}/*.p*'.format(log_dir))
    for log in err + out + ps:
        os.remove(log)

    if test_location is not None:
        locations = [test_location]
    else:
        locations = []
        for location_set in loc_set_id:
            location_meta = get_location_metadata(location_set_id=location_set,
                                                  gbd_round_id=GBD_ROUND_ID)
            location_meta = location_meta.loc[
                location_meta['location_id'] != 44620]
            locs = location_meta['location_id'].unique().tolist()
            locations = locations + locs
        locations = list(set(locations))

    year_draws = list(zip(year_id, n_draws))

    d_str = "[%m/%d/%Y %H:%M:%S]"
    wf = Workflow('HALE_{}'.format(datetime.now().strftime(d_str)),
                  project='proj_hale',
                  stderr=log_dir,
                  stdout=log_dir)

    print('Building DAG')
    if prep_lt:
        lt_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = [
                    '--lt_in', lt_in, '--lt_tmp', lt_tmp, '--location',
                    location, '--year', year, '--n_draws', draws
                ]
                script = os.path.join(root_dir, '01_compile_lt.py')
                name = 'lt_{}_{}_prep'.format(location, year)
                lt_task[(location, year)] = PythonTask(script=script,
                                                       args=args,
                                                       name=name,
                                                       slots=4,
                                                       mem_free=8,
                                                       max_attempts=3,
                                                       tag='lt_prep')
                wf.add_task(lt_task[(location, year)])

    if prep_yld:
        population = get_population(location_id=locations,
                                    year_id=year_id,
                                    age_group_id='all',
                                    sex_id='all',
                                    gbd_round_id=GBD_ROUND_ID)
        population.drop('run_id', axis=1, inplace=True)
        population.set_index('location_id', inplace=True)
        population.to_csv('%s/inputs/pop.csv' % root_dir)

        yld_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = [
                    '--yld_tmp', yld_tmp, '--root_dir', root_dir, '--location',
                    location, '--yld_version', yld_version, '--year', year,
                    '--n_draws', draws
                ]
                script = os.path.join(root_dir, '02_compile_yld.py')
                name = 'yld_{}_{}_prep'.format(location, year)
                yld_task[(location, year)] = PythonTask(script=script,
                                                        args=args,
                                                        name=name,
                                                        slots=4,
                                                        mem_free=8,
                                                        max_attempts=3,
                                                        tag='yld_prep')
                wf.add_task(yld_task[(location, year)])

    if calc_hale:
        hale_task = {}
        for location in locations:
            for year in year_id:
                if prep_yld and prep_lt:
                    upstream_tasks = [
                        lt_task[(location, year)], yld_task[(location, year)]
                    ]
                elif prep_yld:
                    upstream_tasks = [yld_task[(location, year)]]
                elif prep_lt:
                    upstream_tasks = [lt_task[(location, year)]]
                else:
                    upstream_tasks = None
                args = [
                    '--hale_tmp', hale_tmp, '--lt_tmp', lt_tmp, '--yld_tmp',
                    yld_tmp, '--location', location, '--year', year
                ]
                script = os.path.join(root_dir, '03_calc_hale.py')
                name = 'hale_{}_{}_calc'.format(location, year)
                hale_task[(location,
                           year)] = PythonTask(script=script,
                                               args=args,
                                               name=name,
                                               slots=4,
                                               mem_free=8,
                                               max_attempts=3,
                                               tag='hale_calc',
                                               upstream_tasks=upstream_tasks)
                wf.add_task(hale_task[(location, year)])

    if summarize:
        summary_task = {}
        for location in locations:
            if calc_hale:
                upstream_tasks = [
                    hale_task[(location, year)] for year in year_id
                ]
            else:
                upstream_tasks = None
            args = [
                '--lt_tmp', lt_tmp, '--lt_dir', lt_dir, '--yld_tmp', yld_tmp,
                '--yld_dir', yld_dir, '--hale_tmp', hale_tmp, '--hale_dir',
                hale_dir, '--location', location
            ]
            script = os.path.join(root_dir, '04_calc_summaries.py')
            name = 'summary_{}_calc'.format(location)
            summary_task[location] = PythonTask(script=script,
                                                args=args,
                                                name=name,
                                                slots=4,
                                                mem_free=8,
                                                max_attempts=3,
                                                tag='summarize',
                                                upstream_tasks=upstream_tasks)
            wf.add_task(summary_task[location])

    if upload_hale:
        if summarize:
            upstream_tasks = [summary_task[loc] for loc in locations]
        else:
            upstream_tasks = None
        args = [
            '--hale_version', hale_version, '--hale_dir', hale_dir, '--envr',
            envr
        ]
        script = os.path.join(root_dir, '05_upload_hale.py')
        name = 'upload_hale'
        upload_task = PythonTask(script=script,
                                 args=args,
                                 name=name,
                                 slots=12,
                                 mem_free=24,
                                 max_attempts=3,
                                 tag='upload',
                                 upstream_tasks=upstream_tasks)
        wf.add_task(upload_task)

    print("executing workflow")
    integer_result = wf.execute()
    if integer_result:
        raise RuntimeError("Workflow failure")
    print("FINISHED")