Example #1
0
 def test_default(self):
     '''Build DEFAULT-host worker groups from several command lines.

     Each case parses a launcher command line, constructs a WorkerGroup
     with no scheduler-provided worker string, and checks the worker
     count plus the node count and ranks-per-node of the first worker.
     '''
     # (command line, expected number of workers). The first case does
     # not pass --max-ranks-per-node, yet 4 ranks per node is still
     # expected for it below.
     cases = (
         ('--consume-all --num-workers 1', 1),
         ('--consume-all --num-workers 3 --max-ranks-per-node 4', 3),
     )

     for cmdline, expected_workers in cases:
         parsed = get_args(cmdline.split())
         wgroup = worker.WorkerGroup(parsed, host_type='DEFAULT',
                                     workers_str=None)
         self.assertEqual(len(wgroup.workers), expected_workers)
         self.assertEqual(wgroup.workers[0].num_nodes, 1)
         self.assertEqual(wgroup.workers[0].max_ranks_per_node, 4)
Example #2
0
    def __init__(self, wf_name, time_limit_minutes, gpus_per_node):
        '''Set up the job source, worker group and bookkeeping for MPI mode.

        wf_name: workflow name assigned to the job source as its filter;
            a falsy value is logged as "no filter".
        time_limit_minutes: passed to remaining_time_minutes() to build
            the launcher's timer.
        gpus_per_node: accepted but not used anywhere in this constructor.
        '''
        source = BalsamJob.source
        self.jobsource = source
        source.workflow = wf_name

        if not wf_name:
            filter_msg = 'No workflow filter'
        else:
            filter_msg = f'Filtering jobs with workflow matching {wf_name}'
        logger.info(filter_msg)

        source.clear_stale_locks()
        source.start_tick()

        self.worker_group = worker.WorkerGroup()
        self.total_nodes = sum(wrk.num_nodes for wrk in self.worker_group)

        # Publish the node count and job mode through the environment.
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "mpi"

        self.timer = remaining_time_minutes(time_limit_minutes)
        self.delayer = delay_generator()
        self.last_report = 0
        self.exit_counter = 0
        self.mpi_runs = []

        # RUN_MESSAGE records the scheduler id when the job source reports
        # a qLaunch object, and a fixed notice otherwise.
        source.check_qLaunch()
        if source.qLaunch is None:
            self.RUN_MESSAGE = 'Not scheduled by service'
        else:
            sched_id = source.qLaunch.scheduler_id
            self.RUN_MESSAGE = f'Batch Scheduler ID: {sched_id}'
Example #3
0
    def __init__(self, wf_name=None, time_limit_minutes=60,
                 gpus_per_node=None, persistent=False,
                 limit_nodes=None, offset_nodes=None):
        '''Prepare the serial-mode launcher and its ensemble command line.

        wf_name: optional workflow name; added as --wf-name when truthy.
        time_limit_minutes: passed to remaining_time_minutes(); the first
            value it yields, minus one and floored at 0.1, becomes the
            --time-limit-min argument.
        gpus_per_node: optional; added as --gpus-per-node when truthy.
        persistent: stored on self.is_persistent; no flag is added to the
            command here.
        limit_nodes, offset_nodes: forwarded to worker.WorkerGroup as
            limit and offset.
        '''
        self.wf_name = wf_name
        self.gpus_per_node = gpus_per_node
        self.is_persistent = persistent

        countdown = remaining_time_minutes(time_limit_minutes)
        minutes_left = max(0.1, next(countdown) - 1)

        self.worker_group = worker.WorkerGroup(
            limit=limit_nodes,
            offset=offset_nodes,
        )
        self.total_nodes = sum(wrk.num_nodes for wrk in self.worker_group)

        # Publish the node count and job mode through the environment.
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "serial"

        # Collect the command pieces, then join them with single spaces.
        pieces = [
            f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}",
            f"--time-limit-min={minutes_left}",
        ]
        if self.wf_name:
            pieces.append(f"--wf-name={self.wf_name}")
        if self.gpus_per_node:
            pieces.append(f"--gpus-per-node={self.gpus_per_node}")
        self.app_cmd = " ".join(pieces)
Example #4
0
    def __init__(self,
                 wf_name=None,
                 time_limit_minutes=60,
                 gpus_per_node=None,
                 persistent=False,
                 limit_nodes=None,
                 offset_nodes=None):
        '''Prepare the serial-mode launcher and its ensemble command line.

        wf_name: optional workflow name; added as --wf-name when truthy.
        time_limit_minutes: passed to remaining_time_minutes(); the first
            value it yields, minus one and floored at 0.1, becomes the
            --time-limit-min argument.
        gpus_per_node: optional; added as --gpus-per-node when truthy.
        persistent: when truthy, --persistent is appended to the command.
        limit_nodes, offset_nodes: forwarded to worker.WorkerGroup as
            limit and offset.

        The master address is the alphabetically first worker hostname,
        paired with the fixed port 19876.
        '''
        self.wf_name = wf_name
        self.gpus_per_node = gpus_per_node
        self.is_persistent = persistent

        countdown = remaining_time_minutes(time_limit_minutes)
        minutes_left = max(0.1, next(countdown) - 1)

        self.worker_group = worker.WorkerGroup(
            limit=limit_nodes,
            offset=offset_nodes,
        )
        worker_count = len(self.worker_group)

        # Indexing [0] raises IndexError if the group has no workers.
        first_host = sorted(wrk.hostname for wrk in self.worker_group)[0]
        port = 19876
        stamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
        log_name = f'serial-ensemble_{stamp}.log'

        self.total_nodes = sum(wrk.num_nodes for wrk in self.worker_group)

        # Publish the node count and job mode through the environment.
        os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
        os.environ['BALSAM_JOB_MODE'] = "serial"

        # Collect the command pieces, then join them with single spaces.
        pieces = [
            f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}",
            f"--time-limit-min={minutes_left}",
            f"--master-address {first_host}:{port}",
            f"--log-filename {log_name}",
            f"--num-workers {worker_count}",
        ]
        if self.wf_name:
            pieces.append(f"--wf-name={self.wf_name}")
        if self.gpus_per_node:
            pieces.append(f"--gpus-per-node={self.gpus_per_node}")
        if self.is_persistent:
            pieces.append("--persistent")
        self.app_cmd = " ".join(pieces)
Example #5
0
 def test_cooley(self):
     '''Build a WorkerGroup from the Cooley scheduler environment.

     Skipped unless the scheduler reports host type 'COOLEY'; otherwise
     the resulting group must contain at least one worker.
     '''
     parsed = get_args(['--consume-all'])
     sched = self.scheduler
     if sched.host_type != 'COOLEY':
         self.skipTest('scheduler did not recognize Cooley environment')

     wgroup = worker.WorkerGroup(
         parsed,
         host_type='COOLEY',
         workers_str=sched.workers_str,
         workers_file=sched.workers_file,
     )
     self.assertGreaterEqual(len(wgroup.workers), 1)
Example #6
0
 def test_cray(self):
     '''Build a WorkerGroup from the Cray scheduler environment.

     Skipped unless the scheduler reports host type 'CRAY'. When the
     scheduler provides a worker string, the number of workers must equal
     the integer stored in the environment variable named by
     SCHEDULER_VARIABLES['num_workers'].
     '''
     parsed = get_args(['--consume-all'])
     sched = self.scheduler
     if sched.host_type != 'CRAY':
         self.skipTest('scheduler did not recognize Cray environment')

     wgroup = worker.WorkerGroup(
         parsed,
         host_type='CRAY',
         workers_str=sched.workers_str,
         workers_file=sched.workers_file,
     )

     # Nothing to compare against without a scheduler worker string.
     if not sched.workers_str:
         return

     env_name = sched.SCHEDULER_VARIABLES['num_workers']
     actual = len(wgroup.workers)
     expected = int(os.environ[env_name])
     self.assertEqual(actual, expected)
Example #7
0
def launcher_info(num_workers=None, max_ranks=None):
    '''Gather launcher configuration objects into one attribute bag.

    num_workers: when truthy and positive, passed on as --num-workers.
    max_ranks: when truthy and positive, passed on as --max-ranks-per-node.

    Returns an object with the attributes parsed_args, host_type,
    workerGroup, scheduler, num_workers and mpi_cmd. The returned
    num_workers comes from the scheduler (falling back to 1), not from
    the num_workers argument.
    '''
    from balsam.service.schedulers import Scheduler
    from balsam.launcher import worker
    from balsam.launcher.launcher import get_args
    from balsam.launcher import mpi_commands

    # Build the argument list directly, one token per element.
    cli = ['--consume-all']
    if num_workers and num_workers > 0:
        cli += ['--num-workers', f'{num_workers}']
    if max_ranks and max_ranks > 0:
        cli += ['--max-ranks-per-node', f'{max_ranks}']
    parsed = get_args(cli)

    sched = Scheduler.scheduler_main
    wgroup = worker.WorkerGroup(
        parsed,
        host_type=sched.host_type,
        workers_str=sched.workers_str,
        workers_file=sched.workers_file,
    )
    host = sched.host_type
    worker_count = sched.num_workers or 1

    # The MPI command class is looked up by name: "<host_type>MPICommand".
    command = getattr(mpi_commands, f"{host}MPICommand")()

    class LaunchInfo:
        pass

    info = LaunchInfo()
    fields = (
        ('parsed_args', parsed),
        ('host_type', host),
        ('workerGroup', wgroup),
        ('scheduler', sched),
        ('num_workers', worker_count),
        ('mpi_cmd', command),
    )
    for attr_name, attr_value in fields:
        setattr(info, attr_name, attr_value)

    return info