def test_default(self):
    '''Create default worker groups with various command line arguments'''
    config = get_args('--consume-all --num-workers 1'.split())
    group = worker.WorkerGroup(config, host_type='DEFAULT', workers_str=None)
    self.assertEqual(len(group.workers), 1)
    self.assertEqual(group.workers[0].num_nodes, 1)
    self.assertEqual(group.workers[0].max_ranks_per_node, 4)

    config = get_args('--consume-all --num-workers 3 --max-ranks-per-node 4'.split())
    group = worker.WorkerGroup(config, host_type='DEFAULT', workers_str=None)
    self.assertEqual(len(group.workers), 3)
    self.assertEqual(group.workers[0].num_nodes, 1)
    self.assertEqual(group.workers[0].max_ranks_per_node, 4)
def __init__(self, wf_name, time_limit_minutes, gpus_per_node):
    # Restrict the job source to the requested workflow, if one was given
    self.jobsource = BalsamJob.source
    self.jobsource.workflow = wf_name
    if wf_name:
        logger.info(f'Filtering jobs with workflow matching {wf_name}')
    else:
        logger.info('No workflow filter')

    self.jobsource.clear_stale_locks()
    self.jobsource.start_tick()

    # Discover workers and export node count / job mode for child processes
    self.worker_group = worker.WorkerGroup()
    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "mpi"

    # Countdown timer and throttling generator used by the main launcher loop
    self.timer = remaining_time_minutes(time_limit_minutes)
    self.delayer = delay_generator()
    self.last_report = 0
    self.exit_counter = 0
    self.mpi_runs = []

    # Record the batch scheduler ID if this launcher was started by the service
    self.jobsource.check_qLaunch()
    if self.jobsource.qLaunch is not None:
        sched_id = self.jobsource.qLaunch.scheduler_id
        self.RUN_MESSAGE = f'Batch Scheduler ID: {sched_id}'
    else:
        self.RUN_MESSAGE = 'Not scheduled by service'
def __init__(self, wf_name=None, time_limit_minutes=60, gpus_per_node=None,
             persistent=False, limit_nodes=None, offset_nodes=None):
    self.wf_name = wf_name
    self.gpus_per_node = gpus_per_node
    self.is_persistent = persistent

    # Reserve one minute of headroom against the remaining wall-clock time
    timer = remaining_time_minutes(time_limit_minutes)
    minutes_left = max(0.1, next(timer) - 1)

    # Discover workers and export node count / job mode for child processes
    self.worker_group = worker.WorkerGroup(limit=limit_nodes, offset=offset_nodes)
    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "serial"

    # Assemble the command line for the serial (ZMQ ensemble) application
    self.app_cmd = f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}"
    self.app_cmd += f" --time-limit-min={minutes_left}"
    if self.wf_name:
        self.app_cmd += f" --wf-name={self.wf_name}"
    if self.gpus_per_node:
        self.app_cmd += f" --gpus-per-node={self.gpus_per_node}"
def __init__(self, wf_name=None, time_limit_minutes=60, gpus_per_node=None,
             persistent=False, limit_nodes=None, offset_nodes=None):
    self.wf_name = wf_name
    self.gpus_per_node = gpus_per_node
    self.is_persistent = persistent

    # Reserve one minute of headroom against the remaining wall-clock time
    timer = remaining_time_minutes(time_limit_minutes)
    minutes_left = max(0.1, next(timer) - 1)

    self.worker_group = worker.WorkerGroup(limit=limit_nodes, offset=offset_nodes)
    num_workers = len(self.worker_group)

    # The first hostname (sorted order) hosts the ZMQ master
    hostnames = sorted([w.hostname for w in self.worker_group])
    master_host = hostnames[0]
    master_port = 19876
    timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
    log_fname = f'serial-ensemble_{timestamp}.log'

    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "serial"

    # Assemble the command line for the serial (ZMQ ensemble) application
    self.app_cmd = f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}"
    self.app_cmd += f" --time-limit-min={minutes_left}"
    self.app_cmd += f" --master-address {master_host}:{master_port}"
    self.app_cmd += f" --log-filename {log_fname}"
    self.app_cmd += f" --num-workers {num_workers}"
    if self.wf_name:
        self.app_cmd += f" --wf-name={self.wf_name}"
    if self.gpus_per_node:
        self.app_cmd += f" --gpus-per-node={self.gpus_per_node}"
    if self.is_persistent:
        self.app_cmd += " --persistent"
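# Illustrative sketch only: assuming ZMQ_ENSEMBLE_EXE resolves to the serial
# ensemble script and the allocation spans two nodes, the assembled app_cmd
# resembles the line below (paths, hostnames, and the workflow name are
# hypothetical placeholders, not values taken from this module):
#
#   /usr/bin/python3 /path/to/zmq_ensemble.py --time-limit-min=59.0 \
#       --master-address nid00001:19876 \
#       --log-filename serial-ensemble_2024-01-01_120000.log \
#       --num-workers 2 --wf-name my_workflow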
def test_cooley(self):
    '''Construct WorkerGroup from reading Cooley environment'''
    config = get_args('--consume-all'.split())
    if self.scheduler.host_type != 'COOLEY':
        self.skipTest('scheduler did not recognize Cooley environment')
    group = worker.WorkerGroup(config, host_type='COOLEY',
                               workers_str=self.scheduler.workers_str,
                               workers_file=self.scheduler.workers_file)
    self.assertGreaterEqual(len(group.workers), 1)
def test_cray(self):
    '''Construct WorkerGroup from reading Cray environment'''
    config = get_args('--consume-all'.split())
    if self.scheduler.host_type != 'CRAY':
        self.skipTest('scheduler did not recognize Cray environment')
    group = worker.WorkerGroup(config, host_type='CRAY',
                               workers_str=self.scheduler.workers_str,
                               workers_file=self.scheduler.workers_file)
    if self.scheduler.workers_str:
        num_worker_env = self.scheduler.SCHEDULER_VARIABLES['num_workers']
        self.assertEqual(len(group.workers), int(os.environ[num_worker_env]))
def launcher_info(num_workers=None, max_ranks=None):
    from balsam.service.schedulers import Scheduler
    from balsam.launcher import worker
    from balsam.launcher.launcher import get_args
    from balsam.launcher import mpi_commands

    # Build launcher arguments from the optional worker/rank limits
    args = '--consume-all '
    if num_workers and num_workers > 0:
        args += f'--num-workers {num_workers} '
    if max_ranks and max_ranks > 0:
        args += f'--max-ranks-per-node {max_ranks} '
    config = get_args(args.split())

    # Detect the active scheduler and build the matching worker group
    scheduler = Scheduler.scheduler_main
    group = worker.WorkerGroup(config, host_type=scheduler.host_type,
                               workers_str=scheduler.workers_str,
                               workers_file=scheduler.workers_file)
    host_type = scheduler.host_type
    num_workers = scheduler.num_workers or 1

    # Pick the MPI command wrapper that corresponds to the detected host type
    mpi_cmd_class = getattr(mpi_commands, f"{host_type}MPICommand")
    mpi_cmd = mpi_cmd_class()

    # Bundle everything into a simple namespace object for tests to consume
    class LaunchInfo:
        pass

    info = LaunchInfo()
    info.parsed_args = config
    info.host_type = host_type
    info.workerGroup = group
    info.scheduler = scheduler
    info.num_workers = num_workers
    info.mpi_cmd = mpi_cmd
    return info
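# Minimal usage sketch (hypothetical test code, not from this module; assumes
# a configured Balsam environment in which the active scheduler is detectable):
#
#   info = launcher_info(num_workers=2, max_ranks=8)
#   print(info.host_type, len(info.workerGroup.workers), info.num_workers)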