def __init__(self, args):
    """Set up the Master: timers, DB job source/status updater, and REP socket."""
    # Idle/polling configuration.
    self.MAX_IDLE_TIME = 120.0
    self.DELAY_PERIOD = 0.2
    self.idle_time = None  # NOTE(review): sibling masters init this to 0.0 — confirm None is intended
    self.EXIT_FLAG = False

    self.num_workers = args.num_workers
    self.active_ids = set()
    self.is_persistent = args.persistent

    # Prime the wall-clock countdown generator so the first send() yields a value.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # Default DB prefetch depth: 96 jobs per worker.
    prefetch = (
        self.num_workers * 96
        if args.db_prefetch_count == 0
        else args.db_prefetch_count
    )

    logger.debug("Master creating source/status updater")
    self.job_source = BalsamJobSource(prefetch, args.wf_name)
    self.status_updater = BalsamDBStatusUpdater()
    self.status_updater.start()
    self.job_source.start()
    logger.debug("source/status updater created")

    # Bind the REP socket that workers will connect to.
    logger.debug("Master ZMQ binding...")
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.REP)
    self.socket.bind(f"tcp://*:{args.master_port}")
    logger.debug("Master ZMQ socket bound.")
def __init__(self, args, hostname):
    """Connect a worker to the master and initialize per-job tracking state."""
    # REQ socket paired with the master's REP endpoint.
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.REQ)
    self.socket.connect(f"tcp://{args.master_address}")

    # Prime the wall-clock countdown generator.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    self.hostname = hostname
    next(self.remaining_timer)

    self.EXIT_FLAG = False
    self.gpus_per_node = args.gpus_per_node
    self.prefetch_count = args.worker_prefetch_count

    config_logging('serial-launcher', filename=args.log_filename, use_buffer=True)

    # Per-job bookkeeping tables.
    self.processes = {}
    self.outfiles = {}
    self.cuteids = {}
    self.start_times = {}
    self.retry_counts = {}
    self.job_specs = {}
    self.runnable_cache = {}
    self.occupancy = 0.0

    # Affinity slots: cores placed at a hyperthread stride across the node.
    stride = SERIAL_HYPERTHREAD_STRIDE
    self.all_affinity = [core * stride for core in range(SERIAL_CORES_PER_NODE)]
    self.used_affinity = []
def __init__(self, wf_name, time_limit_minutes, gpus_per_node):
    """Initialize the MPI-mode launcher: job source, worker group, and timers.

    NOTE(review): gpus_per_node is accepted but not referenced in this body —
    presumably consumed elsewhere; confirm against callers.
    """
    self.jobsource = BalsamJob.source
    self.jobsource.workflow = wf_name
    if wf_name:
        logger.info(f'Filtering jobs with workflow matching {wf_name}')
    else:
        logger.info('No workflow filter')
    self.jobsource.clear_stale_locks()
    self.jobsource.start_tick()

    # Discover workers and publish node count/mode to child processes.
    self.worker_group = worker.WorkerGroup()
    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "mpi"

    self.timer = remaining_time_minutes(time_limit_minutes)
    self.delayer = delay_generator()
    self.last_report = 0
    self.exit_counter = 0
    self.mpi_runs = []

    # Report the batch-scheduler id when this launch came from the service.
    self.jobsource.check_qLaunch()
    qlaunch = self.jobsource.qLaunch
    if qlaunch is None:
        self.RUN_MESSAGE = 'Not scheduled by service'
    else:
        sched_id = qlaunch.scheduler_id
        self.RUN_MESSAGE = f'Batch Scheduler ID: {sched_id}'
def __init__(self, args):
    """Set up a Master: logging, DB job source/status updater, and REP socket."""
    # Idle/polling configuration.
    self.MAX_IDLE_TIME = 120.0
    self.DELAY_PERIOD = 0.2
    self.idle_time = 0.0
    self.EXIT_FLAG = False

    config_logging('serial-launcher', filename=args.log_filename, use_buffer=True)

    # Prime the wall-clock countdown generator.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # Default DB prefetch depth: 96 jobs per worker.
    if args.db_prefetch_count:
        prefetch = args.db_prefetch_count
    else:
        prefetch = args.num_workers * 96

    self.job_source = BalsamJobSource(prefetch, args.wf_name)
    self.status_updater = BalsamDBStatusUpdater()
    self.status_updater.start()
    self.job_source.start()

    # Bind the REP socket that workers will connect to.
    self.context = zmq.Context()
    self.socket = self.context.socket(zmq.REP)
    self.socket.bind(f"tcp://*:{args.master_port}")
def __init__(self, wf_name=None, time_limit_minutes=60, gpus_per_node=None,
             persistent=False, limit_nodes=None, offset_nodes=None):
    """Build the serial-mode worker group and the ensemble launch command."""
    self.wf_name = wf_name
    self.gpus_per_node = gpus_per_node
    self.is_persistent = persistent

    # Give the child ensemble one minute less than our own budget,
    # floored at 0.1 minutes.
    timer = remaining_time_minutes(time_limit_minutes)
    minutes_left = max(0.1, next(timer) - 1)

    self.worker_group = worker.WorkerGroup(limit=limit_nodes, offset=offset_nodes)
    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "serial"

    # Assemble the ensemble command line; optional flags only when set.
    cmd_parts = [
        f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}",
        f" --time-limit-min={minutes_left}",
    ]
    if self.wf_name:
        cmd_parts.append(f" --wf-name={self.wf_name}")
    if self.gpus_per_node:
        cmd_parts.append(f" --gpus-per-node={self.gpus_per_node}")
    self.app_cmd = "".join(cmd_parts)
def __init__(self):
    """Rank-0 master: parse args, broadcast config to ranks, build the manager."""
    # Idle/polling configuration.
    self.MAX_IDLE_TIME = 120.0
    self.DELAY_PERIOD = 0.2
    self.idle_time = 0.0
    self.EXIT_FLAG = False

    args = self.parse_args()
    log_filename = config_logging('serial-launcher')

    # Ship per-rank configuration to the worker ranks.
    comm.bcast(
        {
            "gpus_per_node": args.gpus_per_node,
            "worker_prefetch": args.worker_prefetch_count,
            "log_fname": log_filename,
        },
        root=0,
    )

    # Prime the wall-clock countdown generator.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # Default DB prefetch depth: 128 jobs per non-master rank.
    if args.db_prefetch_count:
        prefetch = args.db_prefetch_count
    else:
        prefetch = (comm.size - 1) * 128

    job_source = BalsamJobSource(prefetch, args.wf_name)
    status_updater = BalsamDBStatusUpdater()
    self.manager = ResourceManager(job_source, status_updater)
def __init__(self, wf_name=None, time_limit_minutes=60, gpus_per_node=None,
             persistent=False, limit_nodes=None, offset_nodes=None):
    """Build the ZMQ serial-ensemble worker group and launch command."""
    self.wf_name = wf_name
    self.gpus_per_node = gpus_per_node
    self.is_persistent = persistent

    # Give the child ensemble one minute less than our own budget,
    # floored at 0.1 minutes.
    timer = remaining_time_minutes(time_limit_minutes)
    minutes_left = max(0.1, next(timer) - 1)

    self.worker_group = worker.WorkerGroup(limit=limit_nodes, offset=offset_nodes)
    num_workers = len(self.worker_group)

    # The master runs on the alphabetically-first worker host.
    hostnames = sorted(w.hostname for w in self.worker_group)
    master_host = hostnames[0]
    master_port = 19876

    # Timestamped log file name for this ensemble run.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
    log_fname = f'serial-ensemble_{timestamp}.log'

    self.total_nodes = sum(w.num_nodes for w in self.worker_group)
    os.environ['BALSAM_LAUNCHER_NODES'] = str(self.total_nodes)
    os.environ['BALSAM_JOB_MODE'] = "serial"

    # Assemble the ensemble command line; optional flags only when set.
    cmd_parts = [
        f"{sys.executable} {self.ZMQ_ENSEMBLE_EXE}",
        f" --time-limit-min={minutes_left}",
        f" --master-address {master_host}:{master_port}",
        f" --log-filename {log_fname}",
        f" --num-workers {num_workers}",
    ]
    if self.wf_name:
        cmd_parts.append(f" --wf-name={self.wf_name}")
    if self.gpus_per_node:
        cmd_parts.append(f" --gpus-per-node={self.gpus_per_node}")
    if self.is_persistent:
        cmd_parts.append(" --persistent")
    self.app_cmd = "".join(cmd_parts)
def __init__(self):
    """Rank-0 master for the MPI ensemble: args, shared job source, manager."""
    # Idle/polling configuration.
    self.MAX_IDLE_TIME = 20.0
    self.DELAY_PERIOD = 1.0
    self.idle_time = 0.0
    self.EXIT_FLAG = False

    args = self.parse_args()
    # Worker ranks need the GPU count before they start pulling jobs.
    comm.bcast(args.gpus_per_node, root=0)

    # Prime the wall-clock countdown generator.
    self.remaining_timer = remaining_time_minutes(args.time_limit_min)
    next(self.remaining_timer)

    # Configure the shared DB job source before handing it to the manager.
    source = BalsamJob.source
    source.workflow = args.wf_name
    source.start_tick()
    source.clear_stale_locks()
    self.manager = ResourceManager(source)

    if source.workflow:
        logger.info(f'MPI Ensemble pulling jobs with WF {args.wf_name}')
    else:
        logger.info('MPI Ensemble consuming jobs matching any WF name')