# Standard-library imports these functions need. LOG and the helper functions
# (get_yaml_var, get_batch_type, get_node_count, check_machines,
# get_running_queues, launch_celery_workers, verify_args) are assumed to be
# provided by the surrounding merlin modules.
import logging
import os
import socket
import subprocess
import time
from contextlib import suppress
from typing import Dict, Optional, Union

LOG = logging.getLogger(__name__)


def construct_worker_launch_command(batch: Optional[Dict], btype: str, nodes: int) -> str:
    """
    If no 'worker_launch' is found in the batch yaml, this method
    constructs the needed launch command.

    :param batch: (Optional[Dict]) An optional batch override from the worker config
    :param btype: (str) The type of batch (flux, local, lsf, slurm)
    :param nodes: (int) The number of nodes to use in the batch launch
    """
    launch_command: str = ""

    workload_manager: str = get_batch_type()
    bank: str = get_yaml_var(batch, "bank", "")
    queue: str = get_yaml_var(batch, "queue", "")
    walltime: str = get_yaml_var(batch, "walltime", "")

    if btype == "slurm" or workload_manager == "slurm":
        launch_command = f"srun -N {nodes} -n {nodes}"
        if bank:
            launch_command += f" -A {bank}"
        if queue:
            launch_command += f" -p {queue}"
        if walltime:
            launch_command += f" -t {walltime}"

    if workload_manager == "lsf":
        # The jsrun utility does not have a time argument
        launch_command = f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}"

    return launch_command
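
# Illustrative usage of construct_worker_launch_command (a sketch; the
# override dict and its values are hypothetical, and the expected string
# assumes get_batch_type() does not report lsf, which would replace the
# srun line with a jsrun line):
def _example_construct_launch():
    batch_override = {"bank": "guests", "queue": "pbatch", "walltime": "01:00:00"}
    cmd = construct_worker_launch_command(batch_override, "slurm", 4)
    # Expected: "srun -N 4 -n 4 -A guests -p pbatch -t 01:00:00"
    return cmd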
def examine_and_log_machines(worker_val, yenv) -> bool:
    """
    Examines whether a worker should be skipped in a step of
    start_celery_workers(), and logs errors about the output path
    for a celery worker.
    """
    worker_machines = get_yaml_var(worker_val, "machines", None)
    if worker_machines:
        # Interpolate the result into the message; passing it as a logging
        # arg with no placeholder would break %-formatting.
        LOG.debug(f"check machines = {check_machines(worker_machines)}")
        if not check_machines(worker_machines):
            return True

        if yenv:
            output_path = get_yaml_var(yenv, "OUTPUT_PATH", None)
            if output_path and not os.path.exists(output_path):
                hostname = socket.gethostname()
                LOG.error(
                    f"The output path, {output_path}, is not accessible on this host, {hostname}"
                )
        else:
            LOG.warning(
                "The env:variables section does not have an OUTPUT_PATH "
                "specified, multi-machine checks cannot be performed."
            )

    return False
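
# Illustrative call (a sketch; the hostnames and path are hypothetical, and
# the result depends on whether check_machines() matches the current host):
def _example_examine_machines():
    worker_val = {"machines": ["hostA", "hostB"]}
    yenv = {"OUTPUT_PATH": "/path/to/study/output"}
    # True means start_celery_workers() should skip this worker because the
    # current host is not in the machines list.
    return examine_and_log_machines(worker_val, yenv)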
def batch_check_parallel(spec):
    """
    Check for a parallel batch section in the yaml file.
    """
    parallel = False

    try:
        batch = spec.batch
    except AttributeError:
        LOG.error("The batch section is required in the specification file.")
        raise

    btype = get_yaml_var(batch, "type", "local")
    if btype != "local":
        parallel = True

    return parallel
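
# Illustrative behavior (a sketch using a hypothetical stand-in for the spec
# object; any batch type other than the default "local" counts as parallel):
def _example_check_parallel():
    from types import SimpleNamespace

    assert batch_check_parallel(SimpleNamespace(batch={"type": "slurm"})) is True
    assert batch_check_parallel(SimpleNamespace(batch={"type": "local"})) is False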
def batch_worker_launch(spec, com, nodes=None, batch=None):
    """
    The configuration in the batch section of the merlin spec
    is used to create the worker launch line, which may be
    different from a simulation launch.

    spec (MerlinSpec): The workflow specification
    com (str): The command to launch with batch configuration
    nodes (int): The number of nodes to use in the batch launch
    batch (dict): An optional batch override from the worker config
    """
    if batch is None:
        try:
            batch = spec.batch
        except AttributeError:
            LOG.error("The batch section is required in the specification file.")
            raise

    btype = get_yaml_var(batch, "type", "local")

    # A jsrun submission cannot be run under a parent jsrun so
    # all non flux lsf submissions need to be local.
    if btype == "local" or "lsf" in btype:
        return com

    if nodes is None:
        # Use the value in the batch section
        nodes = get_yaml_var(batch, "nodes", None)

    # Get the number of nodes from the environment if unset
    if nodes is None or nodes == "all":
        nodes = get_node_count(default=1)

    bank = get_yaml_var(batch, "bank", "")
    queue = get_yaml_var(batch, "queue", "")
    shell = get_yaml_var(batch, "shell", "bash")
    walltime = get_yaml_var(batch, "walltime", "")
    launch_pre = get_yaml_var(batch, "launch_pre", "")
    launch_args = get_yaml_var(batch, "launch_args", "")
    worker_launch = get_yaml_var(batch, "worker_launch", "")

    launcher = get_batch_type()

    launchs = worker_launch
    if not launchs:
        if btype == "slurm" or launcher == "slurm":
            launchs = f"srun --mpi=none -N {nodes} -n {nodes}"
            if bank:
                launchs += f" -A {bank}"
            if queue:
                launchs += f" -p {queue}"
            if walltime:
                launchs += f" -t {walltime}"
        if launcher == "lsf":
            # The jsrun utility does not have a time argument
            launchs = f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}"
        launchs += f" {launch_args}"

    # Allow for any pre launch manipulation, e.g. module load
    # hwloc/1.11.10-cuda
    if launch_pre:
        launchs = f"{launch_pre} {launchs}"

    worker_cmd = f"{launchs} {com}"

    if btype == "flux":
        flux_path = get_yaml_var(batch, "flux_path", "")
        flux_opts = get_yaml_var(batch, "flux_start_opts", "")
        flux_exec_workers = get_yaml_var(batch, "flux_exec_workers", True)

        flux_exec = ""
        if flux_exec_workers:
            flux_exec = "flux exec"

        if "/" in flux_path:
            flux_path += "/"

        flux_exe = os.path.join(flux_path, "flux")

        launch = (
            f"{launchs} {flux_exe} start {flux_opts} {flux_exec} `which {shell}` -c"
        )
        worker_cmd = f'{launch} "{com}"'

    return worker_cmd
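
# Shape of the flux-wrapped command built above (a sketch with hypothetical
# values; `com` stands for the celery worker command and flux_path was empty,
# so flux_exe resolves to plain "flux"):
#
#   srun --mpi=none -N 2 -n 2 flux start  flux exec `which bash` -c "celery ... worker ..."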
def start_celery_workers(spec, steps, celery_args, just_return_command):
    """
    Start the celery workers on the allocation

    specs       Tuple of (YAMLSpecification, MerlinSpec)
    ...

    example config:

    merlin:
      resources:
        task_server: celery
        overlap: False
        workers:
          simworkers:
            args: -O fair --prefetch-multiplier 1 -E -l info --concurrency 4
            steps: [run, data]
            nodes: 1
            machines: [hostA, hostB]
    """
    if not just_return_command:
        LOG.info("Starting workers")

    overlap = spec.merlin["resources"]["overlap"]
    workers = spec.merlin["resources"]["workers"]

    senv = spec.environment
    spenv = os.environ.copy()
    yenv = None
    if senv:
        yenv = get_yaml_var(senv, "variables", {})
        for k, v in yenv.items():
            spenv[str(k)] = str(v)
            # For expandvars
            os.environ[str(k)] = str(v)

    worker_list = []
    local_queues = []

    for worker_name, worker_val in workers.items():
        worker_machines = get_yaml_var(worker_val, "machines", None)
        if worker_machines:
            LOG.debug(f"check machines = {check_machines(worker_machines)}")
            if not check_machines(worker_machines):
                continue

            if yenv:
                output_path = get_yaml_var(yenv, "OUTPUT_PATH", None)
                if output_path and not os.path.exists(output_path):
                    hostname = socket.gethostname()
                    LOG.error(
                        f"The output path, {output_path}, is not accessible on this host, {hostname}"
                    )
            else:
                LOG.warning(
                    "The env:variables section does not have an OUTPUT_PATH "
                    "specified, multi-machine checks cannot be performed."
                )

        worker_args = get_yaml_var(worker_val, "args", celery_args)
        with suppress(KeyError):
            if worker_val["args"] is None:
                worker_args = ""

        worker_nodes = get_yaml_var(worker_val, "nodes", None)
        worker_batch = get_yaml_var(worker_val, "batch", None)

        wsteps = get_yaml_var(worker_val, "steps", steps)
        queues = spec.make_queue_string(wsteps).split(",")

        # Check for missing arguments
        parallel = batch_check_parallel(spec)
        if parallel:
            if "--concurrency" not in worker_args:
                LOG.warning(
                    "The worker arg --concurrency [1-4] is recommended "
                    "when running parallel tasks"
                )
            if "--prefetch-multiplier" not in worker_args:
                LOG.warning(
                    "The worker arg --prefetch-multiplier 1 is "
                    "recommended when running parallel tasks"
                )
            if "fair" not in worker_args:
                LOG.warning(
                    "The worker arg -O fair is recommended when running "
                    "parallel tasks"
                )

        if "-n" not in worker_args:
            nhash = ""
            if overlap:
                nhash = time.strftime("%Y%m%d-%H%M%S")
            # TODO: Once flux fixes their bug, change this back to %h
            worker_args += f" -n {worker_name}{nhash}.%%h"

        if "-l" not in worker_args:
            worker_args += f" -l {logging.getLevelName(LOG.getEffectiveLevel())}"

        # Add a per worker log file (debug)
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Redirecting worker output to individual log files")
            worker_args += " --logfile %p.%i"

        # Get the celery command
        celery_com = launch_celery_workers(
            spec, steps=wsteps, worker_args=worker_args, just_return_command=True
        )
        celery_cmd = os.path.expandvars(celery_com)

        worker_cmd = batch_worker_launch(
            spec, celery_cmd, nodes=worker_nodes, batch=worker_batch
        )
        worker_cmd = os.path.expandvars(worker_cmd)

        try:
            kwargs = {"env": spenv, "shell": True, "universal_newlines": True}
            # These cannot be used with a detached process
            # "stdout": subprocess.PIPE,
            # "stderr": subprocess.PIPE,

            LOG.debug(f"worker cmd={worker_cmd}")
            LOG.debug(f"env={spenv}")

            found = []
            running_queues = []
            if not just_return_command and not overlap:
                running_queues.extend(get_running_queues())
                running_queues.extend(local_queues)
            for q in queues:
                if q in running_queues:
                    found.append(q)
            if found:
                LOG.warning(
                    f"A celery worker named '{worker_name}' is already "
                    f"configured/running for queue(s) = {' '.join(found)}"
                )
                continue

            # Cache the queues from this worker to use to test
            # for existing queues in any subsequent workers.
            # If overlap is True, then do not check the local queues.
            # This will allow multiple workers to pull from the same
            # queue.
            if not overlap:
                local_queues.extend(queues)

            if just_return_command:
                worker_list = ""
                print(worker_cmd)
                continue

            _ = subprocess.Popen(worker_cmd, **kwargs)
            worker_list.append(worker_cmd)
        except Exception as e:
            LOG.error(f"Cannot start celery workers, {e}")
            raise

    # Return a string with the worker commands for logging
    return str(worker_list)
def batch_worker_launch(
    spec: Dict,
    com: str,
    nodes: Optional[Union[str, int]] = None,
    batch: Optional[Dict] = None,
) -> str:
    """
    The configuration in the batch section of the merlin spec
    is used to create the worker launch line, which may be
    different from a simulation launch.

    :param spec: (Dict) The workflow specification
    :param com: (str) The command to launch with batch configuration
    :param nodes: (Optional[Union[str, int]]) The number of nodes to use in the batch launch
    :param batch: (Optional[Dict]) An optional batch override from the worker config
    """
    if batch is None:
        try:
            batch = spec.batch
        except AttributeError:
            LOG.error("The batch section is required in the specification file.")
            raise

    btype: str = get_yaml_var(batch, "type", "local")

    # A jsrun submission cannot be run under a parent jsrun so
    # all non flux lsf submissions need to be local.
    if btype == "local" or "lsf" in btype:
        return com

    if nodes is None:
        # Use the value in the batch section
        nodes = get_yaml_var(batch, "nodes", None)

    # Get the number of nodes from the environment if unset
    if nodes is None or nodes == "all":
        nodes = get_node_count(default=1)
    elif not isinstance(nodes, int):
        raise TypeError(
            "Nodes was passed into batch_worker_launch with an invalid type "
            "(likely a string other than 'all')."
        )

    shell: str = get_yaml_var(batch, "shell", "bash")
    launch_pre: str = get_yaml_var(batch, "launch_pre", "")
    launch_args: str = get_yaml_var(batch, "launch_args", "")
    launch_command: str = get_yaml_var(batch, "worker_launch", "")

    if not launch_command:
        launch_command = construct_worker_launch_command(batch, btype, nodes)

    launch_command += f" {launch_args}"

    # Allow for any pre launch manipulation, e.g. module load
    # hwloc/1.11.10-cuda
    if launch_pre:
        launch_command = f"{launch_pre} {launch_command}"

    worker_cmd: str = ""
    if btype == "flux":
        flux_path: str = get_yaml_var(batch, "flux_path", "")
        flux_opts: Union[str, Dict] = get_yaml_var(batch, "flux_start_opts", "")
        flux_exec_workers: Union[str, Dict, bool] = get_yaml_var(batch, "flux_exec_workers", True)

        flux_exec: str = ""
        if flux_exec_workers:
            flux_exec = "flux exec"

        if "/" in flux_path:
            flux_path += "/"

        flux_exe: str = os.path.join(flux_path, "flux")

        launch: str = (
            f"{launch_command} {flux_exe} start {flux_opts} {flux_exec} `which {shell}` -c"
        )
        worker_cmd = f'{launch} "{com}"'
    else:
        worker_cmd = f"{launch_command} {com}"

    return worker_cmd
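
# Illustrative usage of the refactored batch_worker_launch (a sketch; the
# override values are hypothetical, and the expected shape assumes a slurm
# workload manager). Note that nodes="all" falls back to get_node_count(),
# while any other string now raises TypeError:
def _example_batch_worker_launch(spec):
    worker_batch = {"type": "slurm", "bank": "guests"}
    # Expected shape (modulo spacing): "srun -N 2 -n 2 -A guests celery -A merlin worker ..."
    return batch_worker_launch(spec, "celery -A merlin worker ...", nodes=2, batch=worker_batch)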
def start_celery_workers(spec, steps, celery_args, just_return_command):
    """Start the celery workers on the allocation

    specs       Tuple of (YAMLSpecification, MerlinSpec)
    ...

    example config:

    merlin:
      resources:
        task_server: celery
        overlap: False
        workers:
          simworkers:
            args: -O fair --prefetch-multiplier 1 -E -l info --concurrency 4
            steps: [run, data]
            nodes: 1
            machines: [hostA, hostB]
    """
    if not just_return_command:
        LOG.info("Starting workers")

    overlap = spec.merlin["resources"]["overlap"]
    workers = spec.merlin["resources"]["workers"]

    senv = spec.environment
    spenv = os.environ.copy()
    yenv = None
    if senv:
        yenv = get_yaml_var(senv, "variables", {})
        for k, v in yenv.items():
            spenv[str(k)] = str(v)
            # For expandvars
            os.environ[str(k)] = str(v)

    worker_list = []
    local_queues = []

    for worker_name, worker_val in workers.items():
        skip_loop_step: bool = examine_and_log_machines(worker_val, yenv)
        if skip_loop_step:
            continue

        worker_args = get_yaml_var(worker_val, "args", celery_args)
        with suppress(KeyError):
            if worker_val["args"] is None:
                worker_args = ""

        worker_nodes = get_yaml_var(worker_val, "nodes", None)
        worker_batch = get_yaml_var(worker_val, "batch", None)

        wsteps = get_yaml_var(worker_val, "steps", steps)
        queues = spec.make_queue_string(wsteps).split(",")

        # Check for missing arguments
        verify_args(spec, worker_args, worker_name, overlap)

        # Add a per worker log file (debug)
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Redirecting worker output to individual log files")
            worker_args += " --logfile %p.%i"

        # Get the celery command
        celery_com = launch_celery_workers(
            spec, steps=wsteps, worker_args=worker_args, just_return_command=True
        )
        celery_cmd = os.path.expandvars(celery_com)

        worker_cmd = batch_worker_launch(
            spec, celery_cmd, nodes=worker_nodes, batch=worker_batch
        )
        worker_cmd = os.path.expandvars(worker_cmd)

        try:
            kwargs = {"env": spenv, "shell": True, "universal_newlines": True}
            # These cannot be used with a detached process
            # "stdout": subprocess.PIPE,
            # "stderr": subprocess.PIPE,

            LOG.debug(f"worker cmd={worker_cmd}")
            LOG.debug(f"env={spenv}")

            if just_return_command:
                worker_list = ""
                print(worker_cmd)
                continue

            found = []
            running_queues = []
            running_queues.extend(local_queues)
            if not overlap:
                running_queues.extend(get_running_queues())
                # Cache the queues from this worker to use to test
                # for existing queues in any subsequent workers.
                # If overlap is True, then do not check the local queues.
                # This will allow multiple workers to pull from the same
                # queue.
                local_queues.extend(queues)

            for q in queues:
                if q in running_queues:
                    found.append(q)
            if found:
                LOG.warning(
                    f"A celery worker named '{worker_name}' is already "
                    f"configured/running for queue(s) = {' '.join(found)}"
                )
                continue

            _ = subprocess.Popen(worker_cmd, **kwargs)
            worker_list.append(worker_cmd)
        except Exception as e:
            LOG.error(f"Cannot start celery workers, {e}")
            raise

    # Return a string with the worker commands for logging
    return str(worker_list)
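
# verify_args is called above but not defined in this section. A minimal
# sketch, assuming it simply carries over the parallel-argument warnings that
# the earlier version of start_celery_workers inlined; its real signature and
# behavior may differ (worker_name and overlap presumably feed the worker
# naming logic, which is omitted here):
def _sketch_verify_args(spec, worker_args, worker_name, overlap):
    if batch_check_parallel(spec):
        if "--concurrency" not in worker_args:
            LOG.warning("The worker arg --concurrency [1-4] is recommended when running parallel tasks")
        if "--prefetch-multiplier" not in worker_args:
            LOG.warning("The worker arg --prefetch-multiplier 1 is recommended when running parallel tasks")
        if "fair" not in worker_args:
            LOG.warning("The worker arg -O fair is recommended when running parallel tasks")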