def get_processes_by_hostname(cluster: Cluster, hostname: str) -> Iterator[ProcessInfo]:
    """Yield the processes of `cluster` on `hostname` that pass `is_valid_process`.

    `cluster.get_processes(hostname=...)` is iterated as pairs; the first
    element of each pair is ignored and only the second (the process) is
    checked and yielded.
    """
    entries = cluster.get_processes(hostname=hostname)
    yield from (candidate for _, candidate in entries if is_valid_process(candidate))
class ClusterHelper:
    """Helper around a `Cluster` rooted at a working directory.

    It starts processes (via `start_process_pool`), registers them in the
    cluster, serializes the cluster to `CLUSTER_FILENAME` inside the working
    directory, and kills the cluster on request.
    """

    def __init__(self, cluster_info: ClusterInfo, workdir: Path):
        """Store the configuration, create `workdir` and build the `Cluster`.

        :param cluster_info: cluster configuration; `monitor_nodes` is read by
            `start_monitoring`.
        :param workdir: working directory; created (with parents) if missing.
        """
        self.cluster_info = cluster_info
        self.workdir = workdir
        self.workdir.mkdir(exist_ok=True, parents=True)
        self.cluster = Cluster(str(self.workdir))

    @property
    def active_nodes(self) -> List[str]:
        """Return the keys of `self.cluster.nodes` as a list."""
        return list(self.cluster.nodes.keys())

    @property
    def processes(self) -> List[Process]:
        """Return the processes of all cluster nodes in a single flat list."""
        processes = []
        for node in self.cluster.nodes.values():
            processes += node.processes
        return processes

    def commit(self):
        """Serialize the cluster into `CLUSTER_FILENAME` inside the working directory."""
        with open(self.workdir / CLUSTER_FILENAME, "w") as f:
            self.cluster.serialize(f)

    def stop(self, use_sigint=False):
        """Kill the cluster and log how long it took.

        :param use_sigint: forwarded as the first argument of `kill_fn`.
        """
        # A monotonic clock keeps the measured duration correct even if the
        # system wall clock is adjusted while the cluster is being killed.
        start = time.monotonic()
        fn = functools.partial(kill_fn, use_sigint)
        self.cluster.kill(fn)
        logging.debug(f"Cluster killed in {time.monotonic() - start} seconds")

    def start_processes(self, processes: List[StartProcessArgs]):
        """Start the given processes and register each of them in the cluster.

        Every entry gets its working directory created and made absolute; an
        entry with a falsy `workdir` falls back to `self.workdir`. A single
        process is started directly, several are started through a
        multiprocessing pool. Each started process is added to the cluster
        under `args.name`, with `args.metadata` passed as keyword arguments.

        :param processes: descriptions of the processes to start; may be empty,
            in which case nothing is started.
        """

        def prepare_workdir(workdir: Path) -> Path:
            # A falsy workdir means "use the helper's own working directory".
            target = workdir if workdir else self.workdir
            target.mkdir(parents=True, exist_ok=True)
            return target.absolute()

        pool_args = [
            dataclasses.replace(args, workdir=prepare_workdir(args.workdir))
            for args in processes
        ]

        logging.debug(f"Starting cluster processes: {pool_args}")
        if not pool_args:
            # Nothing to start: do not spin up a worker pool for an empty list.
            return

        for process in pool_args:
            logging.debug(f"Command: {' '.join(process.args)}")

        if len(pool_args) == 1:
            # Avoid the overhead of a process pool for a single process.
            spawned = [start_process_pool(pool_args[0])]
        else:
            with Pool() as pool:
                spawned = list(pool.map(start_process_pool, pool_args))

        for (process, args) in zip(spawned, pool_args):
            self.cluster.add(process=process, key=args.name, **args.metadata)

    def start_monitoring(self, nodes: List[str], observe_processes=False):
        """Start one monitoring process per distinct node.

        Does nothing when `self.cluster_info.monitor_nodes` is falsy. The
        monitor is `MONITOR_SCRIPT_PATH` run with `python`, writing to the
        path returned by `node_monitoring_trace`. If `get_pyenv_from_env`
        returns an environment, its `bin/activate` script is sourced first;
        otherwise a warning is logged.

        :param nodes: hostnames to monitor; duplicates are removed and the
            result is sorted.
        :param observe_processes: when true, the PIDs of the processes that
            the cluster reports for each node are passed to the monitor
            through `--observe-pids` as a comma-separated list.
        """
        if not self.cluster_info.monitor_nodes:
            return

        init_cmd = []
        pyenv = get_pyenv_from_env()
        if pyenv:
            init_cmd += [f"source {pyenv}/bin/activate"]
        else:
            logging.warning(
                "No Python virtualenv detected. Monitoring will probably not work."
            )

        nodes = sorted(set(nodes))
        workdir = self.workdir / "monitoring"
        processes = []
        for node in nodes:
            args = [
                "python",
                str(MONITOR_SCRIPT_PATH),
                str(node_monitoring_trace(self.workdir, node)),
            ]
            if observe_processes:
                node_processes = self.cluster.get_processes(hostname=node)
                pids = [str(process.pid) for (_, process) in node_processes]
                args += ["--observe-pids", ",".join(pids)]
            process = StartProcessArgs(
                args=args,
                hostname=node,
                name="monitor",
                workdir=workdir,
                init_cmd=init_cmd,
            )
            processes.append(process)
        self.start_processes(processes)