def run_client(graphmgr_addr, load):
    """Run the client main window, spawning one child process per widget request.

    Parameters
    ----------
    graphmgr_addr : address of the graph manager the windows talk to.
    load : optional path to a dill-pickled saved graph configuration, or None.

    Returns
    -------
    1 on failure to read/parse the saved configuration, otherwise None after
    the main window signals exit.
    """
    saved_cfg = None
    if load is not None:
        try:
            with open(load, 'rb') as cnf:
                saved_cfg = dill.load(cnf)
        except OSError:
            logger.exception(
                "ami-client: problem opening saved graph configuration file:")
            return 1
        except dill.UnpicklingError:
            logger.exception(
                "ami-client: problem parsing saved graph configuration file (%s):", load)
            return 1

    queue = mp.Queue()
    list_proc = mp.Process(target=run_main_window, args=(queue, graphmgr_addr, saved_cfg))
    list_proc.start()
    widget_procs = []

    while True:
        window_type, name, topic = queue.get()
        if window_type == 'exit':
            logger.info("received exit signal - exiting!")
            break
        logger.debug("opening new widget: %s %s %s", window_type, name, topic)
        proc = mp.Process(target=run_widget, args=(queue, window_type, name, topic, graphmgr_addr))
        proc.start()
        widget_procs.append(proc)

    # BUG FIX: previously the child processes were abandoned on exit, leaking
    # orphan widget processes. Terminate any still-running widgets and reap
    # all children before returning.
    for proc in widget_procs:
        if proc.is_alive():
            proc.terminate()
        proc.join()
    list_proc.join()
async def process_messages(self):
    """Dispatch broker messages forever: spawn node/profiler processes,
    forward display/reload/close messages, and reap closed node processes.

    Reads two-frame (topic string, pyobj message) pairs from
    ``self.broker_sub_sock`` and acts on the message type.
    """
    while True:
        topic = await self.broker_sub_sock.recv_string()
        msg = await self.broker_sub_sock.recv_pyobj()

        if isinstance(msg, fcMsgs.CreateNode):
            proc = mp.Process(
                target=NodeProcess,
                name=msg.name,
                args=(msg, self.broker_pub_addr, self.graphmgr_addr, self.checkpoint_sub_addr),
                kwargs={'library_paths': self.library_paths},
                daemon=True
            )
            proc.start()
            logger.info("creating process: %s pid: %d", msg.name, proc.pid)
            async with self.lock:
                self.widget_procs[msg.name] = (msg.node_type, proc)

        elif isinstance(msg, fcMsgs.Profiler):
            # only one profiler process is ever created
            if self.profiler is None:
                self.profiler = mp.Process(target=Profiler,
                                           args=(self.broker_pub_addr,
                                                 self.graphmgr_addr.profile,
                                                 msg.name),
                                           daemon=True)
                self.profiler.start()
                logger.info("creating process: Profiler pid: %d", self.profiler.pid)
            # cache the last message for this topic and republish it
            async with self.lock:
                self.msgs[topic] = msg
            await self.broker_pub_sock.send_string(topic, zmq.SNDMORE)
            await self.broker_pub_sock.send_pyobj(msg)

        elif isinstance(msg, fcMsgs.DisplayNode):
            await self.forward_message_to_node(topic, msg)

        elif isinstance(msg, fcMsgs.ReloadLibrary):
            await self.forward_message_to_node(topic, msg)

        elif isinstance(msg, fcMsgs.CloseNode):
            await self.forward_message_to_node(topic, msg)
            async with self.lock:
                if topic in self.widget_procs:
                    # BUG FIX: unpack the process for THIS topic before
                    # logging its pid — previously `proc` was read before
                    # assignment, logging a stale pid from an earlier
                    # CreateNode iteration (or raising NameError).
                    _, proc = self.widget_procs[topic]
                    logger.info("deleting process: %s pid: %d", topic, proc.pid)
                    proc.terminate()
                    proc.join()
                    del self.widget_procs[topic]
                if topic in self.msgs:
                    del self.msgs[topic]

        elif isinstance(msg, fcMsgs.Library):
            self.library_paths.update(msg.paths)
async def monitor_processes(self):
    """Poll node processes every 250 ms and restart any that have died.

    Restarted nodes are re-created with the last checkpointed state for
    that name (empty state if no checkpoint exists).
    """
    while True:
        await asyncio.sleep(0.25)

        # collect names of dead processes first; restart under the lock
        dead_procs = []
        for name, ntp in self.widget_procs.items():
            node_type, proc = ntp
            if not proc.is_alive():
                dead_procs.append(name)

        async with self.lock:
            for name in dead_procs:
                typ, proc = self.widget_procs[name]
                state = {}
                if name in self.checkpoints:
                    state = self.checkpoints[name].state
                msg = fcMsgs.CreateNode(name, typ, state)
                # don't resend last message
                # BUG FIX: pop() instead of del — there may be no cached
                # message for this name, and del would raise KeyError.
                self.msgs.pop(msg.name, None)
                proc = mp.Process(
                    target=NodeProcess,
                    name=msg.name,
                    args=(msg, self.broker_pub_addr, self.graphmgr_addr, self.checkpoint_sub_addr),
                    kwargs={'library_paths': self.library_paths},
                    daemon=True
                )
                proc.start()
                logger.info("restarting process: %s pid: %d", msg.name, proc.pid)
                self.widget_procs[msg.name] = (msg.node_type, proc)
def launch_editor_window(self, configure):
    """Spawn the flowchart editor window in a daemonized child process
    and keep a handle to it on ``self.editor``."""
    editor_args = (
        self.broker_sub_addr,
        self.graphmgr_addr,
        self.checkpoint_pub_addr,
        self.load,
        self.prometheus_dir,
        self.prometheus_port,
        self.hutch,
        configure,
    )
    proc = mp.Process(name='editor',
                      target=run_editor_window,
                      args=editor_args,
                      daemon=True)
    proc.start()
    self.editor = proc
def start_ami(request, workerjson):
    """Generator fixture: launch a headless AMI instance in a child process,
    yield a connected GraphCommHandler, then shut the instance down.

    Yields None if the comm handler cannot be set up; after the yield it
    signals AMI to stop via the queue and force-kills it if needed.
    """
    try:
        from pytest_cov.embed import cleanup_on_sigterm
        cleanup_on_sigterm()
    except ImportError:
        # coverage cleanup hook is optional
        pass

    parser = build_parser()
    cli = ["-n", "1", '--headless', '--tcp', '%s://%s' % (request.param, workerjson)]
    args = parser.parse_args(cli)

    queue = mp.Queue()
    ami = mp.Process(name='ami', target=run_ami, args=(args, queue))
    ami.start()

    try:
        host = "127.0.0.1"
        comm_addr = "tcp://%s:%d" % (host, BasePort + Ports.Comm)
        with GraphCommHandler(args.graph_name, comm_addr) as comm_handler:
            yield comm_handler
    except Exception as e:
        # let the fixture exit 'gracefully' if it fails
        print(e)
        yield None
    finally:
        # ask AMI to stop, give it a second, then kill it if still alive
        queue.put(None)
        ami.join(1)
        if ami.is_alive():
            ami.terminate()
            ami.join(1)

        if ami.exitcode in (0, -signal.SIGTERM):
            return 0
        print('AMI exited with non-zero status code: %d' % ami.exitcode)
        return 1
def run_ami(args, queue=None):
    """Launch the full single-node AMI system: workers, collectors, manager,
    optional export service, and optionally a GUI client or console.

    Parameters
    ----------
    args : parsed command-line namespace (ports, worker count, flags, ...).
    queue : optional mp.Queue used in headless mode as a shutdown signal —
            any item put on it wakes the main loop and triggers teardown.

    Returns
    -------
    1 (or the client exit code) on failure, otherwise None.
    """
    xtcdir = None
    ipcdir = None
    owns_ipcdir = True
    flags = {}

    if queue is None:
        queue = mp.Queue()

    if args.ipc:
        ipcdir = tempfile.mkdtemp()
        owns_ipcdir = True
    elif args.ipc_dir is not None:
        ipcdir = args.ipc_dir
        owns_ipcdir = False

    if ipcdir is None:
        # tcp transport: consecutive ports starting at args.port
        host = "127.0.0.1"
        comm_addr = "tcp://%s:%d" % (host, args.port)
        graph_addr = "tcp://%s:%d" % (host, args.port+1)
        collector_addr = "tcp://%s:%d" % (host, args.port+2)
        globalcol_addr = "tcp://%s:%d" % (host, args.port+3)
        results_addr = "tcp://%s:%d" % (host, args.port+4)
        export_addr = "tcp://%s:%d" % (host, args.port+5)
        msg_addr = "tcp://%s:%d" % (host, args.port+6)
        info_addr = "tcp://%s:%d" % (host, args.port+7)
        view_addr = "tcp://%s:%d" % (host, args.port+8)
        profile_addr = "tcp://%s:%d" % (host, args.port+9)
    else:
        # ipc transport: named sockets inside ipcdir
        collector_addr = "ipc://%s/node_collector" % ipcdir
        globalcol_addr = "ipc://%s/collector" % ipcdir
        graph_addr = "ipc://%s/graph" % ipcdir
        comm_addr = "ipc://%s/comm" % ipcdir
        results_addr = "ipc://%s/results" % ipcdir
        export_addr = "ipc://%s/export" % ipcdir
        msg_addr = "ipc://%s/message" % ipcdir
        info_addr = "ipc://%s/info" % ipcdir
        view_addr = "ipc://%s/view" % ipcdir
        profile_addr = "ipc://%s/profile" % ipcdir

    procs = []
    client_proc = None

    log_handlers = [logging.StreamHandler()]
    if args.headless or args.console:
        console_fmt = logging.Formatter(LogConfig.BasicFormat)
        log_handlers[0].setFormatter(console_fmt)
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(format=LogConfig.FullFormat, level=log_level, handlers=log_handlers)

    try:
        # parse key=value data source flags
        for flag in args.flags:
            try:
                key, value = flag.split('=')
                flags[key] = value
            except ValueError:
                logger.exception("Problem parsing data source flag %s", flag)

        if args.source is not None:
            src_url_match = re.match('(?P<prot>.*)://(?P<body>.*)', args.source)
            if src_url_match:
                src_cfg = src_url_match.groups()
            else:
                logger.critical("Invalid data source config string: %s", args.source)
                return 1
        else:
            src_cfg = None

        for i in range(args.num_workers):
            proc = mp.Process(
                name='worker%03d-n0' % i,
                target=functools.partial(_sys_exit, run_worker),
                args=(i, args.num_workers, args.heartbeat, src_cfg, collector_addr,
                      graph_addr, msg_addr, export_addr, flags,
                      args.prometheus_dir, args.hutch)
            )
            proc.daemon = True
            proc.start()
            procs.append(proc)

        collector_proc = mp.Process(
            name='nodecol-n0',
            target=functools.partial(_sys_exit, run_node_collector),
            args=(0, args.num_workers, collector_addr, globalcol_addr, graph_addr,
                  msg_addr, args.prometheus_dir, args.hutch)
        )
        collector_proc.daemon = True
        collector_proc.start()
        procs.append(collector_proc)

        globalcol_proc = mp.Process(
            name='globalcol',
            target=functools.partial(_sys_exit, run_global_collector),
            args=(0, 1, globalcol_addr, results_addr, graph_addr, msg_addr,
                  args.prometheus_dir, args.hutch)
        )
        globalcol_proc.daemon = True
        globalcol_proc.start()
        procs.append(globalcol_proc)

        manager_proc = mp.Process(
            name='manager',
            target=functools.partial(_sys_exit, run_manager),
            args=(args.num_workers, 1, results_addr, graph_addr, comm_addr, msg_addr,
                  info_addr, export_addr, view_addr, profile_addr,
                  args.prometheus_dir, args.hutch)
        )
        manager_proc.daemon = True
        manager_proc.start()
        procs.append(manager_proc)

        if args.export:
            if run_export is None:
                logger.critical("Export module is not available: p4p needs to be installed to use the export feature!")
                return 1
            export_proc = mp.Process(
                name='export',
                target=functools.partial(_sys_exit, run_export),
                args=(args.export, comm_addr, export_addr, args.aggregate)
            )
            export_proc.daemon = True
            export_proc.start()
            procs.append(export_proc)

        if not (args.console or args.headless):
            client_proc = mp.Process(
                name='client',
                target=run_client,
                args=(args.graph_name, comm_addr, info_addr, view_addr, profile_addr,
                      args.load, args.gui_mode, args.prometheus_dir, args.hutch)
            )
            client_proc.daemon = False
            client_proc.start()
            procs.append(client_proc)

        # register a signal handler for cleanup on sigterm
        signal.signal(signal.SIGTERM, functools.partial(_sig_handler, procs))

        if args.console:
            run_console(args.graph_name, comm_addr, args.load)
        elif args.headless:
            if args.load:
                comm_handler = GraphCommHandler(args.graph_name, comm_addr)
                comm_handler.load(args.load)
            # BUG FIX: previously `while queue.empty(): pass` busy-spun at
            # 100% CPU. A blocking get() wakes on the same condition (the
            # shutdown sentinel being put on the queue).
            queue.get()
        else:
            client_proc.join()
    except KeyboardInterrupt:
        logger.info("Worker killed by user...")
    finally:
        failed_proc = cleanup(procs)
        # cleanup ipc directories (only if we created them)
        if owns_ipcdir and ipcdir is not None and os.path.exists(ipcdir):
            shutil.rmtree(ipcdir)
        if xtcdir is not None and os.path.exists(xtcdir):
            shutil.rmtree(xtcdir)
        # return a non-zero status code if any workers died
        if client_proc is not None and client_proc.exitcode != 0:
            return client_proc.exitcode
        elif failed_proc:
            return 1
def run_ami(args, queue=None):
    """MPI variant of run_ami: each rank runs one worker; the first rank on
    each shared-memory node also spawns a local node-collector process.

    Parameters
    ----------
    args : parsed command-line namespace (host, port, flags, ...).
    queue : optional mp.Queue; created if not supplied (currently unused
            beyond creation — kept for signature compatibility).

    Returns
    -------
    1 if any spawned process failed, otherwise None.
    """
    flags = {}
    if queue is None:
        queue = mp.Queue()

    host = args.host
    graph_addr = "tcp://%s:%d" % (host, args.port + 1)
    # node collector binds locally; workers on this node connect to it
    collector_addr = "tcp://127.0.0.1:%d" % (args.port + 2)
    globalcol_addr = "tcp://%s:%d" % (host, args.port + 3)
    export_addr = "tcp://%s:%d" % (host, args.port + 5)
    msg_addr = "tcp://%s:%d" % (host, args.port + 6)

    procs = []

    log_handlers = [logging.StreamHandler()]
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(format=LogConfig.FullFormat, level=log_level, handlers=log_handlers)

    try:
        # parse key=value data source flags
        for flag in args.flags:
            try:
                key, value = flag.split('=')
                flags[key] = value
            except ValueError:
                logger.exception("Problem parsing data source flag %s", flag)

        if args.source is not None:
            src_url_match = re.match('(?P<prot>.*)://(?P<body>.*)', args.source)
            if src_url_match:
                src_cfg = src_url_match.groups()
            else:
                logger.critical("Invalid data source config string: %s", args.source)
                return 1
        else:
            src_cfg = None

        comm = MPI.COMM_WORLD
        # NOTE(review): two ranks appear to be reserved for non-worker roles
        # (hence the -2) — confirm against the launcher.
        size = comm.Get_size() - 2
        global_rank = comm.Get_rank()
        local_comm = comm.Split_type(MPI.COMM_TYPE_SHARED, global_rank, MPI.INFO_NULL)
        local_rank_size = local_comm.Get_size()
        local_rank = local_comm.Get_rank()
        node_rank = global_rank // local_rank_size

        if local_rank == 0:
            # first rank on each node hosts the node-level collector
            collector_proc = mp.Process(
                name=f'nodecol-n{node_rank}',
                target=functools.partial(_sys_exit, run_node_collector),
                args=(node_rank, local_rank_size, collector_addr, globalcol_addr,
                      graph_addr, msg_addr, args.prometheus_dir, args.hutch))
            collector_proc.daemon = True
            collector_proc.start()
            procs.append(collector_proc)

        run_worker(global_rank, size, args.heartbeat, src_cfg, collector_addr,
                   graph_addr, msg_addr, export_addr, flags,
                   args.prometheus_dir, args.hutch)

        # register a signal handler for cleanup on sigterm
        signal.signal(signal.SIGTERM, functools.partial(_sig_handler, procs))

        # BUG FIX: previously `while True: pass` busy-spun at 100% CPU while
        # idling; sleeping keeps the process alive at negligible cost and is
        # still interruptible by SIGTERM/KeyboardInterrupt.
        import time
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        logger.info("Worker killed by user...")
    finally:
        failed_proc = cleanup(procs)
        # return a non-zero status code if any workers died
        if failed_proc:
            return 1
def main(color, upstream_port, downstream_port):
    """CLI entry point for a collector process.

    Depending on ``color`` this runs either a node-level collector (optionally
    spawning worker subprocesses when the ``worker`` subcommand is given) or a
    global collector. Returns the collector's exit status, 1 on bad color,
    0 on user interrupt.
    """
    ap = argparse.ArgumentParser(description='AMII Collector App')

    ap.add_argument('-H', '--host', default=Defaults.Host,
                    help='hostname of the AMII Manager (default: %s)' % Defaults.Host)
    ap.add_argument('-c', '--collector', type=int, default=upstream_port,
                    help='port of the collector (default: %d)' % upstream_port)
    ap.add_argument('-d', '--downstream', type=int, default=downstream_port,
                    help='port for global collector (default: %d)' % downstream_port)
    ap.add_argument('-g', '--graph', type=int, default=Ports.Graph,
                    help='port for graph communication (default: %d)' % Ports.Graph)
    ap.add_argument('-m', '--message', type=int, default=Ports.Message,
                    help='port for sending out-of-band messages from nodes (default: %d)' % Ports.Message)
    ap.add_argument('-n', '--num-contribs', type=int, default=1,
                    help='number of contributer processes (default: 1)')
    ap.add_argument('-N', '--node-num', type=int, default=0,
                    help='node identification number (default: 0)')
    ap.add_argument('--log-level', default=LogConfig.Level,
                    help='the logging level of the application (default %s)' % LogConfig.Level)
    ap.add_argument('--log-file',
                    help='an optional file to write the log output to')
    ap.add_argument('--prometheus-dir',
                    help='directory for prometheus configuration', default=None)
    ap.add_argument('--hutch',
                    help='hutch for prometheus label', default=None)

    # optional 'worker' subcommand: also spawn worker processes locally
    subparsers = ap.add_subparsers(help='spawn workers', dest='worker')
    wp = subparsers.add_parser('worker', help='worker arguments')
    wp.add_argument('source', nargs='?', metavar='SOURCE',
                    help='data source configuration (exampes: static://test.json, random://test.json, psana://exp=xcsdaq13:run=14)')
    wp.add_argument('-e', '--export', type=int, default=Ports.Export,
                    help='port for receiving exported graph results (default: %d)' % Ports.Export)
    wp.add_argument('-b', '--heartbeat', type=int, default=10,
                    help='the heartbeat period (default: 10)')
    wp.add_argument('-f', '--flags', action='append', default=[],
                    help='extra flags as key=value pairs that are passed to the data source')

    args = ap.parse_args()

    collector_addr = "tcp://*:%d" % (args.collector)
    downstream_addr = "tcp://%s:%d" % (args.host, args.downstream)
    graph_addr = "tcp://%s:%d" % (args.host, args.graph)
    msg_addr = "tcp://%s:%d" % (args.host, args.message)

    log_handlers = [logging.StreamHandler()]
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    # worker mode logs with the full format, plain collector with the short one
    log_fmt = LogConfig.FullFormat if args.worker else LogConfig.Format
    logging.basicConfig(format=log_fmt, level=log_level, handlers=log_handlers)

    try:
        if color == Colors.LocalCollector:
            if args.worker:
                local_collector_addr = "tcp://localhost:%d" % args.collector
                export_addr = "tcp://%s:%d" % (args.host, args.export)
                flags, src_cfg = parse_args(args)
                for idx in range(args.num_contribs):
                    worker_proc = mp.Process(
                        name='worker',
                        target=run_worker,
                        args=(args.node_num * args.num_contribs + idx,
                              args.num_contribs,
                              args.heartbeat,
                              src_cfg,
                              local_collector_addr,
                              graph_addr,
                              msg_addr,
                              export_addr,
                              flags,
                              args.prometheus_dir,
                              args.hutch),
                        daemon=True)
                    worker_proc.start()
            return run_node_collector(args.node_num, args.num_contribs,
                                      collector_addr, downstream_addr,
                                      graph_addr, msg_addr,
                                      args.prometheus_dir, args.hutch)
        elif color == Colors.GlobalCollector:
            return run_global_collector(args.node_num, args.num_contribs,
                                        collector_addr, downstream_addr,
                                        graph_addr, msg_addr,
                                        args.prometheus_dir, args.hutch)
        else:
            logger.critical("Invalid option collector color '%s' chosen!", color)
            return 1
    except KeyboardInterrupt:
        logger.info("collector killed by user...")
        return 0