Example 1
def run_client(graphmgr_addr, load):
    saved_cfg = None
    if load is not None:
        try:
            with open(load, 'rb') as cnf:
                saved_cfg = dill.load(cnf)
        except OSError:
            logger.exception(
                "ami-client: problem opening saved graph configuration file:")
            return 1
        except dill.UnpicklingError:
            logger.exception(
                "ami-client: problem parsing saved graph configuration file (%s):",
                load)
            return 1

    queue = mp.Queue()
    list_proc = mp.Process(target=run_main_window,
                           args=(queue, graphmgr_addr, saved_cfg))
    list_proc.start()
    widget_procs = []

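    # dispatch loop: spawn a widget process for each request until the main window signals 'exit'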
    while True:
        window_type, name, topic = queue.get()
        if window_type == 'exit':
            logger.info("received exit signal - exiting!")
            break
        logger.debug("opening new widget: %s %s %s", window_type, name, topic)
        proc = mp.Process(target=run_widget,
                          args=(queue, window_type, name, topic,
                                graphmgr_addr))
        proc.start()
        widget_procs.append(proc)
Example 2
    async def process_messages(self):

        while True:
            topic = await self.broker_sub_sock.recv_string()
            msg = await self.broker_sub_sock.recv_pyobj()

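            # a CreateNode request spawns a dedicated daemon process for that node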
            if isinstance(msg, fcMsgs.CreateNode):
                proc = mp.Process(
                    target=NodeProcess,
                    name=msg.name,
                    args=(msg, self.broker_pub_addr, self.graphmgr_addr, self.checkpoint_sub_addr),
                    kwargs={'library_paths': self.library_paths},
                    daemon=True
                )
                proc.start()
                logger.info("creating process: %s pid: %d", msg.name, proc.pid)
                async with self.lock:
                    self.widget_procs[msg.name] = (msg.node_type, proc)

            elif isinstance(msg, fcMsgs.Profiler):
                if self.profiler is None:
                    self.profiler = mp.Process(target=Profiler,
                                               args=(self.broker_pub_addr, self.graphmgr_addr.profile, msg.name),
                                               daemon=True)
                    self.profiler.start()
                    logger.info("creating process: Profiler pid: %d", self.profiler.pid)

                async with self.lock:
                    self.msgs[topic] = msg

                await self.broker_pub_sock.send_string(topic, zmq.SNDMORE)
                await self.broker_pub_sock.send_pyobj(msg)

            elif isinstance(msg, fcMsgs.DisplayNode):
                await self.forward_message_to_node(topic, msg)

            elif isinstance(msg, fcMsgs.ReloadLibrary):
                await self.forward_message_to_node(topic, msg)

            elif isinstance(msg, fcMsgs.CloseNode):
                await self.forward_message_to_node(topic, msg)

                async with self.lock:
                    if topic in self.widget_procs:
                        _, proc = self.widget_procs[topic]
                        logger.info("deleting process: %s pid: %d", topic, proc.pid)
                        proc.terminate()
                        proc.join()
                        del self.widget_procs[topic]

                    if topic in self.msgs:
                        del self.msgs[topic]

            elif isinstance(msg, fcMsgs.Library):
                self.library_paths.update(msg.paths)
Example 3
    async def monitor_processes(self):

        while True:
            await asyncio.sleep(0.25)

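            # collect the names of any node processes that have died since the last poll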
            dead_procs = []
            for name, ntp in self.widget_procs.items():
                node_type, proc = ntp
                if not proc.is_alive():
                    dead_procs.append(name)

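            # restart each dead process, reusing its last checkpointed state when available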
            async with self.lock:
                for name in dead_procs:
                    typ, proc = self.widget_procs[name]

                    state = {}
                    if name in self.checkpoints:
                        state = self.checkpoints[name].state

                    msg = fcMsgs.CreateNode(name, typ, state)

                    # don't resend the last message for this node (if any)
                    self.msgs.pop(msg.name, None)

                    proc = mp.Process(
                        target=NodeProcess,
                        name=msg.name,
                        args=(msg, self.broker_pub_addr, self.graphmgr_addr, self.checkpoint_sub_addr),
                        kwargs={'library_paths': self.library_paths},
                        daemon=True
                    )
                    proc.start()
                    logger.info("restarting process: %s pid: %d", msg.name, proc.pid)
                    self.widget_procs[msg.name] = (msg.node_type, proc)
Example 4
    def launch_editor_window(self, configure):
        editor_proc = mp.Process(
            name='editor',
            target=run_editor_window,
            args=(self.broker_sub_addr, self.graphmgr_addr,
                  self.checkpoint_pub_addr, self.load, self.prometheus_dir,
                  self.prometheus_port, self.hutch, configure),
            daemon=True)
        editor_proc.start()

        self.editor = editor_proc
Example 5
def start_ami(request, workerjson):
    try:
        from pytest_cov.embed import cleanup_on_sigterm
        cleanup_on_sigterm()
    except ImportError:
        pass

    parser = build_parser()
    args = parser.parse_args([
        "-n", "1", '--headless', '--tcp',
        '%s://%s' % (request.param, workerjson)
    ])

    queue = mp.Queue()
    ami = mp.Process(name='ami', target=run_ami, args=(args, queue))
    ami.start()

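    # connect to the manager's comm port and yield the handler to the test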
    try:
        host = "127.0.0.1"
        comm_addr = "tcp://%s:%d" % (host, BasePort + Ports.Comm)
        with GraphCommHandler(args.graph_name, comm_addr) as comm_handler:
            yield comm_handler
    except Exception as e:
        # let the fixture exit 'gracefully' if it fails
        print(e)
        yield None
    finally:
        queue.put(None)
        ami.join(1)
        # if ami still hasn't exited then kill it
        if ami.is_alive():
            ami.terminate()
            ami.join(1)

        if ami.exitcode == 0 or ami.exitcode == -signal.SIGTERM:
            return 0
        else:
            print('AMI exited with non-zero status code: %d' % ami.exitcode)
            return 1
Example 6
def run_ami(args, queue=None):
    xtcdir = None
    ipcdir = None
    owns_ipcdir = True
    flags = {}
    if queue is None:
        queue = mp.Queue()
    if args.ipc:
        ipcdir = tempfile.mkdtemp()
        owns_ipcdir = True
    elif args.ipc_dir is not None:
        ipcdir = args.ipc_dir
        owns_ipcdir = False
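    # build the endpoint addresses: consecutive TCP ports on localhost, or IPC sockets under ipcdir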
    if ipcdir is None:
        host = "127.0.0.1"
        comm_addr = "tcp://%s:%d" % (host, args.port)
        graph_addr = "tcp://%s:%d" % (host, args.port+1)
        collector_addr = "tcp://%s:%d" % (host, args.port+2)
        globalcol_addr = "tcp://%s:%d" % (host, args.port+3)
        results_addr = "tcp://%s:%d" % (host, args.port+4)
        export_addr = "tcp://%s:%d" % (host, args.port+5)
        msg_addr = "tcp://%s:%d" % (host, args.port+6)
        info_addr = "tcp://%s:%d" % (host, args.port+7)
        view_addr = "tcp://%s:%d" % (host, args.port+8)
        profile_addr = "tcp://%s:%d" % (host, args.port+9)
    else:
        collector_addr = "ipc://%s/node_collector" % ipcdir
        globalcol_addr = "ipc://%s/collector" % ipcdir
        graph_addr = "ipc://%s/graph" % ipcdir
        comm_addr = "ipc://%s/comm" % ipcdir
        results_addr = "ipc://%s/results" % ipcdir
        export_addr = "ipc://%s/export" % ipcdir
        msg_addr = "ipc://%s/message" % ipcdir
        info_addr = "ipc://%s/info" % ipcdir
        view_addr = "ipc://%s/view" % ipcdir
        profile_addr = "ipc://%s/profile" % ipcdir

    procs = []
    client_proc = None

    log_handlers = [logging.StreamHandler()]
    if args.headless or args.console:
        console_fmt = logging.Formatter(LogConfig.BasicFormat)
        log_handlers[0].setFormatter(console_fmt)
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(format=LogConfig.FullFormat, level=log_level, handlers=log_handlers)

    try:
        for flag in args.flags:
            try:
                key, value = flag.split('=')
                flags[key] = value
            except ValueError:
                logger.exception("Problem parsing data source flag %s", flag)

        if args.source is not None:
            src_url_match = re.match('(?P<prot>.*)://(?P<body>.*)', args.source)
            if src_url_match:
                src_cfg = src_url_match.groups()
            else:
                logger.critical("Invalid data source config string: %s", args.source)
                return 1
        else:
            src_cfg = None

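        # spawn one daemon process per requested worker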
        for i in range(args.num_workers):
            proc = mp.Process(
                name='worker%03d-n0' % i,
                target=functools.partial(_sys_exit, run_worker),
                args=(i, args.num_workers, args.heartbeat, src_cfg,
                      collector_addr, graph_addr, msg_addr, export_addr, flags, args.prometheus_dir, args.hutch)
            )
            proc.daemon = True
            proc.start()
            procs.append(proc)

        collector_proc = mp.Process(
            name='nodecol-n0',
            target=functools.partial(_sys_exit, run_node_collector),
            args=(0, args.num_workers, collector_addr, globalcol_addr, graph_addr, msg_addr,
                  args.prometheus_dir, args.hutch)
        )
        collector_proc.daemon = True
        collector_proc.start()
        procs.append(collector_proc)

        globalcol_proc = mp.Process(
            name='globalcol',
            target=functools.partial(_sys_exit, run_global_collector),
            args=(0, 1, globalcol_addr, results_addr, graph_addr, msg_addr,
                  args.prometheus_dir, args.hutch)
        )
        globalcol_proc.daemon = True
        globalcol_proc.start()
        procs.append(globalcol_proc)

        manager_proc = mp.Process(
            name='manager',
            target=functools.partial(_sys_exit, run_manager),
            args=(args.num_workers, 1, results_addr, graph_addr, comm_addr, msg_addr, info_addr, export_addr,
                  view_addr, profile_addr, args.prometheus_dir, args.hutch)
        )
        manager_proc.daemon = True
        manager_proc.start()
        procs.append(manager_proc)

        if args.export:
            if run_export is None:
                logger.critical("Export module is not available: p4p needs to be installed to use the export feature!")
                return 1
            export_proc = mp.Process(
                name='export',
                target=functools.partial(_sys_exit, run_export),
                args=(args.export, comm_addr, export_addr, args.aggregate)
            )
            export_proc.daemon = True
            export_proc.start()
            procs.append(export_proc)

        if not (args.console or args.headless):
            client_proc = mp.Process(
                name='client',
                target=run_client,
                args=(args.graph_name, comm_addr, info_addr, view_addr, profile_addr, args.load, args.gui_mode,
                      args.prometheus_dir, args.hutch)
            )
            client_proc.daemon = False
            client_proc.start()
            procs.append(client_proc)

        # register a signal handler for cleanup on sigterm
        signal.signal(signal.SIGTERM, functools.partial(_sig_handler, procs))

        if args.console:
            run_console(args.graph_name, comm_addr, args.load)
        elif args.headless:
            if args.load:
                comm_handler = GraphCommHandler(args.graph_name, comm_addr)
                comm_handler.load(args.load)
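            # busy-wait until anything is pushed onto the queue, signalling shutdown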
            while queue.empty():
                pass
        else:
            client_proc.join()

    except KeyboardInterrupt:
        logger.info("Worker killed by user...")
    finally:
        failed_proc = cleanup(procs)
        # cleanup ipc directories
        if owns_ipcdir and ipcdir is not None and os.path.exists(ipcdir):
            shutil.rmtree(ipcdir)
        if xtcdir is not None and os.path.exists(xtcdir):
            shutil.rmtree(xtcdir)
        # return a non-zero status code if any workers died
        if client_proc is not None and client_proc.exitcode != 0:
            return client_proc.exitcode
        elif failed_proc:
            return 1
Example 7
def run_ami(args, queue=None):
    flags = {}
    if queue is None:
        queue = mp.Queue()

    host = args.host
    graph_addr = "tcp://%s:%d" % (host, args.port + 1)
    collector_addr = "tcp://127.0.0.1:%d" % (args.port + 2)
    globalcol_addr = "tcp://%s:%d" % (host, args.port + 3)
    export_addr = "tcp://%s:%d" % (host, args.port + 5)
    msg_addr = "tcp://%s:%d" % (host, args.port + 6)

    procs = []

    log_handlers = [logging.StreamHandler()]
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(format=LogConfig.FullFormat,
                        level=log_level,
                        handlers=log_handlers)

    try:
        for flag in args.flags:
            try:
                key, value = flag.split('=')
                flags[key] = value
            except ValueError:
                logger.exception("Problem parsing data source flag %s", flag)

        if args.source is not None:
            src_url_match = re.match('(?P<prot>.*)://(?P<body>.*)',
                                     args.source)
            if src_url_match:
                src_cfg = src_url_match.groups()
            else:
                logger.critical("Invalid data source config string: %s",
                                args.source)
                return 1
        else:
            src_cfg = None

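        # derive the per-node layout from MPI: two ranks are excluded from the worker count,
        # and ranks sharing a node are grouped via a shared-memory communicator split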
        comm = MPI.COMM_WORLD
        size = comm.Get_size() - 2
        global_rank = comm.Get_rank()

        local_comm = comm.Split_type(MPI.COMM_TYPE_SHARED, global_rank,
                                     MPI.INFO_NULL)
        local_rank_size = local_comm.Get_size()
        local_rank = local_comm.Get_rank()
        node_rank = global_rank // local_rank_size

        # name = MPI.Get_processor_name()
        # print(f"SIZE: {size}, RANK: {global_rank}, LOCAL RANK: {local_rank}, NODE RANK: {node_rank} NAME: {name}")

        if local_rank == 0:
            collector_proc = mp.Process(
                name=f'nodecol-n{node_rank}',
                target=functools.partial(_sys_exit, run_node_collector),
                args=(node_rank, local_rank_size, collector_addr,
                      globalcol_addr, graph_addr, msg_addr,
                      args.prometheus_dir, args.hutch))
            collector_proc.daemon = True
            collector_proc.start()
            procs.append(collector_proc)

        run_worker(global_rank, size, args.heartbeat, src_cfg, collector_addr,
                   graph_addr, msg_addr, export_addr, flags,
                   args.prometheus_dir, args.hutch)

        # register a signal handler for cleanup on sigterm
        signal.signal(signal.SIGTERM, functools.partial(_sig_handler, procs))

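        # keep this process alive; shutdown happens via SIGTERM or a keyboard interrupt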
        while True:
            pass

    except KeyboardInterrupt:
        logger.info("Worker killed by user...")
    finally:
        failed_proc = cleanup(procs)
        # return a non-zero status code if any workers died
        if failed_proc:
            return 1
Example 8
def main(color, upstream_port, downstream_port):
    parser = argparse.ArgumentParser(description='AMII Collector App')

    parser.add_argument(
        '-H',
        '--host',
        default=Defaults.Host,
        help='hostname of the AMII Manager (default: %s)' % Defaults.Host
    )

    parser.add_argument(
        '-c',
        '--collector',
        type=int,
        default=upstream_port,
        help='port of the collector (default: %d)' % upstream_port
    )

    parser.add_argument(
        '-d',
        '--downstream',
        type=int,
        default=downstream_port,
        help='port for global collector (default: %d)' % downstream_port
    )

    parser.add_argument(
        '-g',
        '--graph',
        type=int,
        default=Ports.Graph,
        help='port for graph communication (default: %d)' % Ports.Graph
    )

    parser.add_argument(
        '-m',
        '--message',
        type=int,
        default=Ports.Message,
        help='port for sending out-of-band messages from nodes (default: %d)' % Ports.Message
    )

    parser.add_argument(
        '-n',
        '--num-contribs',
        type=int,
        default=1,
        help='number of contributor processes (default: 1)'
    )

    parser.add_argument(
        '-N',
        '--node-num',
        type=int,
        default=0,
        help='node identification number (default: 0)'
    )

    parser.add_argument(
        '--log-level',
        default=LogConfig.Level,
        help='the logging level of the application (default: %s)' % LogConfig.Level
    )

    parser.add_argument(
        '--log-file',
        help='an optional file to write the log output to'
    )

    parser.add_argument(
        '--prometheus-dir',
        help='directory for prometheus configuration',
        default=None
    )

    parser.add_argument(
        '--hutch',
        help='hutch for prometheus label',
        default=None
    )

    subparsers = parser.add_subparsers(help='spawn workers', dest='worker')
    worker_subparser = subparsers.add_parser('worker', help='worker arguments')

    worker_subparser.add_argument(
        'source',
        nargs='?',
        metavar='SOURCE',
        help='data source configuration (examples: static://test.json, random://test.json, psana://exp=xcsdaq13:run=14)'
    )

    worker_subparser.add_argument(
        '-e',
        '--export',
        type=int,
        default=Ports.Export,
        help='port for receiving exported graph results (default: %d)' % Ports.Export
    )

    worker_subparser.add_argument(
        '-b',
        '--heartbeat',
        type=int,
        default=10,
        help='the heartbeat period (default: 10)'
    )

    worker_subparser.add_argument(
        '-f',
        '--flags',
        action='append',
        default=[],
        help='extra flags as key=value pairs that are passed to the data source'
    )

    args = parser.parse_args()

    collector_addr = "tcp://*:%d" % (args.collector)
    downstream_addr = "tcp://%s:%d" % (args.host, args.downstream)
    graph_addr = "tcp://%s:%d" % (args.host, args.graph)
    msg_addr = "tcp://%s:%d" % (args.host, args.message)

    log_handlers = [logging.StreamHandler()]
    if args.log_file is not None:
        log_handlers.append(logging.FileHandler(args.log_file))
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    if args.worker:
        logging.basicConfig(format=LogConfig.FullFormat, level=log_level, handlers=log_handlers)
    else:
        logging.basicConfig(format=LogConfig.Format, level=log_level, handlers=log_handlers)

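    # run as a per-node or global collector depending on the color this process was launched with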
    try:
        if color == Colors.LocalCollector:
            if args.worker:
                local_collector_addr = "tcp://localhost:%d" % args.collector
                export_addr = "tcp://%s:%d" % (args.host, args.export)
                flags, src_cfg = parse_args(args)
                for n in range(0, args.num_contribs):
                    worker = mp.Process(name='worker', target=run_worker,
                                        args=(args.node_num*args.num_contribs+n,
                                              args.num_contribs,
                                              args.heartbeat,
                                              src_cfg,
                                              local_collector_addr,
                                              graph_addr,
                                              msg_addr,
                                              export_addr,
                                              flags,
                                              args.prometheus_dir,
                                              args.hutch),
                                        daemon=True)
                    worker.start()

            return run_node_collector(args.node_num,
                                      args.num_contribs,
                                      collector_addr,
                                      downstream_addr,
                                      graph_addr,
                                      msg_addr,
                                      args.prometheus_dir,
                                      args.hutch)
        elif color == Colors.GlobalCollector:
            return run_global_collector(args.node_num,
                                        args.num_contribs,
                                        collector_addr,
                                        downstream_addr,
                                        graph_addr,
                                        msg_addr,
                                        args.prometheus_dir,
                                        args.hutch)
        else:
            logger.critical("Invalid option collector color '%s' chosen!", color)
            return 1

    except KeyboardInterrupt:
        logger.info("collector killed by user...")
        return 0