Beispiel #1
0
    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide host, the host name handed to LaunchMON for the
          attach; when it is missing or empty the local FQDN is used.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args (a list, which
          is appended to [launcher] to form the launcher's argv).

        Returns True on success. If an LMONException is raised, the error and
        a traceback are printed and False is returned.

        """
        # Apply the configured environment to this process; the same pairs
        # are also registered with LaunchMON for the daemons below.
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session,
                                       gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional: .get() lets an omitted host fall back to
                # this machine instead of raising KeyError.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session, host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args, None,
                                                None)
            else:
                launcher = kwargs["launcher"]
                launcher_argv = [launcher] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                launcher,
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args, None,
                                                None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # getProctable returns a pair; only the table itself is kept.
            self.proctab, _ = self.lmon.getProctable(
                self.lmon_session, self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True
Beispiel #2
0
    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide host, the host name handed to LaunchMON for the
          attach; when it is missing or empty the local FQDN is used.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args (a list, which
          is appended to [launcher] to form the launcher's argv).

        Returns True on success. If an LMONException is raised, the error and
        a traceback are printed and False is returned.

        """
        # Apply the configured environment to this process; the same pairs
        # are also registered with LaunchMON for the daemons below.
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session, gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional: .get() lets an omitted host fall back to
                # this machine instead of raising KeyError.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                                host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            else:
                launcher = kwargs["launcher"]
                launcher_argv = [launcher] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                launcher,
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # getProctable returns a pair; only the table itself is kept.
            self.proctab, _ = self.lmon.getProctable(self.lmon_session, self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True
Beispiel #3
0
class CommunicatorFE(Communicator):
    """Communicator for the front-end.

    Deploys the back-end daemons through LaunchMON (init_lmon), builds and
    starts the MRNet network from a generated topology file (init_mrnet), and
    keeps a map from MPI ranks to MRNet ranks.

    """

    def __init__(self, locking=False):
        """Initialize the base Communicator, forwarding locking to it."""
        Communicator.__init__(self, locking)

    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide host, the host name handed to LaunchMON for the
          attach; when it is missing or empty the local FQDN is used.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args (a list, which
          is appended to [launcher] to form the launcher's argv).

        Returns True on success. If an LMONException is raised, the error and
        a traceback are printed and False is returned.

        """
        # Apply the configured environment to this process; the same pairs
        # are also registered with LaunchMON for the daemons below.
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session, gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional: .get() lets an omitted host fall back to
                # this machine instead of raising KeyError.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                                host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            else:
                launcher = kwargs["launcher"]
                launcher_argv = [launcher] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                launcher,
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # getProctable returns a pair; only the table itself is kept.
            self.proctab, _ = self.lmon.getProctable(self.lmon_session, self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True

    def _construct_mrnet_topology(self, comm_nodes=None):
        """Construct the topology to be used for MRNet.

        comm_nodes is a list of nodes to deploy comm nodes on. If none, the
        nodes are co-located on the same hosts as debuggers. The list is
        copied before use, so the caller's list is left untouched.

        The topology is written to a per-process file whose path is stored in
        self.mrnet_topo_path. The process exits with status 1 if comm_nodes
        has too few entries or if this host appears among the comm nodes.

        """
        branch_factor = gdbconf.mrnet_branch_factor
        # The distinct hosts LMON reported in the process table.
        lmon_hosts = list(set(proc.pd.host_name for proc in self.proctab))
        # Floor division plus one: never fewer than ceil(hosts / branch_factor),
        # and one more than that when the division is exact.
        num_nodes = (len(lmon_hosts) // branch_factor) + 1
        if comm_nodes:
            if len(comm_nodes) < num_nodes:
                print("Not enough comm nodes: {0} < {1} (branch factor = {2})!".format(len(comm_nodes), num_nodes, branch_factor))
                sys.exit(1)
            # Copy: the list is consumed below while the tree is laid out.
            host_list = list(comm_nodes)
        else:
            # We need to allocate comm nodes from among the back-end LMON hosts, so pick as many as needed.
            host_list = lmon_hosts[:num_nodes]
        cur_host = socket.gethostname()
        if cur_host in host_list:
            print("Cannot have the front-end on the same machine as a back-end daemon.")
            sys.exit(1)
        cur_parents = [cur_host]
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path, os.getpid())
        fmt = "{0}:0"
        with open(self.mrnet_topo_path, "w+") as topo_file:
            # Lay the hosts out level by level: each parent on the current
            # level takes up to branch_factor children from the front of the
            # list, and those children become the parents of the next level.
            while host_list:
                new_parents = []
                for parent in cur_parents:
                    children = host_list[:branch_factor]
                    del host_list[:branch_factor]
                    if children:
                        new_parents += children
                        topo_file.write(fmt.format(parent) + " => " +
                                        " ".join(fmt.format(child) for child in children) + " ;\n")
                cur_parents = new_parents

    def _construct_local_node_topology(self):
        """Construct a topology for MRNet that just uses the local node.

        Writes a single "host:0 => host:1" line to a per-process file and
        stores its path in self.mrnet_topo_path.

        """
        cur_host = socket.gethostname()
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path, os.getpid())
        with open(self.mrnet_topo_path, "w+") as topo_file:
            topo_file.write(cur_host + ":0 => " + cur_host + ":1 ;\n")

    def _assign_mrnet_leaves(self):
        """Assign debugger processes to MRNet leaves.

        For each leaf in the MRNet topology, assign up to the branching factor
        in debuggers for communication purposes.

        Returns a list with one NodeInfo per LMON host, in assignment order.

        """
        topology = self.mrnet.get_NetworkTopology()
        # Note: This assumes that leaves gives us a list.
        leaves = list(topology.get_Leaves())
        num_nodes = topology.get_NumNodes() + 1 # Add 1 to make sure we're good.
        node_info = []
        local_rank = self.mrnet.get_LocalRank()
        leaf_idx = 0
        # be_rank is assigned to be greater than all the existing nodes.
        for i in range(len(self.get_lmon_hosts())):
            leaf = leaves[leaf_idx]
            # Check for root, since get_Parent fails on it.
            if leaf.get_Rank() == local_rank:
                node_info.append(NodeInfo(local_rank, leaf.get_HostName(),
                                          leaf.get_Port(), -1, num_nodes + i))
            else:
                node_info.append(NodeInfo(leaf.get_Rank(), leaf.get_HostName(),
                                          leaf.get_Port(), leaf.get_Parent(), num_nodes + i))
            if i % gdbconf.mrnet_branch_factor == (gdbconf.mrnet_branch_factor - 1):
                # Move to the next leaf after we've given this one mrnet_branch_factor children.
                leaf_idx += 1
        return node_info

    def _send_mrnet_topology(self):
        """Send the MRNet topology to the back-end daemons.

        On success, records the number of entries sent as
        self.mrnet_network_size and returns True. If LaunchMON raises an
        LMONException, prints the error and a traceback and returns False.

        """
        node_info = self._assign_mrnet_leaves()
        try:
            self.lmon.sendUsrDataBe(self.lmon_session, node_info)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.mrnet_network_size = len(node_info)
        return True

    def _mrnet_node_joined_cb(self):
        """An MRNet callback invoked whenever a back-end node joins."""
        self.node_joins += 1

    def _mrnet_node_removed_cb(self):
        """An MRNet callback invoked whenever a back-end node leaves."""
        self.node_exits += 1

    def _wait_for_nodes(self):
        """Wait for all MRNet nodes to join the network.

        Polls self.node_joins, which the join callback increments, until it
        equals self.mrnet_network_size.

        """
        import time
        while self.node_joins != self.mrnet_network_size:
            # Sleep briefly between polls rather than spinning on the CPU.
            time.sleep(0.01)

    def _init_mrnet_streams(self):
        """Initialize basic MRNet streams.

        Creates the broadcast stream using the first loaded filter.
        NOTE(review): not called from within this class; presumably invoked
        by the base class during shared MRNet setup -- confirm.

        """
        self.broadcast_communicator = self.mrnet.get_BroadcastCommunicator()
        self.mrnet_broadcast_stream = self.mrnet.new_Stream(self.broadcast_communicator,
                                                            self.filter_ids[0],
                                                            MRN.SFILTER_WAITFORALL,
                                                            MRN.TFILTER_NULL)
        self.mrnet_frontend_stream = None # Not used here.

    def _send_mrnet_hello(self):
        """Send the HELLO message across MRNet."""
        self.send(GDBMessage(HELLO_MSG), self.broadcast)

    def _init_mrnet_rank_map(self):
        """Initialize the mappings from MPI ranks to MRNet ranks.

        Hosts are matched by fully-qualified name; a process whose host has
        no MRNet endpoint raises KeyError.

        """
        self.mpirank_to_mrnrank_map = {}
        hostname_to_mrnrank = {}
        mrnet_endpoints = self.broadcast_communicator.get_EndPoints()
        for endpoint in mrnet_endpoints:
            hostname_to_mrnrank[socket.getfqdn(endpoint.get_HostName())] = endpoint.get_Rank()
        for proc in self.get_proctab():
            self.mpirank_to_mrnrank_map[proc.mpirank] = hostname_to_mrnrank[socket.getfqdn(proc.pd.host_name)]

    def _load_mrnet_filters(self):
        """Load MRNet filters.

        Loads every (path, function) pair in gdbconf.mrnet_filters and stores
        the returned ids, in order, in self.filter_ids. Exits the process
        with status 1 if a filter file is missing or fails to load.

        """
        self.filter_ids = []
        for filter_path, filter_func in gdbconf.mrnet_filters:
            if not os.path.isfile(filter_path):
                print("Cannot find filter {0}!".format(filter_path))
                sys.exit(1)
            # Ensure the file actually still exists.
            try:
                with open(filter_path):
                    ret_filter_id = self.mrnet.load_FilterFunc(filter_path, filter_func)
                    if ret_filter_id == -1:
                        print("Failed to load filter {0}:{1}!".format(filter_path, filter_func))
                        sys.exit(1)
                    self.filter_ids.append(ret_filter_id)
            except IOError:
                print("Filter {0} disappeared!".format(filter_path))
                sys.exit(1)

    def init_mrnet(self, local=False):
        """Initialize MRNet.

        local is True to use a topology containing only this node, False to
        build one for the cluster.

        Returns True on success, or False if the topology could not be sent
        to the back-end daemons.

        """
        if local:
            self._construct_local_node_topology()
        else:
            self._construct_mrnet_topology()
        self.mrnet = MRN.Network.CreateNetworkFE(self.mrnet_topo_path)
        # Counters maintained by the topology callbacks registered below.
        self.node_joins = 0
        self.node_exits = 0
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_ADD_BE,
                                          self._mrnet_node_joined_cb)
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_REMOVE_NODE,
                                          self._mrnet_node_removed_cb)
        self._load_mrnet_filters()
        ret = self._send_mrnet_topology()
        if not ret:
            return False
        self._wait_for_nodes()
        self._init_shared_mrnet()
        self._enable_mrnet_perf_data()
        self._init_mrnet_rank_map()
        self._send_mrnet_hello()
        if gdbconf.mrnet_topology_dot:
            # Optionally dump the final topology as a DOT graph.
            topo = self.mrnet.get_NetworkTopology()
            topo.print_DOTGraph(gdbconf.mrnet_topology_dot)
        return True

    def shutdown(self):
        """Shut down the communication infrastructure.

        Returns True on success. If shutting down the LaunchMON daemons
        raises an LMONException, prints the error and a traceback and returns
        False without setting self.been_shutdown.

        """
        self._disable_mrnet_perf_data()
        self._log_mrnet_perf_data()
        # Shut this stream down.
        #del self.mrnet_broadcast_stream
        del self.mrnet
        try:
            self.lmon.shutdownDaemons(self.lmon_session)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.been_shutdown = True
        return True

    def mpirank_to_mrnrank(self, rank):
        """Convert an MPI rank to an MRNet rank. Only works on front-end."""
        return self.mpirank_to_mrnrank_map[rank]

    def get_mrnet_network_size(self):
        """Return the size of the MRNet network."""
        return self.mrnet_network_size

    def get_exit_count(self):
        """Return the number of MRNet nodes that have exited."""
        return self.node_exits

    def all_nodes_exited(self):
        """Return whether all nodes have exited.

        This compares the exit count with the join count, so it is also True
        while both are still zero.

        """
        return self.node_exits == self.node_joins
Beispiel #4
0
class CommunicatorFE(Communicator):
    """Communicator for the front-end.

    Deploys the back-end daemons through LaunchMON (init_lmon), builds and
    starts the MRNet network from a generated topology file (init_mrnet), and
    keeps a map from MPI ranks to MRNet ranks.

    """
    def __init__(self, locking=False):
        """Initialize the base Communicator, forwarding locking to it."""
        Communicator.__init__(self, locking)

    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide host, the host name handed to LaunchMON for the
          attach; when it is missing or empty the local FQDN is used.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args (a list, which
          is appended to [launcher] to form the launcher's argv).

        Returns True on success. If an LMONException is raised, the error and
        a traceback are printed and False is returned.

        """
        # Apply the configured environment to this process; the same pairs
        # are also registered with LaunchMON for the daemons below.
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session,
                                       gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional: .get() lets an omitted host fall back to
                # this machine instead of raising KeyError.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session, host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args, None,
                                                None)
            else:
                launcher = kwargs["launcher"]
                launcher_argv = [launcher] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                launcher,
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args, None,
                                                None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # getProctable returns a pair; only the table itself is kept.
            self.proctab, _ = self.lmon.getProctable(
                self.lmon_session, self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True

    def _construct_mrnet_topology(self, comm_nodes=None):
        """Construct the topology to be used for MRNet.

        comm_nodes is a list of nodes to deploy comm nodes on. If none, the
        nodes are co-located on the same hosts as debuggers. The list is
        copied before use, so the caller's list is left untouched.

        The topology is written to a per-process file whose path is stored in
        self.mrnet_topo_path. The process exits with status 1 if comm_nodes
        has too few entries or if this host appears among the comm nodes.

        """
        branch_factor = gdbconf.mrnet_branch_factor
        # The distinct hosts LMON reported in the process table.
        lmon_hosts = list(set(proc.pd.host_name for proc in self.proctab))
        # Floor division plus one: never fewer than ceil(hosts / branch_factor),
        # and one more than that when the division is exact.
        num_nodes = (len(lmon_hosts) // branch_factor) + 1
        if comm_nodes:
            if len(comm_nodes) < num_nodes:
                print("Not enough comm nodes: {0} < {1} (branch factor = {2})!".format(
                    len(comm_nodes), num_nodes, branch_factor))
                sys.exit(1)
            # Copy: the list is consumed below while the tree is laid out.
            host_list = list(comm_nodes)
        else:
            # We need to allocate comm nodes from among the back-end LMON hosts, so pick as many as needed.
            host_list = lmon_hosts[:num_nodes]
        cur_host = socket.gethostname()
        if cur_host in host_list:
            print("Cannot have the front-end on the same machine as a back-end daemon.")
            sys.exit(1)
        cur_parents = [cur_host]
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        fmt = "{0}:0"
        with open(self.mrnet_topo_path, "w+") as topo_file:
            # Lay the hosts out level by level: each parent on the current
            # level takes up to branch_factor children from the front of the
            # list, and those children become the parents of the next level.
            while host_list:
                new_parents = []
                for parent in cur_parents:
                    children = host_list[:branch_factor]
                    del host_list[:branch_factor]
                    if children:
                        new_parents += children
                        topo_file.write(
                            fmt.format(parent) + " => " +
                            " ".join(fmt.format(child) for child in children) +
                            " ;\n")
                cur_parents = new_parents

    def _construct_local_node_topology(self):
        """Construct a topology for MRNet that just uses the local node.

        Writes a single "host:0 => host:1" line to a per-process file and
        stores its path in self.mrnet_topo_path.

        """
        cur_host = socket.gethostname()
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        with open(self.mrnet_topo_path, "w+") as topo_file:
            topo_file.write(cur_host + ":0 => " + cur_host + ":1 ;\n")

    def _assign_mrnet_leaves(self):
        """Assign debugger processes to MRNet leaves.

        For each leaf in the MRNet topology, assign up to the branching factor
        in debuggers for communication purposes.

        Returns a list with one NodeInfo per LMON host, in assignment order.

        """
        topology = self.mrnet.get_NetworkTopology()
        # Note: This assumes that leaves gives us a list.
        leaves = list(topology.get_Leaves())
        # Add 1 to make sure we're good.
        num_nodes = topology.get_NumNodes() + 1
        node_info = []
        local_rank = self.mrnet.get_LocalRank()
        leaf_idx = 0
        # be_rank is assigned to be greater than all the existing nodes.
        for i in range(len(self.get_lmon_hosts())):
            leaf = leaves[leaf_idx]
            # Check for root, since get_Parent fails on it.
            if leaf.get_Rank() == local_rank:
                node_info.append(
                    NodeInfo(local_rank, leaf.get_HostName(), leaf.get_Port(),
                             -1, num_nodes + i))
            else:
                node_info.append(
                    NodeInfo(leaf.get_Rank(), leaf.get_HostName(),
                             leaf.get_Port(), leaf.get_Parent(),
                             num_nodes + i))
            if i % gdbconf.mrnet_branch_factor == (
                    gdbconf.mrnet_branch_factor - 1):
                # Move to the next leaf after we've given this one mrnet_branch_factor children.
                leaf_idx += 1
        return node_info

    def _send_mrnet_topology(self):
        """Send the MRNet topology to the back-end daemons.

        On success, records the number of entries sent as
        self.mrnet_network_size and returns True. If LaunchMON raises an
        LMONException, prints the error and a traceback and returns False.

        """
        node_info = self._assign_mrnet_leaves()
        try:
            self.lmon.sendUsrDataBe(self.lmon_session, node_info)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.mrnet_network_size = len(node_info)
        return True

    def _mrnet_node_joined_cb(self):
        """An MRNet callback invoked whenever a back-end node joins."""
        self.node_joins += 1

    def _mrnet_node_removed_cb(self):
        """An MRNet callback invoked whenever a back-end node leaves."""
        self.node_exits += 1

    def _wait_for_nodes(self):
        """Wait for all MRNet nodes to join the network.

        Polls self.node_joins, which the join callback increments, until it
        equals self.mrnet_network_size.

        """
        import time
        while self.node_joins != self.mrnet_network_size:
            # Sleep briefly between polls rather than spinning on the CPU.
            time.sleep(0.01)

    def _init_mrnet_streams(self):
        """Initialize basic MRNet streams.

        Creates the broadcast stream using the first loaded filter.
        NOTE(review): not called from within this class; presumably invoked
        by the base class during shared MRNet setup -- confirm.

        """
        self.broadcast_communicator = self.mrnet.get_BroadcastCommunicator()
        self.mrnet_broadcast_stream = self.mrnet.new_Stream(
            self.broadcast_communicator, self.filter_ids[0],
            MRN.SFILTER_WAITFORALL, MRN.TFILTER_NULL)
        self.mrnet_frontend_stream = None  # Not used here.

    def _send_mrnet_hello(self):
        """Send the HELLO message across MRNet."""
        self.send(GDBMessage(HELLO_MSG), self.broadcast)

    def _init_mrnet_rank_map(self):
        """Initialize the mappings from MPI ranks to MRNet ranks.

        Hosts are matched by fully-qualified name; a process whose host has
        no MRNet endpoint raises KeyError.

        """
        self.mpirank_to_mrnrank_map = {}
        hostname_to_mrnrank = {}
        mrnet_endpoints = self.broadcast_communicator.get_EndPoints()
        for endpoint in mrnet_endpoints:
            hostname_to_mrnrank[socket.getfqdn(
                endpoint.get_HostName())] = endpoint.get_Rank()
        for proc in self.get_proctab():
            self.mpirank_to_mrnrank_map[proc.mpirank] = hostname_to_mrnrank[
                socket.getfqdn(proc.pd.host_name)]

    def _load_mrnet_filters(self):
        """Load MRNet filters.

        Loads every (path, function) pair in gdbconf.mrnet_filters and stores
        the returned ids, in order, in self.filter_ids. Exits the process
        with status 1 if a filter file is missing or fails to load.

        """
        self.filter_ids = []
        for filter_path, filter_func in gdbconf.mrnet_filters:
            if not os.path.isfile(filter_path):
                print("Cannot find filter {0}!".format(filter_path))
                sys.exit(1)
            # Ensure the file actually still exists.
            try:
                with open(filter_path):
                    ret_filter_id = self.mrnet.load_FilterFunc(
                        filter_path, filter_func)
                    if ret_filter_id == -1:
                        print("Failed to load filter {0}:{1}!".format(
                            filter_path, filter_func))
                        sys.exit(1)
                    self.filter_ids.append(ret_filter_id)
            except IOError:
                print("Filter {0} disappeared!".format(filter_path))
                sys.exit(1)

    def init_mrnet(self, local=False):
        """Initialize MRNet.

        local is True to use a topology containing only this node, False to
        build one for the cluster.

        Returns True on success, or False if the topology could not be sent
        to the back-end daemons.

        """
        if local:
            self._construct_local_node_topology()
        else:
            self._construct_mrnet_topology()
        self.mrnet = MRN.Network.CreateNetworkFE(self.mrnet_topo_path)
        # Counters maintained by the topology callbacks registered below.
        self.node_joins = 0
        self.node_exits = 0
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_ADD_BE,
                                          self._mrnet_node_joined_cb)
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_REMOVE_NODE,
                                          self._mrnet_node_removed_cb)
        self._load_mrnet_filters()
        ret = self._send_mrnet_topology()
        if not ret:
            return False
        self._wait_for_nodes()
        self._init_shared_mrnet()
        self._enable_mrnet_perf_data()
        self._init_mrnet_rank_map()
        self._send_mrnet_hello()
        if gdbconf.mrnet_topology_dot:
            # Optionally dump the final topology as a DOT graph.
            topo = self.mrnet.get_NetworkTopology()
            topo.print_DOTGraph(gdbconf.mrnet_topology_dot)
        return True

    def shutdown(self):
        """Shut down the communication infrastructure.

        Returns True on success. If shutting down the LaunchMON daemons
        raises an LMONException, prints the error and a traceback and returns
        False without setting self.been_shutdown.

        """
        self._disable_mrnet_perf_data()
        self._log_mrnet_perf_data()
        # Shut this stream down.
        #del self.mrnet_broadcast_stream
        del self.mrnet
        try:
            self.lmon.shutdownDaemons(self.lmon_session)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.been_shutdown = True
        return True

    def mpirank_to_mrnrank(self, rank):
        """Convert an MPI rank to an MRNet rank. Only works on front-end."""
        return self.mpirank_to_mrnrank_map[rank]

    def get_mrnet_network_size(self):
        """Return the size of the MRNet network."""
        return self.mrnet_network_size

    def get_exit_count(self):
        """Return the number of MRNet nodes that have exited."""
        return self.node_exits

    def all_nodes_exited(self):
        """Return whether all nodes have exited.

        This compares the exit count with the join count, so it is also True
        while both are still zero.

        """
        return self.node_exits == self.node_joins