def init_lmon(self, attach, **kwargs):
    """Initialize LaunchMON and deploy back-end daemons.

    attach is True to attach to a process.
    - Provide the keyword argument pid, the srun PID.
    - Optionally provide the keyword argument host; when it is missing or
      empty, socket.getfqdn() is used instead.
    attach is False to launch the job.
    - Provide keyword arguments launcher and launcher_args.

    Returns True on success. Returns False if LaunchMON raises an
    LMONException; the LMON error and a traceback are printed in that case.

    """
    os.environ.update(gdbconf.environ)
    self.lmon = LMON_fe()
    try:
        self.lmon.init()
        self.lmon_session = self.lmon.createSession()
        self.lmon.putToBeDaemonEnv(self.lmon_session, gdbconf.environ.items())
        self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
        self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
        if attach:
            # host is optional, so do not fail with a KeyError when the
            # caller only supplies pid as the docstring describes.
            host = kwargs.get("host") or socket.getfqdn()
            self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                            host,
                                            kwargs["pid"],
                                            gdbconf.backend_bin,
                                            gdbconf.backend_args,
                                            None, None)
        else:
            # The launcher's argv starts with the launcher itself.
            launcher_argv = [kwargs["launcher"]] + kwargs["launcher_args"]
            self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                            socket.getfqdn(),
                                            kwargs["launcher"],
                                            launcher_argv,
                                            gdbconf.backend_bin,
                                            gdbconf.backend_args,
                                            None, None)
        self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
        # The second element of the returned pair is not needed here.
        self.proctab, _ = self.lmon.getProctable(self.lmon_session,
                                                 self.proctab_size)
    except lmon.LMONException as e:
        e.print_lmon_error()
        traceback.print_exc()
        return False
    # These are meaningless for the front-end.
    self.lmon_rank = None
    self.lmon_size = None
    self.lmon_master = None
    self._init_mpiranks()
    return True
def init_lmon(self, attach, **kwargs):
    """Initialize LaunchMON and deploy back-end daemons.

    attach is True to attach to a process.
    - Provide the keyword argument pid, the srun PID.
    - Optionally provide the keyword argument host; when it is missing or
      empty, socket.getfqdn() is used instead.
    attach is False to launch the job.
    - Provide keyword arguments launcher and launcher_args.

    Returns True on success. Returns False if LaunchMON raises an
    LMONException; the LMON error and a traceback are printed in that case.

    """
    os.environ.update(gdbconf.environ)
    self.lmon = LMON_fe()
    try:
        self.lmon.init()
        self.lmon_session = self.lmon.createSession()
        self.lmon.putToBeDaemonEnv(self.lmon_session, gdbconf.environ.items())
        self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
        self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
        if attach:
            # host is optional, so do not fail with a KeyError when the
            # caller only supplies pid as the docstring describes.
            host = kwargs.get("host") or socket.getfqdn()
            self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                            host,
                                            kwargs["pid"],
                                            gdbconf.backend_bin,
                                            gdbconf.backend_args,
                                            None, None)
        else:
            # The launcher's argv starts with the launcher itself.
            launcher_argv = [kwargs["launcher"]] + kwargs["launcher_args"]
            self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                            socket.getfqdn(),
                                            kwargs["launcher"],
                                            launcher_argv,
                                            gdbconf.backend_bin,
                                            gdbconf.backend_args,
                                            None, None)
        self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
        # The second element of the returned pair is not needed here.
        self.proctab, _ = self.lmon.getProctable(self.lmon_session,
                                                 self.proctab_size)
    except lmon.LMONException as e:
        e.print_lmon_error()
        traceback.print_exc()
        return False
    # These are meaningless for the front-end.
    self.lmon_rank = None
    self.lmon_size = None
    self.lmon_master = None
    self._init_mpiranks()
    return True
class CommunicatorFE(Communicator):
    """Communicator for the front-end.

    Drives LaunchMON to deploy the back-end daemons and builds the MRNet
    network used to talk to them.

    """

    def __init__(self, locking=False):
        """Initialize the base Communicator, forwarding the locking flag."""
        Communicator.__init__(self, locking)

    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide the keyword argument host; when it is missing
          or empty, socket.getfqdn() is used instead.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args.

        Returns True on success. Returns False if LaunchMON raises an
        LMONException; the LMON error and a traceback are printed.

        """
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session,
                                       gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional, so do not fail with a KeyError when the
                # caller only supplies pid as the docstring describes.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                                host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            else:
                # The launcher's argv starts with the launcher itself.
                launcher_argv = [kwargs["launcher"]] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                kwargs["launcher"],
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # The second element of the returned pair is not needed here.
            self.proctab, _ = self.lmon.getProctable(self.lmon_session,
                                                     self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True

    def _construct_mrnet_topology(self, comm_nodes=None):
        """Construct the topology to be used for MRNet.

        comm_nodes is a list of nodes to deploy comm nodes on. If none, the
        nodes are co-located on the same hosts as debuggers. The list passed
        in is not modified.

        The topology is written to a per-process file under
        gdbconf.topology_path and its path is stored in self.mrnet_topo_path.
        Exits the process with status 1 if too few comm nodes are given or
        if the front-end host is among the chosen comm nodes.

        """
        branch_factor = gdbconf.mrnet_branch_factor
        # Compute the minimum number of nodes we need given the branching
        # factor. This is the number of hosts LMON is deployed on, divided
        # by the branching factor.
        lmon_hosts = list(set(proc.pd.host_name for proc in self.proctab))
        # Integer division plus one, used as an upper bound on the ceiling
        # (it is one more than the ceiling when the division is exact).
        num_nodes = (len(lmon_hosts) // branch_factor) + 1
        if comm_nodes:
            # Copy the list: it is consumed while the tree is written below
            # and the caller's list must be left intact.
            host_list = list(comm_nodes)
            if len(host_list) < num_nodes:
                print("Not enough comm nodes: {0} < {1} (branch factor = {2})!".format(len(host_list), num_nodes, branch_factor))
                sys.exit(1)
        else:
            # We need to allocate comm nodes from among the back-end LMON
            # hosts, so pick as many as needed.
            host_list = lmon_hosts[0:num_nodes]
        cur_host = socket.gethostname()
        if cur_host in host_list:
            print("Cannot have the front-end on the same machine as a back-end daemon.")
            sys.exit(1)
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        fmt = "{0}:0"
        # Build the tree level by level: every parent in the current level
        # takes up to branch_factor hosts as children, and those children
        # become the parents of the next level.
        cur_parents = [cur_host]
        with open(self.mrnet_topo_path, "w+") as topo_file:
            while host_list:
                new_parents = []
                for parent in cur_parents:
                    children = host_list[:branch_factor]
                    new_parents += children
                    del host_list[:branch_factor]
                    if children:
                        topo_file.write(
                            fmt.format(parent) + " => " +
                            " ".join(fmt.format(child) for child in children) +
                            " ;\n")
                cur_parents = new_parents

    def _construct_local_node_topology(self):
        """Construct a topology for MRNet that just uses the local node.

        Writes a single edge from process 0 to process 1 on the current
        host and stores the file's path in self.mrnet_topo_path.

        """
        cur_host = socket.gethostname()
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        with open(self.mrnet_topo_path, "w+") as topo_file:
            topo_file.write(cur_host + ":0 => " + cur_host + ":1 ;\n")

    def _assign_mrnet_leaves(self):
        """Assign debugger processes to MRNet leaves.

        For each leaf in the MRNet topology, assign up to the branching
        factor in debuggers for communication purposes.

        Returns a list with one NodeInfo per LMON host.

        """
        topology = self.mrnet.get_NetworkTopology()
        # Note: This assumes that leaves gives us a list.
        mrnet_leaves = topology.get_Leaves()
        leaves = list(mrnet_leaves)
        num_nodes = topology.get_NumNodes() + 1  # Add 1 to make sure we're good.
        node_info = []
        local_rank = self.mrnet.get_LocalRank()
        leaf_idx = 0
        # be_rank is assigned to be greater than all the existing nodes.
        for i in range(0, len(self.get_lmon_hosts())):
            leaf = leaves[leaf_idx]
            # Check for root, since get_Parent fails on it.
            if leaf.get_Rank() == local_rank:
                node_info.append(NodeInfo(local_rank,
                                          leaf.get_HostName(),
                                          leaf.get_Port(),
                                          -1,
                                          num_nodes + i))
            else:
                node_info.append(NodeInfo(leaf.get_Rank(),
                                          leaf.get_HostName(),
                                          leaf.get_Port(),
                                          leaf.get_Parent(),
                                          num_nodes + i))
            if i % gdbconf.mrnet_branch_factor == (gdbconf.mrnet_branch_factor - 1):
                # Move to the next leaf after we've given this one
                # mrnet_branch_factor children.
                leaf_idx += 1
        return node_info

    def _send_mrnet_topology(self):
        """Send the MRNet topology to the back-end daemons.

        Returns True on success, or False if LaunchMON raises an
        LMONException while sending. On success the number of entries sent
        is recorded in self.mrnet_network_size.

        """
        node_info = self._assign_mrnet_leaves()
        try:
            self.lmon.sendUsrDataBe(self.lmon_session, node_info)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.mrnet_network_size = len(node_info)
        return True

    def _mrnet_node_joined_cb(self):
        """An MRNet callback invoked whenever a back-end node joins."""
        self.node_joins += 1

    def _mrnet_node_removed_cb(self):
        """An MRNet callback invoked whenever a back-end node leaves."""
        self.node_exits += 1

    def _wait_for_nodes(self):
        """Wait for all MRNet nodes to join the network.

        Polls self.node_joins until it equals self.mrnet_network_size.

        """
        # Imported locally so this method does not rely on file-level imports.
        import time
        while self.node_joins != self.mrnet_network_size:
            # Sleep between polls instead of spinning at full CPU.
            time.sleep(0.01)

    def _init_mrnet_streams(self):
        """Initialize basic MRNet streams.

        Creates the broadcast stream using the first loaded filter.

        """
        self.broadcast_communicator = self.mrnet.get_BroadcastCommunicator()
        self.mrnet_broadcast_stream = self.mrnet.new_Stream(
            self.broadcast_communicator,
            self.filter_ids[0],
            MRN.SFILTER_WAITFORALL,
            MRN.TFILTER_NULL)
        self.mrnet_frontend_stream = None  # Not used here.

    def _send_mrnet_hello(self):
        """Send the HELLO message across MRNet."""
        self.send(GDBMessage(HELLO_MSG), self.broadcast)

    def _init_mrnet_rank_map(self):
        """Initialize the mappings from MPI ranks to MRNet ranks.

        Endpoints and processes are matched by fully-qualified host name; a
        process whose host has no endpoint raises KeyError.

        """
        self.mpirank_to_mrnrank_map = {}
        hostname_to_mrnrank = {}
        mrnet_endpoints = self.broadcast_communicator.get_EndPoints()
        for endpoint in mrnet_endpoints:
            fqdn = socket.getfqdn(endpoint.get_HostName())
            hostname_to_mrnrank[fqdn] = endpoint.get_Rank()
        for proc in self.get_proctab():
            fqdn = socket.getfqdn(proc.pd.host_name)
            self.mpirank_to_mrnrank_map[proc.mpirank] = hostname_to_mrnrank[fqdn]

    def _load_mrnet_filters(self):
        """Load MRNet filters.

        Loads every (path, function) pair in gdbconf.mrnet_filters and
        stores the returned ids, in order, in self.filter_ids. Exits the
        process with status 1 if a filter file is missing, cannot be opened,
        or fails to load.

        """
        self.filter_ids = []
        for filter_path, filter_func in gdbconf.mrnet_filters:
            if os.path.isfile(filter_path):
                # Ensure the file actually still exists.
                try:
                    with open(filter_path):
                        ret_filter_id = self.mrnet.load_FilterFunc(filter_path,
                                                                   filter_func)
                        if ret_filter_id == -1:
                            print("Failed to load filter {0}:{1}!".format(filter_path, filter_func))
                            sys.exit(1)
                        self.filter_ids.append(ret_filter_id)
                except IOError:
                    print("Filter {0} disappeared!".format(filter_path))
                    sys.exit(1)
            else:
                print("Cannot find filter {0}!".format(filter_path))
                sys.exit(1)

    def init_mrnet(self, local=False):
        """Initialize MRNet.

        local is whether to initialize for a cluster or just this node.

        Returns True on success, or False if sending the topology to the
        back-end daemons fails.

        """
        if local:
            self._construct_local_node_topology()
        else:
            self._construct_mrnet_topology()
        self.mrnet = MRN.Network.CreateNetworkFE(self.mrnet_topo_path)
        # Join/exit counters are maintained by the topology callbacks below.
        self.node_joins = 0
        self.node_exits = 0
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_ADD_BE,
                                          self._mrnet_node_joined_cb)
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_REMOVE_NODE,
                                          self._mrnet_node_removed_cb)
        self._load_mrnet_filters()
        ret = self._send_mrnet_topology()
        if not ret:
            return False
        self._wait_for_nodes()
        self._init_shared_mrnet()
        self._enable_mrnet_perf_data()
        self._init_mrnet_rank_map()
        self._send_mrnet_hello()
        if gdbconf.mrnet_topology_dot:
            # Optionally dump the final topology as a DOT graph.
            topo = self.mrnet.get_NetworkTopology()
            topo.print_DOTGraph(gdbconf.mrnet_topology_dot)
        return True

    def shutdown(self):
        """Shut down the communication infrastructure.

        Returns True on success, or False if LaunchMON raises an
        LMONException while shutting the daemons down.

        """
        self._disable_mrnet_perf_data()
        self._log_mrnet_perf_data()
        # Shut this stream down.
        #del self.mrnet_broadcast_stream
        del self.mrnet
        try:
            self.lmon.shutdownDaemons(self.lmon_session)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.been_shutdown = True
        return True

    def mpirank_to_mrnrank(self, rank):
        """Convert an MPI rank to an MRNet rank. Only works on front-end."""
        return self.mpirank_to_mrnrank_map[rank]

    def get_mrnet_network_size(self):
        """Return the size of the MRNet network."""
        return self.mrnet_network_size

    def get_exit_count(self):
        """Return the number of MRNet nodes that have exited."""
        return self.node_exits

    def all_nodes_exited(self):
        """Return whether all nodes have exited."""
        return self.node_exits == self.node_joins
class CommunicatorFE(Communicator):
    """Communicator for the front-end.

    Drives LaunchMON to deploy the back-end daemons and builds the MRNet
    network used to talk to them.

    """

    def __init__(self, locking=False):
        """Initialize the base Communicator, forwarding the locking flag."""
        Communicator.__init__(self, locking)

    def init_lmon(self, attach, **kwargs):
        """Initialize LaunchMON and deploy back-end daemons.

        attach is True to attach to a process.
        - Provide the keyword argument pid, the srun PID.
        - Optionally provide the keyword argument host; when it is missing
          or empty, socket.getfqdn() is used instead.
        attach is False to launch the job.
        - Provide keyword arguments launcher and launcher_args.

        Returns True on success. Returns False if LaunchMON raises an
        LMONException; the LMON error and a traceback are printed.

        """
        os.environ.update(gdbconf.environ)
        self.lmon = LMON_fe()
        try:
            self.lmon.init()
            self.lmon_session = self.lmon.createSession()
            self.lmon.putToBeDaemonEnv(self.lmon_session,
                                       gdbconf.environ.items())
            self.lmon.regPackForFeToBe(self.lmon_session, lmon.pack)
            self.lmon.regUnpackForBeToFe(self.lmon_session, lmon.unpack)
            if attach:
                # host is optional, so do not fail with a KeyError when the
                # caller only supplies pid as the docstring describes.
                host = kwargs.get("host") or socket.getfqdn()
                self.lmon.attachAndSpawnDaemons(self.lmon_session,
                                                host,
                                                kwargs["pid"],
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            else:
                # The launcher's argv starts with the launcher itself.
                launcher_argv = [kwargs["launcher"]] + kwargs["launcher_args"]
                self.lmon.launchAndSpawnDaemons(self.lmon_session,
                                                socket.getfqdn(),
                                                kwargs["launcher"],
                                                launcher_argv,
                                                gdbconf.backend_bin,
                                                gdbconf.backend_args,
                                                None, None)
            self.proctab_size = self.lmon.getProctableSize(self.lmon_session)
            # The second element of the returned pair is not needed here.
            self.proctab, _ = self.lmon.getProctable(self.lmon_session,
                                                     self.proctab_size)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        # These are meaningless for the front-end.
        self.lmon_rank = None
        self.lmon_size = None
        self.lmon_master = None
        self._init_mpiranks()
        return True

    def _construct_mrnet_topology(self, comm_nodes=None):
        """Construct the topology to be used for MRNet.

        comm_nodes is a list of nodes to deploy comm nodes on. If none, the
        nodes are co-located on the same hosts as debuggers. The list passed
        in is not modified.

        The topology is written to a per-process file under
        gdbconf.topology_path and its path is stored in self.mrnet_topo_path.
        Exits the process with status 1 if too few comm nodes are given or
        if the front-end host is among the chosen comm nodes.

        """
        branch_factor = gdbconf.mrnet_branch_factor
        # Compute the minimum number of nodes we need given the branching
        # factor. This is the number of hosts LMON is deployed on, divided
        # by the branching factor.
        lmon_hosts = list(set(proc.pd.host_name for proc in self.proctab))
        # Integer division plus one, used as an upper bound on the ceiling
        # (it is one more than the ceiling when the division is exact).
        num_nodes = (len(lmon_hosts) // branch_factor) + 1
        if comm_nodes:
            # Copy the list: it is consumed while the tree is written below
            # and the caller's list must be left intact.
            host_list = list(comm_nodes)
            if len(host_list) < num_nodes:
                print("Not enough comm nodes: {0} < {1} (branch factor = {2})!".format(len(host_list), num_nodes, branch_factor))
                sys.exit(1)
        else:
            # We need to allocate comm nodes from among the back-end LMON
            # hosts, so pick as many as needed.
            host_list = lmon_hosts[0:num_nodes]
        cur_host = socket.gethostname()
        if cur_host in host_list:
            print("Cannot have the front-end on the same machine as a back-end daemon.")
            sys.exit(1)
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        fmt = "{0}:0"
        # Build the tree level by level: every parent in the current level
        # takes up to branch_factor hosts as children, and those children
        # become the parents of the next level.
        cur_parents = [cur_host]
        with open(self.mrnet_topo_path, "w+") as topo_file:
            while host_list:
                new_parents = []
                for parent in cur_parents:
                    children = host_list[:branch_factor]
                    new_parents += children
                    del host_list[:branch_factor]
                    if children:
                        topo_file.write(
                            fmt.format(parent) + " => " +
                            " ".join(fmt.format(child) for child in children) +
                            " ;\n")
                cur_parents = new_parents

    def _construct_local_node_topology(self):
        """Construct a topology for MRNet that just uses the local node.

        Writes a single edge from process 0 to process 1 on the current
        host and stores the file's path in self.mrnet_topo_path.

        """
        cur_host = socket.gethostname()
        self.mrnet_topo_path = "{0}/topo_{1}".format(gdbconf.topology_path,
                                                     os.getpid())
        with open(self.mrnet_topo_path, "w+") as topo_file:
            topo_file.write(cur_host + ":0 => " + cur_host + ":1 ;\n")

    def _assign_mrnet_leaves(self):
        """Assign debugger processes to MRNet leaves.

        For each leaf in the MRNet topology, assign up to the branching
        factor in debuggers for communication purposes.

        Returns a list with one NodeInfo per LMON host.

        """
        topology = self.mrnet.get_NetworkTopology()
        # Note: This assumes that leaves gives us a list.
        mrnet_leaves = topology.get_Leaves()
        leaves = list(mrnet_leaves)
        num_nodes = topology.get_NumNodes() + 1  # Add 1 to make sure we're good.
        node_info = []
        local_rank = self.mrnet.get_LocalRank()
        leaf_idx = 0
        # be_rank is assigned to be greater than all the existing nodes.
        for i in range(0, len(self.get_lmon_hosts())):
            leaf = leaves[leaf_idx]
            # Check for root, since get_Parent fails on it.
            if leaf.get_Rank() == local_rank:
                node_info.append(NodeInfo(local_rank,
                                          leaf.get_HostName(),
                                          leaf.get_Port(),
                                          -1,
                                          num_nodes + i))
            else:
                node_info.append(NodeInfo(leaf.get_Rank(),
                                          leaf.get_HostName(),
                                          leaf.get_Port(),
                                          leaf.get_Parent(),
                                          num_nodes + i))
            if i % gdbconf.mrnet_branch_factor == (gdbconf.mrnet_branch_factor - 1):
                # Move to the next leaf after we've given this one
                # mrnet_branch_factor children.
                leaf_idx += 1
        return node_info

    def _send_mrnet_topology(self):
        """Send the MRNet topology to the back-end daemons.

        Returns True on success, or False if LaunchMON raises an
        LMONException while sending. On success the number of entries sent
        is recorded in self.mrnet_network_size.

        """
        node_info = self._assign_mrnet_leaves()
        try:
            self.lmon.sendUsrDataBe(self.lmon_session, node_info)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.mrnet_network_size = len(node_info)
        return True

    def _mrnet_node_joined_cb(self):
        """An MRNet callback invoked whenever a back-end node joins."""
        self.node_joins += 1

    def _mrnet_node_removed_cb(self):
        """An MRNet callback invoked whenever a back-end node leaves."""
        self.node_exits += 1

    def _wait_for_nodes(self):
        """Wait for all MRNet nodes to join the network.

        Polls self.node_joins until it equals self.mrnet_network_size.

        """
        # Imported locally so this method does not rely on file-level imports.
        import time
        while self.node_joins != self.mrnet_network_size:
            # Sleep between polls instead of spinning at full CPU.
            time.sleep(0.01)

    def _init_mrnet_streams(self):
        """Initialize basic MRNet streams.

        Creates the broadcast stream using the first loaded filter.

        """
        self.broadcast_communicator = self.mrnet.get_BroadcastCommunicator()
        self.mrnet_broadcast_stream = self.mrnet.new_Stream(
            self.broadcast_communicator,
            self.filter_ids[0],
            MRN.SFILTER_WAITFORALL,
            MRN.TFILTER_NULL)
        self.mrnet_frontend_stream = None  # Not used here.

    def _send_mrnet_hello(self):
        """Send the HELLO message across MRNet."""
        self.send(GDBMessage(HELLO_MSG), self.broadcast)

    def _init_mrnet_rank_map(self):
        """Initialize the mappings from MPI ranks to MRNet ranks.

        Endpoints and processes are matched by fully-qualified host name; a
        process whose host has no endpoint raises KeyError.

        """
        self.mpirank_to_mrnrank_map = {}
        hostname_to_mrnrank = {}
        mrnet_endpoints = self.broadcast_communicator.get_EndPoints()
        for endpoint in mrnet_endpoints:
            fqdn = socket.getfqdn(endpoint.get_HostName())
            hostname_to_mrnrank[fqdn] = endpoint.get_Rank()
        for proc in self.get_proctab():
            fqdn = socket.getfqdn(proc.pd.host_name)
            self.mpirank_to_mrnrank_map[proc.mpirank] = hostname_to_mrnrank[fqdn]

    def _load_mrnet_filters(self):
        """Load MRNet filters.

        Loads every (path, function) pair in gdbconf.mrnet_filters and
        stores the returned ids, in order, in self.filter_ids. Exits the
        process with status 1 if a filter file is missing, cannot be opened,
        or fails to load.

        """
        self.filter_ids = []
        for filter_path, filter_func in gdbconf.mrnet_filters:
            if os.path.isfile(filter_path):
                # Ensure the file actually still exists.
                try:
                    with open(filter_path):
                        ret_filter_id = self.mrnet.load_FilterFunc(filter_path,
                                                                   filter_func)
                        if ret_filter_id == -1:
                            print("Failed to load filter {0}:{1}!".format(filter_path, filter_func))
                            sys.exit(1)
                        self.filter_ids.append(ret_filter_id)
                except IOError:
                    print("Filter {0} disappeared!".format(filter_path))
                    sys.exit(1)
            else:
                print("Cannot find filter {0}!".format(filter_path))
                sys.exit(1)

    def init_mrnet(self, local=False):
        """Initialize MRNet.

        local is whether to initialize for a cluster or just this node.

        Returns True on success, or False if sending the topology to the
        back-end daemons fails.

        """
        if local:
            self._construct_local_node_topology()
        else:
            self._construct_mrnet_topology()
        self.mrnet = MRN.Network.CreateNetworkFE(self.mrnet_topo_path)
        # Join/exit counters are maintained by the topology callbacks below.
        self.node_joins = 0
        self.node_exits = 0
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_ADD_BE,
                                          self._mrnet_node_joined_cb)
        self.mrnet.register_EventCallback(MRN.Event.TOPOLOGY_EVENT,
                                          MRN.TopologyEvent.TOPOL_REMOVE_NODE,
                                          self._mrnet_node_removed_cb)
        self._load_mrnet_filters()
        ret = self._send_mrnet_topology()
        if not ret:
            return False
        self._wait_for_nodes()
        self._init_shared_mrnet()
        self._enable_mrnet_perf_data()
        self._init_mrnet_rank_map()
        self._send_mrnet_hello()
        if gdbconf.mrnet_topology_dot:
            # Optionally dump the final topology as a DOT graph.
            topo = self.mrnet.get_NetworkTopology()
            topo.print_DOTGraph(gdbconf.mrnet_topology_dot)
        return True

    def shutdown(self):
        """Shut down the communication infrastructure.

        Returns True on success, or False if LaunchMON raises an
        LMONException while shutting the daemons down.

        """
        self._disable_mrnet_perf_data()
        self._log_mrnet_perf_data()
        # Shut this stream down.
        #del self.mrnet_broadcast_stream
        del self.mrnet
        try:
            self.lmon.shutdownDaemons(self.lmon_session)
        except lmon.LMONException as e:
            e.print_lmon_error()
            traceback.print_exc()
            return False
        self.been_shutdown = True
        return True

    def mpirank_to_mrnrank(self, rank):
        """Convert an MPI rank to an MRNet rank. Only works on front-end."""
        return self.mpirank_to_mrnrank_map[rank]

    def get_mrnet_network_size(self):
        """Return the size of the MRNet network."""
        return self.mrnet_network_size

    def get_exit_count(self):
        """Return the number of MRNet nodes that have exited."""
        return self.node_exits

    def all_nodes_exited(self):
        """Return whether all nodes have exited."""
        return self.node_exits == self.node_joins