def test_nothing_starts(self):
    """Check that a daemon created without managers opens neither REST port.

    The daemon is created with ``master=False`` and ``noNM=True``, so both
    the Node Manager and the Master Manager default REST ports must be
    closed. A timeout of 0 is used so the port check does not wait.
    """
    # Nothing should start now
    self.create_daemon(master=False, noNM=True, disable_zeroconf=True)
    self.assertFalse(
        utils.portIsOpen('localhost', constants.NODE_DEFAULT_REST_PORT, 0),
        'NM started but it should not have',
    )
    # This check is about the master port, so the failure message must
    # name the MM (it used to say "NM", which was misleading on failure).
    self.assertFalse(
        utils.portIsOpen('localhost', constants.MASTER_DEFAULT_REST_PORT, 0),
        'MM started but it should not have',
    )
def create_daemon(self, *args, **kwargs):
    """Create a ``DlgDaemon`` and serve its REST interface on localhost:9000.

    All arguments are forwarded to ``DlgDaemon``. After construction the
    manager ports implied by the ``noNM`` and ``master`` keyword arguments
    are checked, then the daemon's server is started in a background thread
    (``self._daemon_t``) and poked with one HTTP request.
    """
    self._daemon = DlgDaemon(*args, **kwargs)

    # The NM is expected unless noNM was given with a true value
    if not kwargs.get('noNM'):
        nm_up = utils.portIsOpen('localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT)
        self.assertTrue(nm_up, 'The NM did not start successfully')
    # The MM is expected only when master was given with a true value
    if kwargs.get('master'):
        mm_up = utils.portIsOpen('localhost', constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT)
        self.assertTrue(mm_up, 'The MM did not start successfully')

    self._daemon_t = threading.Thread(
        target=lambda: self._daemon.start('localhost', 9000)
    )
    self._daemon_t.start()

    # Wait until the daemon's server is really serving requests.
    #
    # Checking that the port is open is not enough: the server binds before
    # it is handed back to the daemon, so portIsOpen can succeed on a server
    # that has not been serve_forever()'d yet. Tests that never talk to the
    # daemon could then shut it down before it owns the HTTP server; the
    # server would be left running and the later check that self._daemon_t
    # is no longer alive would fail.
    #
    # Doing an actual HTTP round trip guarantees the server is serving, and
    # therefore already in the daemon's hands. The reply is irrelevant.
    probe = restutils.RestClient('localhost', 9000, 10)
    try:
        probe._GET('/anything')
    except restutils.RestClientException:
        # We don't care about the result
        pass
def test_start_master_via_rest(self):
    """Start the Master Manager through the daemon's REST interface.

    The daemon is created without a master; the test then requests one via
    ``self._start`` and checks that the MM default REST port opens.
    """
    self.create_daemon(master=False, noNM=False, disable_zeroconf=True)

    # Check that the master starts
    self._start('master', httplib.OK)
    mm_is_up = utils.portIsOpen(
        'localhost', constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(mm_is_up, 'The MM did not start successfully')
def test_stop_start_node_via_rest(self):
    """Stop and then restart the Node Manager through the daemon's REST API."""
    # test both stop and start of NM via REST
    self.create_daemon(master=True, noNM=False, disable_zeroconf=False)

    # Both managers are up at this point. If zeroconf registration worked,
    # the MM knows about its nodes, and there must be exactly one of them.
    client = MasterManagerClient()
    nodes = _get_nodes_from_client(client)
    self.assertIsNotNone(nodes)
    self.assertEqual(
        1,
        len(nodes),
        "MasterManager didn't find the NodeManager running on the same node",
    )

    # Check that the NM stops
    self._stop("node", http.HTTPStatus.OK, "")
    nm_closed = utils.portIsClosed(
        'localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(nm_closed, "The node did not stop successfully")

    # Check that the NM starts
    self._start("node", http.HTTPStatus.OK, {"pid": nodes})
    nm_open = utils.portIsOpen(
        'localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(nm_open, "The node did not start successfully")
def _start_manager_in_thread(self, port, manager_class, rest_class, *manager_args, **manager_kwargs):
    """Build a manager, wrap it in its REST server and serve it in a thread.

    ``manager_class`` is instantiated with the extra positional and keyword
    arguments, ``rest_class`` is instantiated with that manager, and the
    server's ``start`` runs in a new thread bound to 127.0.0.1:``port``.
    The port must open within 5 seconds. Returns a ``ManagerInfo`` holding
    the manager, the server, the thread and this test case.
    """
    host = "127.0.0.1"
    mgr = manager_class(*manager_args, **manager_kwargs)
    rest_server = rest_class(mgr)

    server_thread = threading.Thread(target=rest_server.start, args=(host, port))
    server_thread.start()

    self.assertTrue(portIsOpen(host, port, 5))
    return ManagerInfo(mgr, rest_server, server_thread, self)
def test_start_dataisland_via_rest(self):
    """Start a Data Island Manager through the daemon's REST interface."""
    self.create_daemon(master=True, noNM=False, disable_zeroconf=False)

    # Both managers are up at this point. If zeroconf registration worked,
    # the MM knows about its nodes, and there must be exactly one of them.
    nodes = self._get_nodes_from_master(_TIMEOUT)
    self.assertIsNotNone(nodes)
    self.assertEqual(
        1,
        len(nodes),
        "MasterManager didn't find the NodeManager running on the same node",
    )

    # Check that the DataIsland starts with the given nodes
    self._start('dataisland', httplib.OK, {'nodes': nodes})
    dim_is_up = utils.portIsOpen(
        'localhost', constants.ISLAND_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(dim_is_up, 'The DIM did not start successfully')
def test_start_stop_master_via_rest(self):
    """Start and then stop the Master Manager through the daemon's REST API."""
    # test both stop and start of MASTER via REST
    self.create_daemon(master=False, noNM=False, disable_zeroconf=True)

    # Check that the MM starts
    self._start("master", http.HTTPStatus.OK)
    mm_open = utils.portIsOpen(
        "localhost", constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(mm_open, "The MM did not start successfully")

    # Check that the MM stops
    self._stop("master", http.HTTPStatus.OK, "")
    mm_closed = utils.portIsClosed(
        "localhost", constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT
    )
    self.assertTrue(mm_closed, "The MM did not stop successfully")
def check_host(host, port, timeout=5, check_with_session=False):
    """
    Checks if a given host/port is up and running (i.e., it is open).

    If ``check_with_session`` is ``True`` then it is assumed that the
    host/port combination corresponds to a Node Manager and the check is
    performed by attempting to create and delete a session.

    :param host: the host to check
    :param port: the port to check
    :param timeout: timeout handed to the port check or to the
        ``NodeManagerClient``
    :param check_with_session: whether to check via a session round trip
        instead of a plain port check
    :return: the result of ``utils.portIsOpen`` for a plain check; for a
        session check, ``True`` if the session was created and destroyed
        and ``False`` if any error occurred
    """
    if not check_with_session:
        return utils.portIsOpen(host, port, timeout)

    session_id = str(uuid.uuid4())
    try:
        with NodeManagerClient(host, port, timeout=timeout) as c:
            c.create_session(session_id)
            c.destroy_session(session_id)
    except Exception:
        # Any failure means "not available". Unlike a bare ``except:`` this
        # lets KeyboardInterrupt and SystemExit propagate, so the process
        # can still be interrupted while hosts are being probed.
        return False
    return True
def main():
    """Start a DALiuGE cluster across the ranks of an MPI job.

    Parses the command line, sets up a per-rank log directory and then uses
    the MPI rank to decide what this process runs:

    * one island: rank 0 starts the island manager (after optionally
      producing and submitting a physical graph), rank 1 starts the proxy
      when a monitor host was given, and the other ranks start node
      managers;
    * several islands: rank 0 starts the master manager, the ranks it
      broadcasts as DIM ranks start island managers, and the rest start
      node managers.

    The order of the MPI calls (gather, send/recv, bcast) must stay the
    same on all ranks, so the statements are deliberately left in sequence.
    """
    parser = optparse.OptionParser()
    parser.add_option("-l", "--log_dir", action="store", type="string",
                      dest="log_dir", help="Log directory (required)")
    # if this parameter is present, it means we want to get monitored
    parser.add_option("-m", "--monitor_host", action="store", type="string",
                      dest="monitor_host", help="Monitor host IP (optional)")
    parser.add_option("-o", "--monitor_port", action="store", type="int",
                      dest="monitor_port", help="Monitor port",
                      default=dfms_proxy.default_dlg_monitor_port)
    parser.add_option("-v", "--verbose-level", action="store", type="int",
                      dest="verbose_level", help="Verbosity level (1-3) of the DIM/NM logging",
                      default=1)
    parser.add_option("-z", "--zerorun", action="store_true",
                      dest="zerorun", help="Generate a physical graph that takes no time to run",
                      default=False)
    parser.add_option("--app", action="store", type="int",
                      dest="app", help="The app to use in the PG. 1=SleepApp (default), 2=SleepAndCopy",
                      default=0)
    parser.add_option("-t", "--max-threads", action="store", type="int",
                      dest="max_threads",
                      help="Max thread pool size used for executing drops. 0 (default) means no pool.",
                      default=0)
    parser.add_option("-L", "--logical-graph", action="store", type="string",
                      dest="logical_graph", help="The filename of the logical graph to deploy",
                      default=None)
    parser.add_option("-P", "--physical-graph", action="store", type="string",
                      dest="physical_graph",
                      help="The filename of the physical graph (template) to deploy",
                      default=None)
    parser.add_option('-s', '--num_islands', action='store', type='int',
                      dest='num_islands', default=1, help='The number of Data Islands')
    parser.add_option('-d', '--dump', action='store_true',
                      dest='dump', help='dump file base name?', default=False)
    parser.add_option("-c", "--loc", action="store", type="string",
                      dest="loc", help="deployment location (e.g. 'Pawsey' or 'Tianhe2')",
                      default="Pawsey")
    parser.add_option('--part-algo', type="string", dest='part_algo',
                      help='Partition algorithms', default='metis')
    parser.add_option("-u", "--all_nics", action="store_true",
                      dest="all_nics", help="Listen on all NICs for a node manager",
                      default=False)
    parser.add_option('--check-interfaces', action='store_true',
                      dest='check_interfaces',
                      help='Run a small network interfaces test and exit', default=False)
    parser.add_option('--use-ifconfig', action='store_true',
                      dest='use_ifconfig',
                      help='Use ifconfig to find a suitable external interface/address for each host',
                      default=False)
    parser.add_option("-S", "--check_with_session", action="store_true",
                      dest="check_with_session",
                      help="Check for node managers' availability by creating/destroy a session",
                      default=False)

    (options, _) = parser.parse_args()

    if options.check_interfaces:
        print("From netifaces: %s" % get_ip_via_netifaces())
        print("From ifconfig: %s" % get_ip_via_ifconfig())
        sys.exit(0)

    if options.logical_graph and options.physical_graph:
        parser.error("Either a logical graph or physical graph filename must be specified")
    for p in (options.logical_graph, options.physical_graph):
        if p and not os.path.exists(p):
            parser.error("Cannot locate graph file at '{0}'".format(p))

    if (options.monitor_host is not None and options.num_islands > 1):
        parser.error("We do not support proxy monitor multiple islands yet")

    # Clamp the verbosity to the supported 1-3 range
    logv = max(min(3, options.verbose_level), 1)

    from mpi4py import MPI  # @UnresolvedImport
    comm = MPI.COMM_WORLD  # @UndefinedVariable
    num_procs = comm.Get_size()
    rank = comm.Get_rank()

    # Each rank logs into its own sub-directory of the given log directory
    log_dir = "{0}/{1}".format(options.log_dir, rank)
    os.makedirs(log_dir)
    logfile = log_dir + "/start_dlg_cluster.log"
    FORMAT = "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] %(name)s#%(funcName)s:%(lineno)s %(message)s"
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format=FORMAT)

    if (num_procs > 1 and options.monitor_host is not None):
        logger.info("Trying to start DALiuGE cluster with proxy")
        run_proxy = True
        threshold = 2
    else:
        logger.info("Trying to start DALiuGE cluster without proxy")
        run_proxy = False
        threshold = 1

    if (num_procs == threshold):
        logger.warning("No MPI processes left for running Drop Managers")
        run_node_mgr = False
    else:
        run_node_mgr = True

    # attach rank information at the end of IP address for multi-islands
    rank_str = '' if options.num_islands == 1 else ',%s' % rank
    find_ip = get_ip_via_ifconfig if options.use_ifconfig else get_ip_via_netifaces
    public_ip = find_ip(options.loc)
    ip_adds = '{0}{1}'.format(public_ip, rank_str)
    origin_ip = ip_adds.split(',')[0]
    # From here on ip_adds is the gathered list on rank 0
    ip_adds = comm.gather(ip_adds, root=0)

    proxy_ip = None
    if run_proxy:
        # send island/master manager's IP address to the DALiuGE proxy
        # also let island manager know the DALiuGE proxy's IP
        if rank == 0:
            mgr_ip = origin_ip
            comm.send(mgr_ip, dest=1)
            proxy_ip = comm.recv(source=1)
        elif rank == 1:
            mgr_ip = comm.recv(source=0)
            proxy_ip = origin_ip
            comm.send(proxy_ip, dest=0)

    set_env(rank)
    if (options.num_islands == 1):
        if (rank != 0):
            if (run_proxy and rank == 1):
                # Wait until the Island Manager is open
                if utils.portIsOpen(mgr_ip, ISLAND_DEFAULT_REST_PORT, 100):
                    start_proxy(options.loc, mgr_ip, ISLAND_DEFAULT_REST_PORT,
                                options.monitor_host, options.monitor_port)
                else:
                    logger.warning("Couldn't connect to the main drop manager, proxy not started")
            elif (run_node_mgr):
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv, max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)
        else:
            # 'no_nms' are known not to be NMs
            no_nms = [origin_ip, 'None']
            if proxy_ip:
                no_nms += [proxy_ip]
            node_mgrs = [ip for ip in ip_adds if ip not in no_nms]

            # unroll the graph first (if any) while starting node managers on other nodes
            pgt = None
            if options.logical_graph or options.physical_graph:
                pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
                if options.logical_graph:
                    unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                    pgt = pg_generator.partition(unrolled, options.part_algo,
                                                 num_partitions=len(node_mgrs))
                    pgt = pgt.to_pg_spec([], ret_str=False, num_islands=1,
                                         tpl_nodes_len=len(node_mgrs) + 1)
                    del unrolled
                else:
                    # The option holds a *filename*: read the file and parse
                    # its contents (parsing the filename itself as JSON
                    # could never succeed for a real path).
                    with open(options.physical_graph, 'rb') as pg_file:
                        pgt = json.loads(pg_file.read().decode('utf-8'))

            # Check which NMs are up and use only those from now on
            node_mgrs = check_hosts(node_mgrs, NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # We have a PGT, let's map it and submit it
            if pgt:
                pg = tool.resource_map(pgt, [origin_ip] + node_mgrs, pip_name, options.num_islands)
                del pgt

                def submit_and_monitor():
                    host, port = 'localhost', ISLAND_DEFAULT_REST_PORT
                    tool.submit(host, port, pg)
                    if options.dump:
                        dump_path = '{0}/monitor'.format(log_dir)
                        monitor_graph(host, port, dump_path)

                # Submission runs in a thread; the DIM is started right below
                threading.Thread(target=submit_and_monitor).start()

            # Start the DIM
            logger.info("Starting island manager on host %s", origin_ip)
            start_dim(node_mgrs, log_dir, logv=logv)

    elif (options.num_islands > 1):
        if (rank == 0):
            # master manager
            # 1. use ip_adds to produce the physical graph
            ip_list = []
            ip_rank_dict = dict()  # k - ip, v - MPI rank
            for ipr in ip_adds:
                iprs = ipr.split(',')
                ip = iprs[0]
                r = iprs[1]
                if (ip == origin_ip or 'None' == ip):
                    continue
                ip_list.append(ip)
                ip_rank_dict[ip] = int(r)

            if (len(ip_list) <= options.num_islands):
                raise Exception("Insufficient nodes available for node managers")

            # 2 broadcast dim ranks to all nodes to let them know who is the DIM
            dim_ranks = []
            dim_ip_list = ip_list[0:options.num_islands]
            logger.info("A list of DIM IPs: {0}".format(dim_ip_list))
            for dim_ip in dim_ip_list:
                dim_ranks.append(ip_rank_dict[dim_ip])
            dim_ranks = comm.bcast(dim_ranks, root=0)

            # 3 unroll the graph while waiting for node managers to start
            pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
            if options.logical_graph:
                unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                pgt = pg_generator.partition(unrolled, options.part_algo,
                                             num_partitions=len(ip_list) - 1,
                                             num_islands=options.num_islands)
                pgt = pgt.to_pg_spec([], ret_str=False, num_islands=options.num_islands,
                                     tpl_nodes_len=len(ip_list) - 1 + options.num_islands)
                del unrolled
            else:
                # The option holds a *filename*: read and parse the file
                with open(options.physical_graph, 'rb') as pg_file:
                    pgt = json.loads(pg_file.read().decode('utf-8'))

            node_mgrs = check_hosts(ip_list[options.num_islands:], NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # 4. produce the physical graph based on the available node managers
            # that have already been running (we have to assume island manager
            # will run smoothly in the future)
            logger.info("Master Manager producing the physical graph")
            pg = tool.resource_map(pgt, dim_ip_list + node_mgrs, pip_name, options.num_islands)

            # 5. parse the pg_spec to get the mapping from islands to node list
            dim_rank_nodes_dict = collections.defaultdict(set)
            for drop in pg:
                dim_ip = drop['island']
                r = ip_rank_dict[dim_ip]
                n = drop['node']
                dim_rank_nodes_dict[r].add(n)

            # 6 send a node list to each DIM so that it can start
            for dim_ip in dim_ip_list:
                r = ip_rank_dict[dim_ip]
                logger.debug("Sending node list to rank {0}".format(r))
                # TODO this should be in a thread since it is blocking!
                comm.send(list(dim_rank_nodes_dict[r]), dest=r)

            # 7. make sure all DIMs are up running
            dim_ips_up = check_hosts(dim_ip_list, ISLAND_DEFAULT_REST_PORT,
                                     timeout=MM_WAIT_TIME, retry=10)
            if len(dim_ips_up) < len(dim_ip_list):
                logger.warning("Not all DIMs were up and running: %d/%d",
                               len(dim_ips_up), len(dim_ip_list))

            # 8. submit the graph in a thread (wait for mm to start)
            def submit():
                if not check_host('localhost', MASTER_DEFAULT_REST_PORT,
                                  timeout=GRAPH_SUBMIT_WAIT_TIME):
                    logger.warning("Master Manager didn't come up in %d seconds",
                                   GRAPH_SUBMIT_WAIT_TIME)
                # The submission is attempted even after the warning above
                tool.submit('localhost', MASTER_DEFAULT_REST_PORT, pg)
            threading.Thread(target=submit).start()

            # 9. start dlgMM using islands IP addresses (this will block)
            start_mm(dim_ip_list, log_dir, logv=logv)

        else:
            dim_ranks = None
            dim_ranks = comm.bcast(dim_ranks, root=0)
            logger.debug("Receiving dim_ranks = {0}, my rank is {1}".format(dim_ranks, rank))
            if (rank in dim_ranks):
                logger.debug("Rank {0} is a DIM preparing for receiving".format(rank))
                # island manager
                # get a list of nodes that are its children from rank 0 (MM)
                nm_list = comm.recv(source=0)
                # no need to wait for node managers since the master manager
                # has already made sure they are up running
                logger.debug("nm_list for DIM {0} is {1}".format(rank, nm_list))
                start_dim(nm_list, log_dir, logv=logv)
            else:
                # node manager
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv, max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)
def test_fullRound(self):
    """
    A test that exercises most of the REST interface exposed on top of the
    DataIslandManager
    """

    sessionId = 'lala'
    restPort = 8888

    args = ['--port', str(restPort), '-N', hostname, '-qqq']
    dimProcess = tool.start_process('dim', args)

    with testutils.terminating(dimProcess, 10):

        # Wait until the REST server becomes alive
        self.assertTrue(utils.portIsOpen('localhost', restPort, 10),
                        "REST server didn't come up in time")

        # The DIM is still empty
        sessions = testutils.get(self, '/sessions', restPort)
        self.assertEqual(0, len(sessions))
        dimStatus = testutils.get(self, '', restPort)
        self.assertEqual(1, len(dimStatus['hosts']))
        self.assertEqual(hostname, dimStatus['hosts'][0])
        self.assertEqual(0, len(dimStatus['sessionIds']))

        # Create a session and check it exists
        testutils.post(self, '/sessions', restPort,
                       '{"sessionId":"%s"}' % (sessionId))
        sessions = testutils.get(self, '/sessions', restPort)
        self.assertEqual(1, len(sessions))
        self.assertEqual(sessionId, sessions[0]['sessionId'])
        self.assertDictEqual({hostname: SessionStates.PRISTINE},
                             sessions[0]['status'])

        # Add this complex graph spec to the session
        # The UID of the two leaf nodes of this complex.js graph are T and S
        # Since the original complexGraph doesn't have node information
        # we need to add it manually before submitting -- otherwise it will
        # get rejected by the DIM.
        with pkg_resources.resource_stream(
                'test', 'graphs/complex.js') as f:  # @UndefinedVariable
            complexGraphSpec = json.load(codecs.getreader('utf-8')(f))
        for dropSpec in complexGraphSpec:
            dropSpec['node'] = hostname
        testutils.post(self, '/sessions/%s/graph/append' % (sessionId),
                       restPort, json.dumps(complexGraphSpec))
        self.assertEqual(
            {hostname: SessionStates.BUILDING},
            testutils.get(self, '/sessions/%s/status' % (sessionId), restPort))

        # Now we deploy the graph...
        testutils.post(self, '/sessions/%s/deploy' % (sessionId), restPort,
                       "completed=SL_A,SL_B,SL_C,SL_D,SL_K",
                       mimeType='application/x-www-form-urlencoded')
        self.assertEqual(
            {hostname: SessionStates.RUNNING},
            testutils.get(self, '/sessions/%s/status' % (sessionId), restPort))

        # ...and write to all 5 root nodes that are listening in ports
        # starting at 1111
        msg = os.urandom(10)
        for i in range(5):
            # The call used to be followed by ", <message>", which only built
            # a discarded tuple and looked like an assertion message. The
            # result of write_to was never checked, and still is not.
            utils.write_to('localhost', 1111 + i, msg, 2)

        # Wait until the graph has finished its execution. We'll know
        # it finished by polling the status of the session
        while SessionStates.RUNNING in testutils.get(
                self, '/sessions/%s/status' % (sessionId), restPort).values():
            time.sleep(0.2)

        self.assertEqual(
            {hostname: SessionStates.FINISHED},
            testutils.get(self, '/sessions/%s/status' % (sessionId), restPort))
        testutils.delete(self, '/sessions/%s' % (sessionId), restPort)
        sessions = testutils.get(self, '/sessions', restPort)
        self.assertEqual(0, len(sessions))
def test_fullRound(self):
    """
    A test that exercises most of the REST interface exposed on top of the
    DataIslandManager
    """

    sessionId = "lala"
    restPort = 8989  # don't interfere with EAGLE default port

    args = ["--port", str(restPort), "-N", hostname, "-qqq"]
    dimProcess = tool.start_process("dim", args)

    with testutils.terminating(dimProcess, timeout=10):

        # Wait until the REST server becomes alive
        self.assertTrue(
            utils.portIsOpen("localhost", restPort, timeout=10),
            "REST server didn't come up in time",
        )

        # The DIM is still empty
        sessions = testutils.get(self, "/sessions", restPort)
        self.assertEqual(0, len(sessions))
        dimStatus = testutils.get(self, "", restPort)
        self.assertEqual(1, len(dimStatus["hosts"]))
        self.assertEqual(hostname, dimStatus["hosts"][0])
        self.assertEqual(0, len(dimStatus["sessionIds"]))

        # Create a session and check it exists
        testutils.post(
            self, "/sessions", restPort, '{"sessionId":"%s"}' % (sessionId)
        )
        sessions = testutils.get(self, "/sessions", restPort)
        self.assertEqual(1, len(sessions))
        self.assertEqual(sessionId, sessions[0]["sessionId"])
        self.assertDictEqual(
            {hostname: SessionStates.PRISTINE}, sessions[0]["status"]
        )

        # Add this complex graph spec to the session
        # The UID of the two leaf nodes of this complex.js graph are T and S
        # Since the original complexGraph doesn't have node information
        # we need to add it manually before submitting -- otherwise it will
        # get rejected by the DIM.
        with pkg_resources.resource_stream(
            "test", "graphs/complex.js"
        ) as f:  # @UndefinedVariable
            complexGraphSpec = json.load(codecs.getreader("utf-8")(f))
            logger.debug(f"Loaded graph: {f}")
        for dropSpec in complexGraphSpec:
            dropSpec["node"] = hostname
        testutils.post(
            self,
            "/sessions/%s/graph/append" % (sessionId),
            restPort,
            json.dumps(complexGraphSpec),
        )
        self.assertEqual(
            {hostname: SessionStates.BUILDING},
            testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
        )

        # Now we deploy the graph...
        testutils.post(
            self,
            "/sessions/%s/deploy" % (sessionId),
            restPort,
            "completed=SL_A,SL_B,SL_C,SL_D,SL_K",
            mimeType="application/x-www-form-urlencoded",
        )
        self.assertEqual(
            {hostname: SessionStates.RUNNING},
            testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
        )

        # ...and write to all 5 root nodes that are listening in ports
        # starting at 1111
        msg = os.urandom(10)
        for i in range(5):
            # The call used to be followed by ", <message>", which only built
            # a discarded tuple and looked like an assertion message. The
            # result of write_to was never checked, and still is not.
            utils.write_to("localhost", 1111 + i, msg, 2)

        # Wait until the graph has finished its execution. We'll know
        # it finished by polling the status of the session
        while (
            SessionStates.RUNNING
            in testutils.get(
                self, "/sessions/%s/status" % (sessionId), restPort
            ).values()
        ):
            time.sleep(0.2)

        self.assertEqual(
            {hostname: SessionStates.FINISHED},
            testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
        )
        testutils.delete(self, "/sessions/%s" % (sessionId), restPort)
        sessions = testutils.get(self, "/sessions", restPort)
        self.assertEqual(0, len(sessions))
def main():
    """Start a DALiuGE cluster using the configured remote mechanism.

    Parses the command line, obtains a ``remote`` object through
    ``get_remote(options)`` and uses what it reports (``is_nm``,
    ``is_proxy``, ``dim_ips``, ``is_highest_level_manager``, ...) to decide
    which managers this process starts. With a single island the node that
    hosts the island manager also submits and monitors the graph and, once
    the monitoring thread ends, stops the island and node managers. With
    several islands the highest level manager starts the master manager and
    the remaining processes start island managers.

    ``--check-interfaces`` and ``--collect-interfaces`` print interface
    information and exit with status 0 before any manager is started.
    """
    parser = optparse.OptionParser()
    parser.add_option(
        "-l",
        "--log_dir",
        action="store",
        type="string",
        dest="log_dir",
        help="Log directory (required)",
    )
    # if this parameter is present, it means we want to get monitored
    parser.add_option(
        "-m",
        "--monitor_host",
        action="store",
        type="string",
        dest="monitor_host",
        help="Monitor host IP (optional)",
    )
    parser.add_option(
        "-o",
        "--monitor_port",
        action="store",
        type="int",
        dest="monitor_port",
        help="Monitor port",
        default=dlg_proxy.default_dlg_monitor_port,
    )
    parser.add_option(
        "-v",
        "--verbose-level",
        action="store",
        type="int",
        dest="verbose_level",
        help="Verbosity level (1-3) of the DIM/NM logging",
        default=1,
    )
    parser.add_option(
        "-z",
        "--zerorun",
        action="store_true",
        dest="zerorun",
        help="Generate a physical graph that takes no time to run",
        default=False,
    )
    parser.add_option(
        "--app",
        action="store",
        type="int",
        dest="app",
        help="The app to use in the PG. 1=SleepApp (default), 2=SleepAndCopy",
        default=0,
    )
    parser.add_option(
        "-t",
        "--max-threads",
        action="store",
        type="int",
        dest="max_threads",
        help="Max thread pool size used for executing drops. 0 (default) means no pool.",
        default=0,
    )
    parser.add_option(
        "-L",
        "--logical-graph",
        action="store",
        type="string",
        dest="logical_graph",
        help="The filename of the logical graph to deploy",
        default=None,
    )
    parser.add_option(
        "-P",
        "--physical-graph",
        action="store",
        type="string",
        dest="physical_graph",
        help="The filename of the physical graph (template) to deploy",
        default=None,
    )
    parser.add_option(
        "-s",
        "--num_islands",
        action="store",
        type="int",
        dest="num_islands",
        default=1,
        help="The number of Data Islands",
    )
    parser.add_option(
        "-d",
        "--dump",
        action="store_true",
        dest="dump",
        help="dump file base name?",
        default=False,
    )
    parser.add_option(
        "-i",
        "--interface",
        type="int",
        help="Index of network interface to use as the external interface/address for each host",
        default=0,
    )
    parser.add_option(
        "--part-algo",
        type="string",
        dest="part_algo",
        help="Partition algorithms",
        default="metis",
    )
    parser.add_option(
        "-A",
        "--algo-param",
        action="append",
        dest="algo_params",
        help="Extra name=value parameters used by the algorithms (algorithm-specific)",
    )
    parser.add_option("--ssid", type="string", dest="ssid", help="session id", default="")
    parser.add_option(
        "-u",
        "--all_nics",
        action="store_true",
        dest="all_nics",
        help="Listen on all NICs for a node manager",
        default=True,
    )
    parser.add_option(
        "--check-interfaces",
        action="store_true",
        dest="check_interfaces",
        help="Run a small network interfaces test and exit",
        default=False,
    )
    parser.add_option(
        "--collect-interfaces",
        action="store_true",
        dest="collect_interfaces",
        help="Collect all interfaces and exit",
        default=False,
    )
    parser.add_option(
        "--use-ifconfig",
        action="store_true",
        dest="use_ifconfig",
        help="Use ifconfig to find a suitable external interface/address for each host",
        default=False,
    )
    parser.add_option(
        "-S",
        "--check_with_session",
        action="store_true",
        dest="check_with_session",
        help="Check for node managers' availability by creating/destroy a session",
        default=False,
    )
    parser.add_option(
        "--event-listeners",
        action="store",
        type="string",
        dest="event_listeners",
        help="A colon-separated list of event listener classes to be used",
        default="",
    )
    parser.add_option(
        "--sleep-after-execution",
        action="store",
        type="int",
        dest="sleep_after_execution",
        help="Sleep time interval after graph execution finished",
        default=0,
    )
    parser.add_option(
        "--pg-modifiers",
        help=(
            "A colon-separated list of python functions that modify a PG before submission. "
            "Each specification is in the form of <funcname>[,[arg1=]val1][,[arg2=]val2]..."
        ),
        default="",
    )
    parser.add_option(
        "-r",
        "--remote-mechanism",
        help="The mechanism used by this script to coordinate remote processes",
        choices=["mpi", "slurm", "dlg", "dlg-hybrid"],
        default="mpi",
    )
    parser.add_option(
        "--co-host-dim",
        action="store_true",
        dest="co_host_dim",
        help="Start DIM on first NM node",
        default=True,
    )

    (options, _) = parser.parse_args()

    if options.check_interfaces:
        # Each lookup is best-effort so that one failing mechanism does not
        # hide the result of the other. Only ordinary errors are caught:
        # KeyboardInterrupt/SystemExit must still be able to stop the run.
        try:
            print("From netifaces: %s" % get_ip_via_netifaces(options.interface))
        except Exception:
            LOGGER.exception("Failed to get information via netifaces")
        try:
            print("From ifconfig: %s" % get_ip_via_ifconfig(options.interface))
        except Exception:
            LOGGER.exception("Failed to get information via ifconfig")
        sys.exit(0)
    elif options.collect_interfaces:
        from mpi4py import MPI

        comm = MPI.COMM_WORLD  # @UndefinedVariable
        ips = comm.allgather(get_ip(options))
        if comm.Get_rank() == 0:
            print(" ".join(ips))
        sys.exit(0)

    # Exactly one of the two graph options must be given
    if bool(options.logical_graph) == bool(options.physical_graph):
        parser.error(
            "Either a logical graph or physical graph filename must be specified"
        )
    for graph_file_name in (options.logical_graph, options.physical_graph):
        if graph_file_name and not os.path.exists(graph_file_name):
            parser.error("Cannot locate graph file at '{0}'".format(graph_file_name))

    if options.monitor_host is not None and options.num_islands > 1:
        parser.error("We do not support proxy monitor multiple islands yet")

    remote = get_remote(options)

    # Each node logs into its own sub-directory, named after its IP
    log_dir = "{0}/{1}".format(options.log_dir, remote.my_ip)
    os.makedirs(log_dir)
    logfile = log_dir + "/start_dlg_cluster.log"
    log_format = (
        "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] "
        "%(name)s#%(funcName)s:%(lineno)s %(message)s"
    )
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format=log_format)

    LOGGER.info("This node has IP address: %s", remote.my_ip)

    envfile_name = os.path.join(log_dir, "env.txt")
    LOGGER.debug("Dumping process' environment to %s", envfile_name)
    with open(envfile_name, "wt") as env_file:
        for name, value in sorted(os.environ.items()):
            env_file.write("%s=%s\n" % (name, value))

    # Clamp the verbosity to the supported 1-3 range
    logv = max(min(3, options.verbose_level), 1)

    # need to dump nodes file first
    if remote.is_highest_level_manager:
        LOGGER.info(f"Node {remote.my_ip} is hosting the highest level manager")
        nodesfile = os.path.join(log_dir, "nodes.txt")
        LOGGER.debug("Dumping list of nodes to %s", nodesfile)
        with open(nodesfile, "wt") as nodes_file:
            nodes_file.write("\n".join(remote.sorted_peers))

    dim_proc = None
    # start the NM
    if options.num_islands == 1:
        REST_PORT = ISLAND_DEFAULT_REST_PORT

        # need to check for NM first and go on if co-hosted
        if remote.is_nm:
            co_hosted = remote.my_ip in remote.dim_ips
            nm_proc = start_node_mgr(
                log_dir,
                remote.my_ip,
                logv=logv,
                max_threads=options.max_threads,
                host=None if options.all_nics else remote.my_ip,
                event_listeners=options.event_listeners,
                use_tool=co_hosted,
            )

        if remote.is_proxy:
            # Wait until the Island Manager is open
            if utils.portIsOpen(remote.hl_mgr_ip, ISLAND_DEFAULT_REST_PORT, 100):
                start_proxy(
                    remote.hl_mgr_ip,
                    ISLAND_DEFAULT_REST_PORT,
                    options.monitor_host,
                    options.monitor_port,
                )
            else:
                LOGGER.warning(
                    "Couldn't connect to the main drop manager, proxy not started"
                )
        elif remote.my_ip in remote.dim_ips:
            LOGGER.info(f"Starting island managers on nodes: {remote.dim_ips}")
            dim_proc = start_dim(remote.nm_ips, log_dir, remote.my_ip, logv=logv)
            # whichever way we came from, now we have to wait until session is finished
            # we always monitor the island, else we will have race conditions
            physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips)
            # NOTE(review): co_hosted is only bound when remote.is_nm is true;
            # a DIM host that is not also an NM would fail here with a
            # NameError -- confirm whether that combination can occur.
            monitoring_thread = submit_and_monitor(
                physical_graph,
                options,
                remote.dim_ips[0],
                REST_PORT,
                submit=co_hosted,
            )
            monitoring_thread.join()
            # now the session is finished

            # still shutting DIM down first to avoid monitoring conflicts
            stop_dims(remote.dim_ips)
            # now stop all the NMs
            stop_nms(remote.nm_ips)

    elif remote.is_highest_level_manager:
        # TODO: In the case of more than one island the NMs are not yet started
        physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips)
        remote.send_dim_nodes(physical_graph)

        # 7. make sure all DIMs are up running
        dim_ips_up = check_hosts(
            remote.dim_ips, ISLAND_DEFAULT_REST_PORT, timeout=MM_WAIT_TIME, retry=10
        )
        if len(dim_ips_up) < len(remote.dim_ips):
            LOGGER.warning(
                "Not all DIMs were up and running: %d/%d",
                len(dim_ips_up),
                len(remote.dim_ips),
            )

        monitoring_thread = submit_and_monitor(
            physical_graph, options, remote.my_ip, MASTER_DEFAULT_REST_PORT
        )
        mm_proc = start_mm(remote.dim_ips, log_dir, logv=logv)
        monitoring_thread.join()
        # TODO: I don't think we need this and least not in the single island case
        stop_mm(remote.my_ip)
        stop_dims(remote.dim_ips)
    else:
        nm_ips = remote.recv_dim_nodes()
        proc = start_dim(nm_ips, log_dir, remote.my_ip, logv=logv)
        utils.wait_or_kill(proc, 1e8, period=5)
        stop_nms(remote.nm_ips)