Example No. 1
    def test_nothing_starts(self):
        # Nothing should start now
        self.create_daemon(master=False, noNM=True, disable_zeroconf=True)
        self.assertFalse(utils.portIsOpen('localhost', constants.NODE_DEFAULT_REST_PORT, 0),
                         'NM started but it should not have')
        self.assertFalse(utils.portIsOpen('localhost', constants.MASTER_DEFAULT_REST_PORT, 0),
                         'MM started but it should not have')
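
A note on the probes above: passing a timeout of 0 to utils.portIsOpen asks for a single immediate check rather than waiting for the port to come up. The real helper lives in the project's utils module and is not shown in this listing; a minimal sketch of what such a connect-based probe typically looks like (the name port_is_open and the 0.1s retry interval are assumptions, not the actual DALiuGE code):

import socket
import time

def port_is_open(host, port, timeout):
    # Keep trying to connect until the deadline passes; timeout=0 probes once.
    deadline = time.time() + timeout
    while True:
        try:
            socket.create_connection((host, port), timeout=1).close()
            return True
        except OSError:
            if time.time() >= deadline:
                return False
            time.sleep(0.1)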
Example No. 2
    def create_daemon(self, *args, **kwargs):
        self._daemon = DlgDaemon(*args, **kwargs)

        if not kwargs.get('noNM', False):
            self.assertTrue(utils.portIsOpen('localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT), 'The NM did not start successfully')
        if kwargs.get('master', False):
            self.assertTrue(utils.portIsOpen('localhost', constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT), 'The MM did not start successfully')

        self._daemon_t = threading.Thread(target=lambda: self._daemon.start('localhost', 9000))
        self._daemon_t.start()

        # Wait until the daemon's server has started.
        # We can't simply check whether the port is open, because the server
        # binds before it is returned to us. In some tests we don't interact
        # with it, so the daemon could be shut down before the server is even
        # returned to us. This can happen because portIsOpen succeeds against
        # a bound server even if serve_forever() hasn't been called on it yet.
        # In that situation shutting down the daemon does not shut down the
        # HTTP server, and the test later fails when checking that
        # self._daemon_t is no longer alive.
        #
        # To avoid this we perform some actual HTTP traffic, which ensures the
        # server is really serving requests and is therefore already in the
        # daemon's hands.
        #self.assertTrue(utils.portIsOpen('localhost', 9000, _TIMEOUT))
        try:
            restutils.RestClient('localhost', 9000, 10)._GET('/anything')
        except restutils.RestClientException:
            # We don't care about the result
            pass
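
The subtlety described in the comment above is easy to reproduce in isolation: once a TCP socket is bound and listening, the kernel completes incoming handshakes and queues them in the backlog, so a connect-based check such as portIsOpen succeeds even though no application code is serving requests yet. A self-contained sketch using the same hypothetical port 9000:

import socket

server = socket.socket()
server.bind(('localhost', 9000))
server.listen(1)  # bound and listening, but accept() is never called

# The probe still succeeds: the kernel accepts the handshake into the
# backlog without any application code running.
probe = socket.create_connection(('localhost', 9000), timeout=1)
probe.close()
server.close()

This is exactly why the test performs a real GET request instead of relying on a port check.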
Example No. 3
    def test_start_master_via_rest(self):

        self.create_daemon(master=False, noNM=False, disable_zeroconf=True)

        # Check that the master starts
        self._start('master', httplib.OK)
        self.assertTrue(utils.portIsOpen('localhost', constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT), 'The MM did not start successfully')
Example No. 4
    def test_stop_start_node_via_rest(self):

        # test both stop and start of NM via REST
        self.create_daemon(master=True, noNM=False, disable_zeroconf=False)

        # Both managers started fine. If they zeroconf themselves correctly,
        # then querying the MM should reveal its nodes, which should contain
        # exactly one element
        mc = MasterManagerClient()
        nodes = _get_nodes_from_client(mc)
        self.assertIsNotNone(nodes)
        self.assertEqual(
            1,
            len(nodes),
            "MasterManager didn't find the NodeManager running on the same node",
        )

        # Check that the NM stops
        self._stop("node", http.HTTPStatus.OK, "")
        self.assertTrue(
            utils.portIsClosed('localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT),
            "The node did not stop successfully",
        )

        # Check that the NM starts
        self._start("node", http.HTTPStatus.OK, {"pid": nodes})
        self.assertTrue(
            utils.portIsOpen('localhost', constants.NODE_DEFAULT_REST_PORT, _TIMEOUT),
            "The node did not start successfully",
        )
Example No. 5
    def _start_manager_in_thread(self, port, manager_class, rest_class,
                                 *manager_args, **manager_kwargs):
        manager = manager_class(*manager_args, **manager_kwargs)
        server = rest_class(manager)
        thread = threading.Thread(target=server.start,
                                  args=("127.0.0.1", port))
        thread.start()
        self.assertTrue(portIsOpen("127.0.0.1", port, 5))
        return ManagerInfo(manager, server, thread, self)
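
ManagerInfo is not defined in this snippet; judging from the call site it is just a container for the four objects involved in starting a manager. A minimal sketch that would satisfy this code (the field names are assumptions):

import collections

# Hypothetical container: the last field holds the test case that
# started the manager, so it can clean up later.
ManagerInfo = collections.namedtuple('ManagerInfo',
                                     ['manager', 'server', 'thread', 'test'])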
Example No. 6
    def test_start_dataisland_via_rest(self):

        self.create_daemon(master=True, noNM=False, disable_zeroconf=False)

        # Both managers started fine. If they zeroconf themselves correctly,
        # then querying the MM should reveal its nodes, which should contain
        # exactly one element
        nodes = self._get_nodes_from_master(_TIMEOUT)
        self.assertIsNotNone(nodes)
        self.assertEqual(1, len(nodes), "MasterManager didn't find the NodeManager running on the same node")

        # Check that the DataIsland starts with the given nodes
        self._start('dataisland', httplib.OK, {'nodes': nodes})
        self.assertTrue(utils.portIsOpen('localhost', constants.ISLAND_DEFAULT_REST_PORT, _TIMEOUT), 'The DIM did not start successfully')
Example No. 7
    def test_start_stop_master_via_rest(self):
        # test both stop and start of MASTER via REST
        self.create_daemon(master=False, noNM=False, disable_zeroconf=True)

        # Check that the MM starts
        self._start("master", http.HTTPStatus.OK)
        self.assertTrue(
            utils.portIsOpen("localhost", constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT),
            "The MM did not start successfully",
        )

        # Check that the MM stops
        self._stop("master", http.HTTPStatus.OK, "")
        self.assertTrue(
            utils.portIsClosed("localhost", constants.MASTER_DEFAULT_REST_PORT, _TIMEOUT),
            "The MM did not stop successfully",
        )
Example No. 8
def check_host(host, port, timeout=5, check_with_session=False):
    """
    Checks if a given host/port is up and running (i.e., it is open).
    If ``check_with_session`` is ``True`` then it is assumed that the
    host/port combination corresponds to a Node Manager and the check is
    performed by attempting to create and delete a session.
    """
    if not check_with_session:
        return utils.portIsOpen(host, port, timeout)

    try:
        session_id = str(uuid.uuid4())
        with NodeManagerClient(host, port, timeout=timeout) as c:
            c.create_session(session_id)
            c.destroy_session(session_id)
        return True
    except Exception:
        return False
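
The cluster startup code in the following examples calls a plural counterpart, check_hosts, with timeout, check_with_session and retry arguments. Its implementation is not shown in this listing, but a minimal sketch consistent with those call sites, reducing a host list to the ones that answer in time, might look like this (a sketch under those assumptions, not the actual DALiuGE code):

def check_hosts(ips, port, timeout=None, check_with_session=False, retry=1):
    # Keep only the hosts whose manager answers within the allotted retries
    alive = []
    for ip in ips:
        for _ in range(max(1, retry)):
            if check_host(ip, port, timeout=timeout or 5,
                          check_with_session=check_with_session):
                alive.append(ip)
                break
    return alive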
Example No. 9
def main():

    parser = optparse.OptionParser()
    parser.add_option("-l", "--log_dir", action="store", type="string",
                    dest="log_dir", help="Log directory (required)")
    # if this parameter is present, it means we want to get monitored
    parser.add_option("-m", "--monitor_host", action="store", type="string",
                    dest="monitor_host", help="Monitor host IP (optional)")
    parser.add_option("-o", "--monitor_port", action="store", type="int",
                    dest="monitor_port", help="Monitor port",
                    default=dfms_proxy.default_dlg_monitor_port)
    parser.add_option("-v", "--verbose-level", action="store", type="int",
                    dest="verbose_level", help="Verbosity level (1-3) of the DIM/NM logging",
                    default=1)
    parser.add_option("-z", "--zerorun", action="store_true",
                      dest="zerorun", help="Generate a physical graph that takes no time to run", default=False)
    parser.add_option("--app", action="store", type="int",
                      dest="app", help="The app to use in the PG. 1=SleepApp (default), 2=SleepAndCopy", default=0)

    parser.add_option("-t", "--max-threads", action="store", type="int",
                      dest="max_threads", help="Max thread pool size used for executing drops. 0 (default) means no pool.", default=0)

    parser.add_option("-L", "--logical-graph", action="store", type="string",
                      dest="logical_graph", help="The filename of the logical graph to deploy", default=None)
    parser.add_option("-P", "--physical-graph", action="store", type="string",
                      dest="physical_graph", help="The filename of the physical graph (template) to deploy", default=None)

    parser.add_option('-s', '--num_islands', action='store', type='int',
                    dest='num_islands', default=1, help='The number of Data Islands')

    parser.add_option('-d', '--dump', action='store_true',
                    dest='dump', help='dump file base name?', default=False)

    parser.add_option("-c", "--loc", action="store", type="string",
                    dest="loc", help="deployment location (e.g. 'Pawsey' or 'Tianhe2')",
                    default="Pawsey")

    parser.add_option('--part-algo', type="string", dest='part_algo', help='Partition algorithms',
                      default='metis')

    parser.add_option("-u", "--all_nics", action="store_true",
                      dest="all_nics", help="Listen on all NICs for a node manager", default=False)

    parser.add_option('--check-interfaces', action='store_true',
                      dest='check_interfaces', help='Run a small network interfaces test and exit', default=False)
    parser.add_option('--use-ifconfig', action='store_true',
                      dest='use_ifconfig', help='Use ifconfig to find a suitable external interface/address for each host', default=False)
    parser.add_option("-S", "--check_with_session", action="store_true",
                      dest="check_with_session", help="Check for node managers' availability by creating/destroying a session", default=False)

    (options, _) = parser.parse_args()

    if options.check_interfaces:
        print("From netifaces: %s" % get_ip_via_netifaces())
        print("From ifconfig: %s" % get_ip_via_ifconfig())
        sys.exit(0)

    if bool(options.logical_graph) == bool(options.physical_graph):
        parser.error("Either a logical graph or physical graph filename must be specified")
    for p in (options.logical_graph, options.physical_graph):
        if p and not os.path.exists(p):
            parser.error("Cannot locate graph file at '{0}'".format(p))

    if (options.monitor_host is not None and options.num_islands > 1):
        parser.error("We do not support proxy monitoring of multiple islands yet")

    logv = max(min(3, options.verbose_level), 1)

    from mpi4py import MPI  # @UnresolvedImport
    comm = MPI.COMM_WORLD  # @UndefinedVariable
    num_procs = comm.Get_size()
    rank = comm.Get_rank()

    log_dir = "{0}/{1}".format(options.log_dir, rank)
    os.makedirs(log_dir)
    logfile = log_dir + "/start_dlg_cluster.log"
    FORMAT = "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] %(name)s#%(funcName)s:%(lineno)s %(message)s"
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format=FORMAT)

    if (num_procs > 1 and options.monitor_host is not None):
        logger.info("Trying to start DALiuGE cluster with proxy")
        run_proxy = True
        threshold = 2
    else:
        logger.info("Trying to start DALiuGE cluster without proxy")
        run_proxy = False
        threshold = 1

    if (num_procs == threshold):
        logger.warning("No MPI processes left for running Drop Managers")
        run_node_mgr = False
    else:
        run_node_mgr = True

    # attach rank information at the end of the IP address for multi-island
    # runs (e.g. '10.0.0.1,3' for rank 3)
    rank_str = '' if options.num_islands == 1 else ',%s' % rank
    find_ip = get_ip_via_ifconfig if options.use_ifconfig else get_ip_via_netifaces
    public_ip = find_ip(options.loc)
    ip_adds = '{0}{1}'.format(public_ip, rank_str)
    origin_ip = ip_adds.split(',')[0]
    ip_adds = comm.gather(ip_adds, root=0)

    proxy_ip = None
    if run_proxy:
        # send island/master manager's IP address to the DALiuGE proxy
        # also let island manager know the DALiuGE proxy's IP
        if rank == 0:
            mgr_ip = origin_ip
            comm.send(mgr_ip, dest=1)
            proxy_ip = comm.recv(source=1)
        elif rank == 1:
            mgr_ip = comm.recv(source=0)
            proxy_ip = origin_ip
            comm.send(proxy_ip, dest=0)

    set_env(rank)
    if (options.num_islands == 1):
        if (rank != 0):
            if (run_proxy and rank == 1):
                # Wait until the Island Manager is open
                if utils.portIsOpen(mgr_ip, ISLAND_DEFAULT_REST_PORT, 100):
                    start_proxy(options.loc, mgr_ip, ISLAND_DEFAULT_REST_PORT, options.monitor_host, options.monitor_port)
                else:
                    logger.warning("Couldn't connect to the main drop manager, proxy not started")
            elif (run_node_mgr):
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv,
                               max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)
        else:

            # 'no_nms' are known not to be NMs
            no_nms = [origin_ip, 'None']
            if proxy_ip:
                no_nms += [proxy_ip]
            node_mgrs = [ip for ip in ip_adds if ip not in no_nms]

            # unroll the graph first (if any) while starting node managers on other nodes
            pgt = None
            if options.logical_graph or options.physical_graph:
                pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
                if options.logical_graph:
                    unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                    pgt = pg_generator.partition(unrolled, options.part_algo, num_partitions=len(node_mgrs))
                    pgt = pgt.to_pg_spec([], ret_str=False, num_islands=1, tpl_nodes_len=len(node_mgrs) + 1)
                    del unrolled
                else:
                    with open(options.physical_graph) as f:
                        pgt = json.load(f)

            # Check which NMs are up and use only those from now on
            node_mgrs = check_hosts(node_mgrs, NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # We have a PGT, let's map it and submit it
            if pgt:
                pg = tool.resource_map(pgt, [origin_ip] + node_mgrs, pip_name, options.num_islands)
                del pgt

                def submit_and_monitor():
                    host, port = 'localhost', ISLAND_DEFAULT_REST_PORT
                    tool.submit(host, port, pg)
                    if options.dump:
                        dump_path = '{0}/monitor'.format(log_dir)
                        monitor_graph(host, port, dump_path)

                threading.Thread(target=submit_and_monitor).start()

            # Start the DIM
            logger.info("Starting island manager on host %s", origin_ip)
            start_dim(node_mgrs, log_dir, logv=logv)

    elif (options.num_islands > 1):
        if (rank == 0):
            # master manager
            # 1. use ip_adds to produce the physical graph
            ip_list = []
            ip_rank_dict = dict() # k - ip, v - MPI rank
            for ipr in ip_adds:
                iprs = ipr.split(',')
                ip = iprs[0]
                r = iprs[1]
                if (ip == origin_ip or 'None' == ip):
                    continue
                ip_list.append(ip)
                ip_rank_dict[ip] = int(r)

            if (len(ip_list) <= options.num_islands):
                raise Exception("Insufficient nodes available for node managers")

            # 2. broadcast DIM ranks to all nodes to let them know who the DIMs are
            dim_ranks = []
            dim_ip_list = ip_list[0:options.num_islands]
            logger.info("A list of DIM IPs: {0}".format(dim_ip_list))
            for dim_ip in dim_ip_list:
                dim_ranks.append(ip_rank_dict[dim_ip])
            dim_ranks = comm.bcast(dim_ranks, root=0)

            # 3. unroll the graph while waiting for node managers to start
            pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
            if options.logical_graph:
                unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                pgt = pg_generator.partition(unrolled, options.part_algo, num_partitions=len(ip_list) - 1, num_islands=options.num_islands)
                pgt = pgt.to_pg_spec([], ret_str=False, num_islands=options.num_islands,
                                     tpl_nodes_len=len(ip_list) - 1 + options.num_islands)
                del unrolled
            else:
                with open(options.physical_graph) as f:
                    pgt = json.load(f)

            #logger.info("Waiting all node managers to start in %f seconds", MM_WAIT_TIME)
            node_mgrs = check_hosts(ip_list[options.num_islands:], NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # 4. produce the physical graph based on the node managers that
            # are already running (we have to assume the island manager will
            # run smoothly in the future)
            logger.info("Master Manager producing the physical graph")
            pg = tool.resource_map(pgt, dim_ip_list + node_mgrs, pip_name, options.num_islands)

            # 5. parse the pg_spec to get the mapping from islands to node list
            dim_rank_nodes_dict = collections.defaultdict(set)
            for drop in pg:
                dim_ip = drop['island']
                # if (not dim_ip in dim_ip_list):
                #     raise Exception("'{0}' node is not in island list {1}".format(dim_ip, dim_ip_list))
                r = ip_rank_dict[dim_ip]
                n = drop['node']
                dim_rank_nodes_dict[r].add(n)

            # 6. send a node list to each DIM so that it can start
            for dim_ip in dim_ip_list:
                r = ip_rank_dict[dim_ip]
                logger.debug("Sending node list to rank {0}".format(r))
                #TODO this should be in a thread since it is blocking!
                comm.send(list(dim_rank_nodes_dict[r]), dest=r)

            # 7. make sure all DIMs are up and running
            dim_ips_up = check_hosts(dim_ip_list, ISLAND_DEFAULT_REST_PORT, timeout=MM_WAIT_TIME, retry=10)
            if len(dim_ips_up) < len(dim_ip_list):
                logger.warning("Not all DIMs were up and running: %d/%d", len(dim_ips_up), len(dim_ip_list))

            # 8. submit the graph in a thread (wait for mm to start)
            def submit():
                if not check_host('localhost', MASTER_DEFAULT_REST_PORT, timeout=GRAPH_SUBMIT_WAIT_TIME):
                    logger.warning("Master Manager didn't come up in %d seconds", GRAPH_SUBMIT_WAIT_TIME)
                tool.submit('localhost', MASTER_DEFAULT_REST_PORT, pg)
            threading.Thread(target=submit).start()

            # 9. start dlgMM using islands IP addresses (this will block)
            start_mm(dim_ip_list, log_dir, logv=logv)

        else:
            dim_ranks = None
            dim_ranks = comm.bcast(dim_ranks, root=0)
            logger.debug("Receiving dim_ranks = {0}, my rank is {1}".format(dim_ranks, rank))
            if (rank in dim_ranks):
                logger.debug("Rank {0} is a DIM preparing for receiving".format(rank))
                # island manager
                # get a list of nodes that are its children from rank 0 (MM)
                nm_list = comm.recv(source=0)
                # no need to wait for node managers since the master manager
                # has already made sure they are up and running
                logger.debug("nm_list for DIM {0} is {1}".format(rank, nm_list))
                start_dim(nm_list, log_dir, logv=logv)
            else:
                # node manager
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv,
                               max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)
Example No. 10
    def test_fullRound(self):
        """
        A test that exercises most of the REST interface exposed on top of the
        DataIslandManager
        """

        sessionId = 'lala'
        restPort = 8888
        args = ['--port', str(restPort), '-N', hostname, '-qqq']
        dimProcess = tool.start_process('dim', args)

        with testutils.terminating(dimProcess, 10):

            # Wait until the REST server becomes alive
            self.assertTrue(utils.portIsOpen('localhost', restPort, 10),
                            "REST server didn't come up in time")

            # The DIM is still empty
            sessions = testutils.get(self, '/sessions', restPort)
            self.assertEqual(0, len(sessions))
            dimStatus = testutils.get(self, '', restPort)
            self.assertEqual(1, len(dimStatus['hosts']))
            self.assertEqual(hostname, dimStatus['hosts'][0])
            self.assertEqual(0, len(dimStatus['sessionIds']))

            # Create a session and check it exists
            testutils.post(self, '/sessions', restPort,
                           '{"sessionId":"%s"}' % (sessionId))
            sessions = testutils.get(self, '/sessions', restPort)
            self.assertEqual(1, len(sessions))
            self.assertEqual(sessionId, sessions[0]['sessionId'])
            self.assertDictEqual({hostname: SessionStates.PRISTINE},
                                 sessions[0]['status'])

            # Add this complex graph spec to the session
            # The UIDs of the two leaf nodes of this complex.js graph are T and S
            # Since the original complexGraph doesn't have node information
            # we need to add it manually before submitting -- otherwise it will
            # get rejected by the DIM.
            with pkg_resources.resource_stream(
                    'test', 'graphs/complex.js') as f:  # @UndefinedVariable
                complexGraphSpec = json.load(codecs.getreader('utf-8')(f))
            for dropSpec in complexGraphSpec:
                dropSpec['node'] = hostname
            testutils.post(self, '/sessions/%s/graph/append' % (sessionId),
                           restPort, json.dumps(complexGraphSpec))
            self.assertEqual({hostname: SessionStates.BUILDING},
                             testutils.get(self,
                                           '/sessions/%s/status' % (sessionId),
                                           restPort))

            # Now we deploy the graph...
            testutils.post(self,
                           '/sessions/%s/deploy' % (sessionId),
                           restPort,
                           "completed=SL_A,SL_B,SL_C,SL_D,SL_K",
                           mimeType='application/x-www-form-urlencoded')
            self.assertEqual({hostname: SessionStates.RUNNING},
                             testutils.get(self,
                                           '/sessions/%s/status' % (sessionId),
                                           restPort))

            # ...and write to all 5 root nodes that are listening on ports
            # starting at 1111
            msg = os.urandom(10)
            for i in range(5):
                utils.write_to('localhost', 1111 + i, msg, 2)

            # Wait until the graph has finished its execution. We'll know
            # it finished by polling the status of the session
            while SessionStates.RUNNING in testutils.get(
                    self, '/sessions/%s/status' % (sessionId),
                    restPort).values():
                time.sleep(0.2)

            self.assertEqual({hostname: SessionStates.FINISHED},
                             testutils.get(self,
                                           '/sessions/%s/status' % (sessionId),
                                           restPort))
            testutils.delete(self, '/sessions/%s' % (sessionId), restPort)
            sessions = testutils.get(self, '/sessions', restPort)
            self.assertEqual(0, len(sessions))
Example No. 11
    def test_fullRound(self):
        """
        A test that exercises most of the REST interface exposed on top of the
        DataIslandManager
        """

        sessionId = "lala"
        restPort = 8989  # don't interfere with EAGLE default port
        args = ["--port", str(restPort), "-N", hostname, "-qqq"]
        dimProcess = tool.start_process("dim", args)

        with testutils.terminating(dimProcess, timeout=10):

            # Wait until the REST server becomes alive
            self.assertTrue(
                utils.portIsOpen("localhost", restPort, timeout=10),
                "REST server didn't come up in time",
            )

            # The DIM is still empty
            sessions = testutils.get(self, "/sessions", restPort)
            self.assertEqual(0, len(sessions))
            dimStatus = testutils.get(self, "", restPort)
            self.assertEqual(1, len(dimStatus["hosts"]))
            self.assertEqual(hostname, dimStatus["hosts"][0])
            self.assertEqual(0, len(dimStatus["sessionIds"]))

            # Create a session and check it exists
            testutils.post(
                self, "/sessions", restPort, '{"sessionId":"%s"}' % (sessionId)
            )
            sessions = testutils.get(self, "/sessions", restPort)
            self.assertEqual(1, len(sessions))
            self.assertEqual(sessionId, sessions[0]["sessionId"])
            self.assertDictEqual(
                {hostname: SessionStates.PRISTINE}, sessions[0]["status"]
            )

            # Add this complex graph spec to the session
            # The UIDs of the two leaf nodes of this complex.js graph are T and S
            # Since the original complexGraph doesn't have node information
            # we need to add it manually before submitting -- otherwise it will
            # get rejected by the DIM.
            with pkg_resources.resource_stream(
                "test", "graphs/complex.js"
            ) as f:  # @UndefinedVariable
                complexGraphSpec = json.load(codecs.getreader("utf-8")(f))
                logger.debug(f"Loaded graph: {f}")
            for dropSpec in complexGraphSpec:
                dropSpec["node"] = hostname
            testutils.post(
                self,
                "/sessions/%s/graph/append" % (sessionId),
                restPort,
                json.dumps(complexGraphSpec),
            )
            self.assertEqual(
                {hostname: SessionStates.BUILDING},
                testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
            )

            # Now we deploy the graph...
            testutils.post(
                self,
                "/sessions/%s/deploy" % (sessionId),
                restPort,
                "completed=SL_A,SL_B,SL_C,SL_D,SL_K",
                mimeType="application/x-www-form-urlencoded",
            )
            self.assertEqual(
                {hostname: SessionStates.RUNNING},
                testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
            )

            # ...and write to all 5 root nodes that are listening on ports
            # starting at 1111
            msg = os.urandom(10)
            for i in range(5):
                utils.write_to("localhost", 1111 + i, msg, 2)

            # Wait until the graph has finished its execution. We'll know
            # it finished by polling the status of the session
            while (
                SessionStates.RUNNING
                in testutils.get(
                    self, "/sessions/%s/status" % (sessionId), restPort
                ).values()
            ):
                time.sleep(0.2)

            self.assertEqual(
                {hostname: SessionStates.FINISHED},
                testutils.get(self, "/sessions/%s/status" % (sessionId), restPort),
            )
            testutils.delete(self, "/sessions/%s" % (sessionId), restPort)
            sessions = testutils.get(self, "/sessions", restPort)
            self.assertEqual(0, len(sessions))
Example No. 12
def main():
    parser = optparse.OptionParser()
    parser.add_option(
        "-l",
        "--log_dir",
        action="store",
        type="string",
        dest="log_dir",
        help="Log directory (required)",
    )
    # if this parameter is present, it means we want to get monitored
    parser.add_option(
        "-m",
        "--monitor_host",
        action="store",
        type="string",
        dest="monitor_host",
        help="Monitor host IP (optional)",
    )
    parser.add_option(
        "-o",
        "--monitor_port",
        action="store",
        type="int",
        dest="monitor_port",
        help="Monitor port",
        default=dlg_proxy.default_dlg_monitor_port,
    )
    parser.add_option(
        "-v",
        "--verbose-level",
        action="store",
        type="int",
        dest="verbose_level",
        help="Verbosity level (1-3) of the DIM/NM logging",
        default=1,
    )
    parser.add_option(
        "-z",
        "--zerorun",
        action="store_true",
        dest="zerorun",
        help="Generate a physical graph that takes no time to run",
        default=False,
    )
    parser.add_option(
        "--app",
        action="store",
        type="int",
        dest="app",
        help="The app to use in the PG. 1=SleepApp (default), 2=SleepAndCopy",
        default=0,
    )

    parser.add_option(
        "-t",
        "--max-threads",
        action="store",
        type="int",
        dest="max_threads",
        help=
        "Max thread pool size used for executing drops. 0 (default) means no pool.",
        default=0,
    )

    parser.add_option(
        "-L",
        "--logical-graph",
        action="store",
        type="string",
        dest="logical_graph",
        help="The filename of the logical graph to deploy",
        default=None,
    )
    parser.add_option(
        "-P",
        "--physical-graph",
        action="store",
        type="string",
        dest="physical_graph",
        help="The filename of the physical graph (template) to deploy",
        default=None,
    )

    parser.add_option(
        "-s",
        "--num_islands",
        action="store",
        type="int",
        dest="num_islands",
        default=1,
        help="The number of Data Islands",
    )

    parser.add_option(
        "-d",
        "--dump",
        action="store_true",
        dest="dump",
        help="dump file base name?",
        default=False,
    )

    parser.add_option(
        "-i",
        "--interface",
        type="int",
        help=
        "Index of network interface to use as the external interface/address for each host",
        default=0,
    )

    parser.add_option(
        "--part-algo",
        type="string",
        dest="part_algo",
        help="Partition algorithms",
        default="metis",
    )
    parser.add_option(
        "-A",
        "--algo-param",
        action="append",
        dest="algo_params",
        help=
        "Extra name=value parameters used by the algorithms (algorithm-specific)",
    )

    parser.add_option("--ssid",
                      type="string",
                      dest="ssid",
                      help="session id",
                      default="")

    parser.add_option(
        "-u",
        "--all_nics",
        action="store_true",
        dest="all_nics",
        help="Listen on all NICs for a node manager",
        default=True,
    )

    parser.add_option(
        "--check-interfaces",
        action="store_true",
        dest="check_interfaces",
        help="Run a small network interfaces test and exit",
        default=False,
    )
    parser.add_option(
        "--collect-interfaces",
        action="store_true",
        dest="collect_interfaces",
        help="Collect all interfaces and exit",
        default=False,
    )
    parser.add_option(
        "--use-ifconfig",
        action="store_true",
        dest="use_ifconfig",
        help=
        "Use ifconfig to find a suitable external interface/address for each host",
        default=False,
    )
    parser.add_option(
        "-S",
        "--check_with_session",
        action="store_true",
        dest="check_with_session",
        help=
        "Check for node managers' availability by creating/destroying a session",
        default=False,
    )

    parser.add_option(
        "--event-listeners",
        action="store",
        type="string",
        dest="event_listeners",
        help="A colon-separated list of event listener classes to be used",
        default="",
    )

    parser.add_option(
        "--sleep-after-execution",
        action="store",
        type="int",
        dest="sleep_after_execution",
        help="Sleep time interval after graph execution finished",
        default=0,
    )

    parser.add_option(
        "--pg-modifiers",
        help=
        ("A colon-separated list of python functions that modify a PG before submission. "
         "Each specification is in the form of <funcname>[,[arg1=]val1][,[arg2=]val2]..."
         ),
        default="",
    )

    parser.add_option(
        "-r",
        "--remote-mechanism",
        help="The mechanism used by this script to coordinate remote processes",
        choices=["mpi", "slurm", "dlg", "dlg-hybrid"],
        default="mpi",
    )

    parser.add_option(
        "--co-host-dim",
        action="store_true",
        dest="co_host_dim",
        help="Start DIM on first NM node",
        default=True,
    )

    (options, _) = parser.parse_args()

    if options.check_interfaces:
        try:
            print("From netifaces: %s" %
                  get_ip_via_netifaces(options.interface))
        except Exception:
            LOGGER.exception("Failed to get information via netifaces")
        try:
            print("From ifconfig: %s" % get_ip_via_ifconfig(options.interface))
        except Exception:
            LOGGER.exception("Failed to get information via ifconfig")
        sys.exit(0)
    elif options.collect_interfaces:
        from mpi4py import MPI

        comm = MPI.COMM_WORLD  # @UndefinedVariable
        ips = comm.allgather(get_ip(options))
        if comm.Get_rank() == 0:
            print(" ".join(ips))
        sys.exit(0)

    if bool(options.logical_graph) == bool(options.physical_graph):
        parser.error(
            "Either a logical graph or physical graph filename must be specified"
        )
    for graph_file_name in (options.logical_graph, options.physical_graph):
        if graph_file_name and not os.path.exists(graph_file_name):
            parser.error(
                "Cannot locate graph file at '{0}'".format(graph_file_name))

    if options.monitor_host is not None and options.num_islands > 1:
        parser.error("We do not support proxy monitoring of multiple islands yet")

    # if options.ssid == "":
    #     options.ssid = time.

    remote = get_remote(options)

    log_dir = "{0}/{1}".format(options.log_dir, remote.my_ip)
    os.makedirs(log_dir)
    logfile = log_dir + "/start_dlg_cluster.log"
    log_format = ("%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] "
                  "%(name)s#%(funcName)s:%(lineno)s %(message)s")
    logging.basicConfig(filename=logfile,
                        level=logging.DEBUG,
                        format=log_format)

    LOGGER.info("This node has IP address: %s", remote.my_ip)

    envfile_name = os.path.join(log_dir, "env.txt")
    LOGGER.debug("Dumping process' environment to %s", envfile_name)
    with open(envfile_name, "wt") as env_file:
        for name, value in sorted(os.environ.items()):
            env_file.write("%s=%s\n" % (name, value))

    logv = max(min(3, options.verbose_level), 1)

    # need to dump nodes file first
    if remote.is_highest_level_manager:
        LOGGER.info(
            f"Node {remote.my_ip} is hosting the highest level manager")
        nodesfile = os.path.join(log_dir, "nodes.txt")
        LOGGER.debug("Dumping list of nodes to %s", nodesfile)
        with open(nodesfile, "wt") as nodes_file:
            nodes_file.write("\n".join(remote.sorted_peers))
    dim_proc = None
    # start the NM
    if options.num_islands == 1:
        submit = True
        REST_PORT = ISLAND_DEFAULT_REST_PORT

        # need to check for the NM first and carry on if co-hosted
        co_hosted = remote.my_ip in remote.dim_ips
        if remote.is_nm:
            nm_proc = start_node_mgr(
                log_dir,
                remote.my_ip,
                logv=logv,
                max_threads=options.max_threads,
                host=None if options.all_nics else remote.my_ip,
                event_listeners=options.event_listeners,
                use_tool=co_hosted,
            )

        if remote.is_proxy:
            # Wait until the Island Manager is open
            if utils.portIsOpen(remote.hl_mgr_ip, ISLAND_DEFAULT_REST_PORT,
                                100):
                start_proxy(
                    remote.hl_mgr_ip,
                    ISLAND_DEFAULT_REST_PORT,
                    options.monitor_host,
                    options.monitor_port,
                )
            else:
                LOGGER.warning(
                    "Couldn't connect to the main drop manager, proxy not started"
                )
        elif remote.my_ip in remote.dim_ips:
            LOGGER.info(f"Starting island managers on nodes: {remote.dim_ips}")
            dim_proc = start_dim(remote.nm_ips,
                                 log_dir,
                                 remote.my_ip,
                                 logv=logv)
            # whichever way we got here, we now have to wait until the session
            # is finished; we always monitor the island, else we will have
            # race conditions
            physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips)
            monitoring_thread = submit_and_monitor(physical_graph,
                                                   options,
                                                   remote.dim_ips[0],
                                                   REST_PORT,
                                                   submit=co_hosted)
            monitoring_thread.join()
            # now the session is finished

            # still shutting DIM down first to avoid monitoring conflicts
            stop_dims(remote.dim_ips)
            # now stop all the NMs
            stop_nms(remote.nm_ips)

        # shouldn't need this in addition
        # if dim_proc is not None:
        #     # Stop DALiuGE.
        #     LOGGER.info("Stopping DALiuGE island manager on rank %d", remote.rank)
        #     utils.terminate_or_kill(dim_proc, 5)

    elif remote.is_highest_level_manager:
        # TODO: In the case of more than one island the NMs are not yet started

        physical_graph = get_pg(options, remote.nm_ips, remote.dim_ips)
        remote.send_dim_nodes(physical_graph)

        # 7. make sure all DIMs are up and running
        dim_ips_up = check_hosts(remote.dim_ips,
                                 ISLAND_DEFAULT_REST_PORT,
                                 timeout=MM_WAIT_TIME,
                                 retry=10)
        if len(dim_ips_up) < len(remote.dim_ips):
            LOGGER.warning(
                "Not all DIMs were up and running: %d/%d",
                len(dim_ips_up),
                len(remote.dim_ips),
            )

        monitoring_thread = submit_and_monitor(physical_graph, options,
                                               remote.my_ip,
                                               MASTER_DEFAULT_REST_PORT)
        mm_proc = start_mm(remote.dim_ips, log_dir, logv=logv)
        monitoring_thread.join()
        stop_mm(
            remote.my_ip
        )  # TODO: I don't think we need this, at least not in the single-island case
        stop_dims(remote.dim_ips)
    else:
        nm_ips = remote.recv_dim_nodes()
        proc = start_dim(nm_ips, log_dir, remote.my_ip, logv=logv)
        utils.wait_or_kill(proc, 1e8, period=5)
        stop_nms(remote.nm_ips)