Example #1
0
def get_pg(opts, nms, dims):
    """Build the Physical Graph that is eventually submitted to the cluster.

    Returns an empty list when ``opts`` names neither a logical nor a
    physical graph.  Otherwise the graph template is obtained (by unrolling
    and partitioning the logical graph, or by loading the physical graph
    file), the modifiers listed in ``opts.pg_modifiers`` are applied, the
    template is mapped onto ``dims`` plus the node managers that
    ``check_hosts`` returns, and the result is written as JSON into
    ``opts.log_dir`` before being returned.
    """
    if not (opts.logical_graph or opts.physical_graph):
        return []

    num_nms = len(nms)
    num_dims = len(dims)

    if not opts.logical_graph:
        # No logical graph: the template comes straight from the given file
        with open(opts.physical_graph, "rb") as pg_file:
            pgt = json.load(pg_file)
    else:
        unrolled = pg_generator.unroll(
            opts.logical_graph, opts.ssid, opts.zerorun, APPS[opts.app]
        )
        algo_params = parse_partition_algo_params(opts.algo_params)
        pgt = pg_generator.partition(
            unrolled,
            opts.part_algo,
            num_partitions=num_nms,
            num_islands=num_dims,
            **algo_params,
        )
        del unrolled  # quickly dispose of potentially big object

    # Apply each non-empty, colon-separated modifier to the template
    for modifier in opts.pg_modifiers.split(":"):
        if modifier:
            modify_pg(pgt, modifier)

    # Only the node managers reported by check_hosts are used from now on
    nms = check_hosts(
        nms,
        NODE_DEFAULT_REST_PORT,
        check_with_session=opts.check_with_session,
        timeout=MM_WAIT_TIME,
        retry=3,
    )
    LOGGER.info(
        f"Mapping graph to available resources: nms {nms}, dims {dims}")
    physical_graph = pg_generator.resource_map(
        pgt, dims + nms, num_islands=num_dims, co_host_dim=opts.co_host_dim
    )

    # The output file is named after the part of the log directory's
    # basename that precedes the first underscore
    prefix = os.path.basename(opts.log_dir).split("_")[0]
    out_path = os.path.join(opts.log_dir, f"{prefix}.json")
    with open(out_path, "wt") as pg_file:
        json.dump(physical_graph, pg_file)
    return physical_graph
Example #2
0
def unroll_and_partition_with_params(lg, algo_params_source):
    """Unroll ``lg`` and partition it using the settings in ``algo_params_source``.

    ``algo_params_source`` is queried with ``.get(name, default=..., type=...)``
    for the partitioning settings (``test``, ``algo``, ``num_par``,
    ``num_islands``, ``par_label``) and for every name listed in
    ``ALGO_PARAMS`` that it contains.  The partitioned graph is returned with
    its ``reprodata`` attribute set.
    """
    # 'test' arrives as a string; only "true" (any casing) enables the
    # replacement app
    use_test_app = algo_params_source.get("test", "false").lower() == "true"
    if use_test_app:
        replacement_app = "dlg.apps.simple.SleepApp"
    else:
        replacement_app = None

    # Unroll the LG into a PGT
    unrolled = init_pgt_unroll_repro_data(unroll(lg, app=replacement_app))

    # Generic partitioning parameters
    algo = algo_params_source.get("algo", "none")
    num_partitions = algo_params_source.get("num_par", default=1, type=int)
    num_islands = algo_params_source.get("num_islands", default=0, type=int)
    par_label = algo_params_source.get("par_label", "Partition")

    # Extra parameters that are specific to some partitioning algorithms;
    # only those present in the source are forwarded
    extra_params = {
        name: algo_params_source.get(name, type=typ)
        for name, typ in ALGO_PARAMS
        if name in algo_params_source
    }

    # The last element of the unrolled PGT is set aside before partitioning
    repro = unrolled.pop()
    partitioned = partition(
        unrolled,
        algo=algo,
        num_partitions=num_partitions,
        num_islands=num_islands,
        partition_label=par_label,
        show_gojs=True,
        **extra_params,
    )

    spec = partitioned.to_pg_spec(
        [],
        ret_str=False,
        num_islands=num_islands,
        tpl_nodes_len=num_partitions + num_islands,
    )
    # Re-attach the set-aside element, let the partition step update it,
    # then take it back off the spec and store it on the graph
    spec.append(repro)
    init_pgt_partition_repro_data(spec)
    repro = spec.pop()
    partitioned.reprodata = repro
    logger.info(repro)
    return partitioned
Example #3
0
def get_pg(opts, num_node_managers, num_data_island_managers):
    """Return the graph template described by ``opts``.

    An empty list is returned when neither ``opts.logical_graph`` nor
    ``opts.physical_graph`` is set.  A logical graph is unrolled and
    partitioned with the "metis" algorithm into ``num_node_managers``
    partitions and ``num_data_island_managers`` islands; otherwise the
    physical graph file is loaded as JSON.
    """
    lg_path = opts.logical_graph
    if not (lg_path or opts.physical_graph):
        return []

    if not lg_path:
        # Only a physical graph was given: read it from disk as-is
        with open(opts.physical_graph, "r", encoding="utf-8") as pg_file:
            return json.load(pg_file)

    # The unrolled graph is passed straight to the partitioner so that no
    # extra reference to it is kept around
    return pg_generator.partition(
        pg_generator.unroll(lg_path),
        algo="metis",
        num_partitions=num_node_managers,
        num_islands=num_data_island_managers,
    )
Example #4
0
def _create_pg(logical_graph, processing_block, node_managers,
               data_island_manager, zero_cost_run):
    """Turn a logical graph into a physical graph mapped onto the given managers.

    The logical graph is filled with ``processing_block.parameters``,
    unrolled, partitioned with the 'mysarkar' algorithm into one partition
    per node manager inside a single island, and finally mapped onto
    ``[data_island_manager] + node_managers``.  When ``zero_cost_run`` is
    true, the unroll step is asked for a zero-run graph using
    'dlg.apps.simple.SleepApp'.
    """
    filled_graph = pg_generator.fill(logical_graph,
                                     processing_block.parameters)

    if zero_cost_run:
        extra_unroll_args = {
            'zerorun': True,
            'app': 'dlg.apps.simple.SleepApp',
        }
    else:
        extra_unroll_args = {}

    template = pg_generator.unroll(filled_graph, **extra_unroll_args)
    partitioned = pg_generator.partition(
        template,
        'mysarkar',
        num_partitions=len(node_managers),
        num_islands=1,
    )
    # The island manager goes first in the host list, followed by the nodes
    hosts = [data_island_manager] + node_managers
    return pg_generator.resource_map(partitioned, hosts, num_islands=1)
Example #5
0
def start_helm(physical_graph_template, num_nodes: int, deploy_dir: str):
    """Partition a physical graph template and deploy it through a Helm chart.

    :param physical_graph_template: JSON string holding the graph template.
    :param num_nodes: currently unused; the graph is always partitioned into
        a single partition on a single island.
        NOTE(review): possibly meant to drive ``num_partitions`` -- confirm.
    :param deploy_dir: directory handed to ``HelmClient`` for the deployment.

    Any exception raised while launching or submitting (e.g.
    ``dlg.restutils.RestClientException`` or
    ``dlg.exceptions.InvalidGraphException``) propagates to the caller after
    the Helm deployment has been torn down.
    """
    # TODO: Dynamic helm chart logging dir
    pgt = json.loads(physical_graph_template)
    # The keyword used to be misspelled ("num_partitons"), so the value was
    # never bound to the partitioner's ``num_partitions`` argument.
    pgt = pg_generator.partition(pgt,
                                 algo="metis",
                                 num_partitions=1,
                                 num_islands=1)
    helm_client = HelmClient(deploy_name="daliuge-daemon",
                             chart_name="daliuge-daemon",
                             deploy_dir=deploy_dir)
    helm_client.create_helm_chart(json.dumps(pgt), co_host=True)
    try:
        helm_client.launch_helm()
        helm_client.submit_pgt()
    finally:
        # Always tear the deployment down, whether or not submission worked
        helm_client.teardown()
        logging.info("Finished deployment")
Example #6
0
    def test_cwl_translate(self):
        """Translate the ``.graph`` files of EAGLE_test_repo to CWL and validate them.

        The repository is cloned into a temporary directory; every file
        ending in ``.graph`` under the directory named by the ``CWL_GRAPHS``
        environment variable (default ``SP-602``) is unrolled, partitioned
        and written out as a zipped CWL workflow.  Each workflow is then
        extracted and checked with ``cwltool --validate``.

        Both temporary directories are removed even when the clone, the
        translation or a validation fails.
        """
        import git
        import os
        import shutil
        import uuid
        import zipfile
        import subprocess
        import tempfile

        output_list = []

        # create a temporary directory to contain files created during test
        cwl_output = tempfile.mkdtemp()

        # create a temporary directory to contain a clone of EAGLE_test_repo
        direct = tempfile.mkdtemp()

        try:
            REPO = "https://github.com/ICRAR/EAGLE_test_repo"
            git.Git(direct).clone(REPO)

            cwl_dir = os.getenv("CWL_GRAPHS", "SP-602")

            graph_dir = direct + "/EAGLE_test_repo/" + cwl_dir + "/"
            for subdir, _dirs, files in os.walk(graph_dir):
                for fname in files:
                    f = os.path.join(subdir, fname)
                    if not f.endswith(".graph"):
                        continue
                    pgt = unroll(f, 1)
                    # The value returned by partition() is not used here
                    pg_generator.partition(pgt,
                                           algo='metis',
                                           num_partitions=1,
                                           num_islands=1,
                                           partition_label='partition')

                    # Each graph gets its own uniquely named output directory
                    uid = str(uuid.uuid4())
                    cwl_output_dir = cwl_output + '/' + uid
                    os.mkdir(cwl_output_dir)
                    cwl_out = cwl_output_dir + '/workflow.cwl'
                    cwl_out_zip = cwl_output_dir + '/workflow.zip'
                    output_list.append((cwl_out, cwl_out_zip))

                    create_workflow(pgt, 'workflow.cwl', cwl_out_zip)

            for out, zip_path in output_list:
                # Extract next to the archive so that `out` exists on disk
                with zipfile.ZipFile(zip_path) as zip_ref:
                    zip_ref.extractall(os.path.dirname(zip_path))

                cmd = ['cwltool', '--validate', out]
                p = subprocess.Popen(cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                stdout, stderr = p.communicate()
                self.assertEqual(p.returncode, 0,
                                 b'stdout:\n' + stdout + b'\nstderr:\n' + stderr)
        finally:
            # delete the clone of EAGLE_test_repo
            shutil.rmtree(direct, ignore_errors=True)

            # delete the temporary output directory
            shutil.rmtree(cwl_output, ignore_errors=True)
Example #7
0
def main():
    """Start a DALiuGE cluster across the ranks of an MPI job.

    Parses the command line and then, depending on this process' MPI rank
    and the requested number of islands, starts a proxy, a node manager, a
    data island manager or the master manager.  Rank 0 additionally obtains
    the graph template (by unrolling/partitioning a logical graph or by
    loading a physical graph file), maps it onto the node managers that
    responded and submits it from a background thread.

    Command-line errors are reported through ``parser.error``; running with
    ``--check-interfaces`` prints the detected addresses and exits.
    """

    parser = optparse.OptionParser()
    parser.add_option("-l", "--log_dir", action="store", type="string",
                    dest="log_dir", help="Log directory (required)")
    # if this parameter is present, it means we want to get monitored
    parser.add_option("-m", "--monitor_host", action="store", type="string",
                    dest="monitor_host", help="Monitor host IP (optional)")
    parser.add_option("-o", "--monitor_port", action="store", type="int",
                    dest="monitor_port", help="Monitor port",
                    default=dfms_proxy.default_dlg_monitor_port)
    parser.add_option("-v", "--verbose-level", action="store", type="int",
                    dest="verbose_level", help="Verbosity level (1-3) of the DIM/NM logging",
                    default=1)
    parser.add_option("-z", "--zerorun", action="store_true",
                      dest="zerorun", help="Generate a physical graph that takes no time to run", default=False)
    parser.add_option("--app", action="store", type="int",
                      dest="app", help="The app to use in the PG. 1=SleepApp (default), 2=SleepAndCopy", default=0)

    parser.add_option("-t", "--max-threads", action="store", type="int",
                      dest="max_threads", help="Max thread pool size used for executing drops. 0 (default) means no pool.", default=0)

    parser.add_option("-L", "--logical-graph", action="store", type="string",
                      dest="logical_graph", help="The filename of the logical graph to deploy", default=None)
    parser.add_option("-P", "--physical-graph", action="store", type="string",
                      dest="physical_graph", help="The filename of the physical graph (template) to deploy", default=None)

    parser.add_option('-s', '--num_islands', action='store', type='int',
                    dest='num_islands', default=1, help='The number of Data Islands')

    parser.add_option('-d', '--dump', action='store_true',
                    dest='dump', help = 'dump file base name?', default=False)

    parser.add_option("-c", "--loc", action="store", type="string",
                    dest="loc", help="deployment location (e.g. 'Pawsey' or 'Tianhe2')",
                    default="Pawsey")

    parser.add_option('--part-algo', type="string", dest='part_algo', help='Partition algorithms',
                      default='metis')

    parser.add_option("-u", "--all_nics", action="store_true",
                      dest="all_nics", help="Listen on all NICs for a node manager", default=False)

    parser.add_option('--check-interfaces', action='store_true',
                      dest='check_interfaces', help = 'Run a small network interfaces test and exit', default=False)
    parser.add_option('--use-ifconfig', action='store_true',
                      dest='use_ifconfig', help='Use ifconfig to find a suitable external interface/address for each host', default=False)
    parser.add_option("-S", "--check_with_session", action="store_true",
                      dest="check_with_session", help="Check for node managers' availability by creating/destroy a session", default=False)

    (options, _) = parser.parse_args()

    if options.check_interfaces:
        print("From netifaces: %s" % get_ip_via_netifaces())
        print("From ifconfig: %s" % get_ip_via_ifconfig())
        sys.exit(0)

    # The log directory is documented as required; without it the per-rank
    # directories below would be created under a literal "None" directory
    if not options.log_dir:
        parser.error("A log directory must be specified with -l/--log_dir")

    if options.logical_graph and options.physical_graph:
        parser.error("Only one of a logical graph or a physical graph filename can be specified, not both")
    for p in (options.logical_graph, options.physical_graph):
        if p and not os.path.exists(p):
            parser.error("Cannot locate graph file at '{0}'".format(p))

    if (options.monitor_host is not None and options.num_islands > 1):
        parser.error("We do not support proxy monitor multiple islands yet")

    # Clamp the verbosity level into the 1..3 range
    logv = max(min(3, options.verbose_level), 1)

    from mpi4py import MPI  # @UnresolvedImport
    comm = MPI.COMM_WORLD  # @UndefinedVariable
    num_procs = comm.Get_size()
    rank = comm.Get_rank()

    # Every rank logs into its own sub-directory of the log directory
    log_dir = "{0}/{1}".format(options.log_dir, rank)
    os.makedirs(log_dir)
    logfile = log_dir + "/start_dlg_cluster.log"
    FORMAT = "%(asctime)-15s [%(levelname)5.5s] [%(threadName)15.15s] %(name)s#%(funcName)s:%(lineno)s %(message)s"
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format=FORMAT)

    if (num_procs > 1 and options.monitor_host is not None):
        logger.info("Trying to start DALiuGE cluster with proxy")
        run_proxy = True
        threshold = 2
    else:
        logger.info("Trying to start DALiuGE cluster without proxy")
        run_proxy = False
        threshold = 1

    if (num_procs == threshold):
        logger.warning("No MPI processes left for running Drop Managers")
        run_node_mgr = False
    else:
        run_node_mgr = True

    # attach rank information at the end of IP address for multi-islands
    rank_str = '' if options.num_islands == 1 else ',%s' % rank
    find_ip = get_ip_via_ifconfig if options.use_ifconfig else get_ip_via_netifaces
    public_ip = find_ip(options.loc)
    ip_adds = '{0}{1}'.format(public_ip, rank_str)
    origin_ip = ip_adds.split(',')[0]
    ip_adds = comm.gather(ip_adds, root=0)

    proxy_ip = None
    if run_proxy:
        # send island/master manager's IP address to the DALiuGE proxy
        # also let island manager know the DALiuGE proxy's IP
        if rank == 0:
            mgr_ip = origin_ip
            comm.send(mgr_ip, dest=1)
            proxy_ip = comm.recv(source=1)
        elif rank == 1:
            mgr_ip = comm.recv(source=0)
            proxy_ip = origin_ip
            comm.send(proxy_ip, dest=0)

    set_env(rank)
    if (options.num_islands == 1):
        if (rank != 0):
            if (run_proxy and rank == 1):
                # Wait until the Island Manager is open
                if utils.portIsOpen(mgr_ip, ISLAND_DEFAULT_REST_PORT, 100):
                    start_proxy(options.loc, mgr_ip, ISLAND_DEFAULT_REST_PORT, options.monitor_host, options.monitor_port)
                else:
                    logger.warning("Couldn't connect to the main drop manager, proxy not started")
            elif (run_node_mgr):
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv,
                               max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)
        else:

            # 'no_nms' are known not to be NMs
            no_nms = [origin_ip, 'None']
            if proxy_ip:
                no_nms += [proxy_ip]
            node_mgrs = [ip for ip in ip_adds if ip not in no_nms]

            # unroll the graph first (if any) while starting node managers on other nodes
            pgt = None
            if options.logical_graph or options.physical_graph:
                pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
                if options.logical_graph:
                    unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                    pgt = pg_generator.partition(unrolled, options.part_algo, num_partitions=len(node_mgrs))
                    pgt = pgt.to_pg_spec([], ret_str=False, num_islands=1, tpl_nodes_len=len(node_mgrs) + 1)
                    del unrolled
                else:
                    # options.physical_graph is a filename, so the file
                    # contents (not the name) must be parsed as JSON
                    with open(options.physical_graph, 'rb') as pg_file:
                        pgt = json.load(pg_file)

            # Check which NMs are up and use only those from now on
            node_mgrs = check_hosts(node_mgrs, NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # We have a PGT, let's map it and submit it
            if pgt:
                pg = tool.resource_map(pgt, [origin_ip] + node_mgrs, pip_name, options.num_islands)
                del pgt

                def submit_and_monitor():
                    host, port = 'localhost', ISLAND_DEFAULT_REST_PORT
                    tool.submit(host, port, pg)
                    if options.dump:
                        dump_path = '{0}/monitor'.format(log_dir)
                        monitor_graph(host, port, dump_path)

                threading.Thread(target=submit_and_monitor).start()

            # Start the DIM
            logger.info("Starting island manager on host %s", origin_ip)
            start_dim(node_mgrs, log_dir, logv=logv)

    elif (options.num_islands > 1):
        if (rank == 0):
            # master manager
            # 1. use ip_adds to produce the physical graph
            ip_list = []
            ip_rank_dict = dict() # k - ip, v - MPI rank
            for ipr in ip_adds:
                iprs = ipr.split(',')
                ip = iprs[0]
                r = iprs[1]
                if (ip == origin_ip or 'None' == ip):
                    continue
                ip_list.append(ip)
                ip_rank_dict[ip] = int(r)

            if (len(ip_list) <= options.num_islands):
                raise Exception("Insufficient nodes available for node managers")

            # 2 broadcast dim ranks to all nodes to let them know who is the DIM
            dim_ranks = []
            dim_ip_list = ip_list[0:options.num_islands]
            logger.info("A list of DIM IPs: {0}".format(dim_ip_list))
            for dim_ip in dim_ip_list:
                dim_ranks.append(ip_rank_dict[dim_ip])
            dim_ranks = comm.bcast(dim_ranks, root=0)

            # 3 unroll the graph while waiting for node managers to start
            pip_name = utils.fname_to_pipname(options.logical_graph or options.physical_graph)
            if options.logical_graph:
                unrolled = tool.unroll(options.logical_graph, '1', options.zerorun, apps[options.app])
                pgt = pg_generator.partition(unrolled, options.part_algo, num_partitions=len(ip_list) - 1, num_islands=options.num_islands)
                pgt = pgt.to_pg_spec([], ret_str=False, num_islands=options.num_islands,
                                     tpl_nodes_len=len(ip_list) - 1 + options.num_islands)
                del unrolled
            else:
                # options.physical_graph is a filename, so the file
                # contents (not the name) must be parsed as JSON
                with open(options.physical_graph, 'rb') as pg_file:
                    pgt = json.load(pg_file)

            #logger.info("Waiting all node managers to start in %f seconds", MM_WAIT_TIME)
            node_mgrs = check_hosts(ip_list[options.num_islands:], NODE_DEFAULT_REST_PORT,
                                    check_with_session=options.check_with_session,
                                    timeout=MM_WAIT_TIME)

            # 4.  produce the physical graph based on the available node managers
            # that have already been running (we have to assume island manager
            # will run smoothly in the future)
            logger.info("Master Manager producing the physical graph")
            pg = tool.resource_map(pgt, dim_ip_list + node_mgrs, pip_name, options.num_islands)

            # 5. parse the pg_spec to get the mapping from islands to node list
            dim_rank_nodes_dict = collections.defaultdict(set)
            for drop in pg:
                dim_ip = drop['island']
                # if (not dim_ip in dim_ip_list):
                #     raise Exception("'{0}' node is not in island list {1}".format(dim_ip, dim_ip_list))
                r = ip_rank_dict[dim_ip]
                n = drop['node']
                dim_rank_nodes_dict[r].add(n)

            # 6 send a node list to each DIM so that it can start
            for dim_ip in dim_ip_list:
                r = ip_rank_dict[dim_ip]
                logger.debug("Sending node list to rank {0}".format(r))
                #TODO this should be in a thread since it is blocking!
                comm.send(list(dim_rank_nodes_dict[r]), dest=r)

            # 7. make sure all DIMs are up running
            dim_ips_up = check_hosts(dim_ip_list, ISLAND_DEFAULT_REST_PORT, timeout=MM_WAIT_TIME, retry=10)
            if len(dim_ips_up) < len(dim_ip_list):
                logger.warning("Not all DIMs were up and running: %d/%d", len(dim_ips_up), len(dim_ip_list))

            # 8. submit the graph in a thread (wait for mm to start)
            def submit():
                if not check_host('localhost', MASTER_DEFAULT_REST_PORT, timeout=GRAPH_SUBMIT_WAIT_TIME):
                    logger.warning("Master Manager didn't come up in %d seconds", GRAPH_SUBMIT_WAIT_TIME)
                tool.submit('localhost', MASTER_DEFAULT_REST_PORT, pg)
            threading.Thread(target=submit).start()

            # 9. start dlgMM using islands IP addresses (this will block)
            start_mm(dim_ip_list, log_dir, logv=logv)

        else:
            dim_ranks = None
            dim_ranks = comm.bcast(dim_ranks, root=0)
            logger.debug("Receiving dim_ranks = {0}, my rank is {1}".format(dim_ranks, rank))
            if (rank in dim_ranks):
                logger.debug("Rank {0} is a DIM preparing for receiving".format(rank))
                # island manager
                # get a list of nodes that are its children from rank 0 (MM)
                nm_list = comm.recv(source=0)
                # no need to wait for node managers since the master manager
                # has already made sure they are up running
                logger.debug("nm_list for DIM {0} is {1}".format(rank, nm_list))
                start_dim(nm_list, log_dir, logv=logv)
            else:
                # node manager
                logger.info("Starting node manager on host {0}".format(origin_ip))
                start_node_mgr(log_dir, logv=logv,
                               max_threads=options.max_threads,
                               host=None if options.all_nics else origin_ip)