Code Example #1
File: multi_launcher.py Project: altynbekm/fireworks
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1):
    """
    Launch the jobs in the job packing mode.
    :param launchpad: (LaunchPad) object
    :param fworker: (FWorker) object
    :param loglvl: (str) level at which to output logs
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param num_jobs: (int) number of sub jobs
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param total_node_list: ([str]) contents of NODEFILE (doesn't affect execution)
    :param ppn: (int) processors per node (doesn't affect execution)
    """
    # parse node file contents
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list)

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
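
A minimal usage sketch for this basic variant. It assumes the function lives in fireworks.features.multi_launcher (as the file path suggests) and that a LaunchPad configuration can be loaded with LaunchPad.auto_load(); the PBS_NODEFILE lookup and the job counts are illustrative values, not part of the example above.

import os

from fireworks import LaunchPad, FWorker
from fireworks.features.multi_launcher import launch_multiprocess  # assumed module path

# Read a scheduler-provided node file if one exists (PBS_NODEFILE is just an example).
nodefile = os.environ.get("PBS_NODEFILE")
node_list = None
if nodefile and os.path.exists(nodefile):
    with open(nodefile) as f:
        node_list = [line.strip() for line in f if line.strip()]

launch_multiprocess(
    launchpad=LaunchPad.auto_load(),  # reads my_launchpad.yaml if present
    fworker=FWorker(),
    loglvl="INFO",
    nlaunches=0,        # 0 = run until completion
    num_jobs=4,         # four parallel sub jobs
    sleep_time=60,      # seconds between rapidfire iterations
    total_node_list=node_list,
    ppn=1,
)
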
Code Example #2
File: multi_launcher.py Project: xhqu1981/fireworks
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node (bool): don't use the script-launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect, firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
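
A hedged call sketch for this extended signature, reusing the assumed import and node_list from the sketch under Code Example #1; the timeout, ppn, and other values are illustrative only.

launch_multiprocess(
    launchpad=LaunchPad.auto_load(),
    fworker=FWorker(),
    loglvl="INFO",
    nlaunches=0,
    num_jobs=8,
    sleep_time=60,
    total_node_list=node_list,
    ppn=2,
    timeout=3600,               # stop the rapidfire loops after roughly one hour
    exclude_current_node=True,  # drop the node running this script from the node list
    local_redirect=True,        # redirect sub job output locally, per the docstring above
)
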
Code Example #3
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node (bool): don't use the script-launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
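
This variant, like Code Example #2, shares a Manager-backed dict so the parent process can see which firework ID each sub job is currently running (FWData().Running_IDs). A minimal standalone sketch of that sharing pattern, using only the standard library and hypothetical values:

from multiprocessing import Manager, Process

def worker(running_ids, key):
    # Each sub process records the ID of the job it is currently running.
    running_ids[key] = 1234

if __name__ == "__main__":
    manager = Manager()
    running_ids = manager.dict()   # proxy dict shared across processes
    procs = [Process(target=worker, args=(running_ids, i)) for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(dict(running_ids))       # e.g. {0: 1234, 1: 1234, 2: 1234, 3: 1234}
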