Example 1
def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    Args:
        port (int): Listening port number of the DataServer
        stop_event (threading.Event): stop event
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    fd = FWData()

    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in fd.Running_IDs.items():
            if lid:
                try:
                    os.kill(pid, 0)  # raises OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:
                    # the process is dead: clear its running launch id and firing state
                    fd.Running_IDs[pid] = None
                    fd.FiringState[pid] = False

        stop_event.wait(PING_TIME_SECS)
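
The pinger above is meant to run beside the worker processes rather than on its own. A minimal wiring sketch, mirroring the launch_multiprocess examples later in this section (port is assumed to come from an already running DataServer, i.e. ds.address[1]):

import threading

# Hedged wiring sketch: run the pinger in a background thread and stop it with
# an Event once all sub-job processes have finished.
ping_stop = threading.Event()
ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
ping_thread.start()

# ... join the rapidfire sub-job processes here ...

ping_stop.set()      # tell the pinger to leave its loop
ping_thread.join()   # the pending stop_event.wait() returns promptly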
Example 2
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager (DataServer)
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping process id to the id of the launch it is running
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmitting sub job'.format(
                    sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, 'Resubmitting sub job')
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout)
        else:
            break
    log_multi(l_logger, 'Sub job finished')
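
The running_ids_dict argument is a Manager-backed dict created by the parent so the ping thread can see each sub job's current launch id. The library spawns these workers through a start_rockets helper that is not shown in these examples; the sketch below only approximates that wiring (fworker, port, node_lists and sub_nproc_list are assumed to exist from the surrounding launch_multiprocess setup):

from multiprocessing import Manager, Process

# Approximate spawning sketch, not the actual start_rockets implementation:
# one rapidfire_process per sub job, all sharing one Manager-backed dict.
manager = Manager()
running_ids_dict = manager.dict()

processes = [
    Process(target=rapidfire_process,
            args=(fworker, 0, None, 'INFO', port, node_list, sub_nproc,
                  None, running_ids_dict))
    for node_list, sub_nproc in zip(node_lists, sub_nproc_list)
]
for p in processes:
    p.start()
for p in processes:
    p.join()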
Example 3
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager (DataServer)
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout)
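
The DataServer used throughout these examples behaves like a multiprocessing.managers.BaseManager subclass: the parent serves one shared LaunchPad, and each sub job connects with the same authkey and gets a proxy back from ds.LaunchPad(). DataServer's real implementation is not shown here; the following is a self-contained sketch of that standard pattern using a toy shared object:

from multiprocessing.managers import BaseManager


class PingCounter:
    """Toy stand-in for the shared LaunchPad."""
    def __init__(self):
        self.count = 0

    def ping_launch(self, launch_id):
        self.count += 1
        return launch_id, self.count


class DemoServer(BaseManager):
    pass


class DemoClient(BaseManager):
    pass


_shared = PingCounter()

def _get_shared():
    # the server side registers a callable returning one shared instance,
    # so every client proxy operates on the same server-side object
    return _shared

DemoServer.register('PingCounter', callable=_get_shared)
DemoClient.register('PingCounter')   # clients only need the name

if __name__ == '__main__':
    server = DemoServer(address=('127.0.0.1', 0), authkey=b'demo')
    server.start()                   # serve from a background process
    port = server.address[1]

    client = DemoClient(address=('127.0.0.1', port), authkey=b'demo')
    client.connect()
    counter = client.PingCounter()   # returns a proxy object
    print(counter.ping_launch(42))   # the call executes in the server process

    server.shutdown()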
Example 4
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager (DataServer)
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping process id to the id of the launch it is running
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)
    rapidfire(
        launchpad,
        fworker=fworker,
        m_dir=None,
        nlaunches=nlaunches,
        max_loops=-1,
        sleep_time=sleep,
        strm_lvl=loglvl,
        timeout=timeout,
    )
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, "Resubmit sub job".format(sleep_time))
            rapidfire(
                launchpad,
                fworker=fworker,
                m_dir=None,
                nlaunches=nlaunches,
                max_loops=-1,
                sleep_time=sleep,
                strm_lvl=loglvl,
                timeout=timeout,
            )
        else:
            break
    log_multi(l_logger, "Sub job finished")
Example 5
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager (DataServer)
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping process id to the id of the launch it is running
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping process id to whether that process is currently firing
        macro_sleep_time (int): secs to sleep between sub job resubmits
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
              max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5) # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items() if is_firing]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(l_logger, 'Sleeping for {} secs before resubmitting sub job'.format(macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmitting sub job')
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')
Example 6
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1):
    """
    Launch the jobs in the job packing mode.
    :param launchpad: (LaunchPad) object
    :param fworker: (FWorker) object
    :param loglvl: (str) level at which to output logs
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param num_jobs: (int) number of sub jobs
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param total_node_list: ([str]) contents of NODEFILE (doesn't affect execution)
    :param ppn: (int) processors per node (doesn't affect execution)
    """
    # parse node file contents
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list)

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
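
A minimal invocation sketch, assuming a standard FireWorks setup in which LaunchPad.auto_load() finds a configured my_launchpad.yaml, a default FWorker() is acceptable, and total_node_list has already been read from the scheduler's node file:

from fireworks import FWorker, LaunchPad

# Hedged usage sketch of the function above.
launchpad = LaunchPad.auto_load()
fworker = FWorker()

launch_multiprocess(launchpad, fworker,
                    loglvl='INFO',
                    nlaunches=0,          # run until no READY fireworks remain
                    num_jobs=4,           # four packed sub jobs
                    sleep_time=None,
                    total_node_list=total_node_list,
                    ppn=8)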
Example 7
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node (bool): don't use the script-launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect, firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
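
total_node_list is simply the list of host names from the scheduler's node file. A sketch of how it might be built under PBS/Torque (the PBS_NODEFILE variable is an assumption about the batch environment, not something these examples define):

import os

# Hedged helper: read the batch scheduler's node file into a host-name list.
# PBS_NODEFILE is a PBS/Torque convention; adjust for your scheduler.
def read_node_list(nodefile=None):
    nodefile = nodefile or os.environ.get('PBS_NODEFILE')
    if not nodefile:
        return None  # no node file: launch_multiprocess accepts None
    with open(nodefile) as f:
        # NODEFILE typically repeats a host once per core; deduplicate, keep order
        hosts = []
        for line in f:
            host = line.strip()
            if host and host not in hosts:
                hosts.append(host)
    return hosts

total_node_list = read_node_list()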
Example 8
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager (DataServer)
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    rapidfire(launchpad, fworker, None, nlaunches, -1, sleep, loglvl)
Example 9
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node (bool): don't use the script-launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
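
split_node_lists itself is not shown in these examples; conceptually it divides the node-file hosts evenly among the num_jobs sub jobs and derives each sub job's processor count from ppn. A rough stand-in to illustrate the idea, not the library's actual implementation:

# Illustration only -- NOT the FireWorks implementation of split_node_lists.
def split_node_lists_sketch(num_jobs, total_node_list=None, ppn=1):
    if total_node_list is None:
        # no node file: each sub job gets no node list and ppn processors
        return [None] * num_jobs, [ppn] * num_jobs
    nodes = list(dict.fromkeys(total_node_list))  # deduplicate, keep order
    if len(nodes) % num_jobs != 0:
        raise ValueError('number of nodes must be divisible by num_jobs')
    per_job = len(nodes) // num_jobs
    node_lists = [nodes[i * per_job:(i + 1) * per_job] for i in range(num_jobs)]
    sub_nproc_list = [per_job * ppn] * num_jobs
    return node_lists, sub_nproc_list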
Example 10
def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    :param port: (int) Listening port number of the DataServer
    :param stop_event: (threading.Event) stop event
    """

    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()

    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in ds.Running_IDs().items():
            if lid:
                try:
                    os.kill(pid, 0)  # raises OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:
                    pass  # the process is dead; skip pinging its launch

        stop_event.wait(PING_TIME_SECS)
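
The os.kill(pid, 0) call sends no signal at all; it only asks the kernel whether the pid still exists, which is all the pinger needs. The same POSIX idiom, isolated as a small helper for clarity (generic, not part of these examples):

import os

def pid_is_alive(pid):
    """Return True if a process with this pid exists (POSIX signal-0 check)."""
    try:
        os.kill(pid, 0)   # signal 0: nothing is delivered, existence check only
    except ProcessLookupError:
        return False      # no such process
    except PermissionError:
        return True       # process exists but belongs to another user
    return True

print(pid_is_alive(os.getpid()))  # True: the current process certainly exists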
Example 11
def rapidfire_process(fworker,
                      nlaunches,
                      sleep,
                      loglvl,
                      port,
                      node_list,
                      sub_nproc,
                      timeout,
                      running_ids_dict,
                      local_redirect,
                      firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager (DataServer)
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping process id to the id of the launch it is running
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping process id to whether that process is currently firing
        macro_sleep_time (int): secs to sleep between sub job resubmits
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [
            pid for pid, is_firing in fw_data.FiringState.items() if is_firing
        ]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmitting sub job'.format(
                    macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmitting sub job')
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')