Example #1
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When nlaunches is 0, the rapidfire is re-run for as long as the shared
    running-ids mapping still holds at least one non-None launch id.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared mapping stored on FWData as Running_IDs;
            its values are launch ids, or None for entries without a live
            launch (presumably keyed by process id -- confirm against caller)
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()

    # NOTE(review): FWData() is expected to hand back the same shared object on
    # every call (the original wrote and later re-read Running_IDs through
    # separate FWData() calls), so it is bound once here.
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict

    # Only the pause before a resubmit falls back to the default; rapidfire
    # itself still receives the caller's raw `sleep` value.
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)

    def _fire():
        # Single place for the rapidfire invocation used by the first run and
        # by every resubmit.
        rapidfire(launchpad,
                  fworker=fworker,
                  m_dir=None,
                  nlaunches=nlaunches,
                  max_loops=-1,
                  sleep_time=sleep,
                  strm_lvl=loglvl,
                  timeout=timeout)

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        live_ids = set(fw_data.Running_IDs.values()) - {None}
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(
            l_logger,
            'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
Example #2
0
def ping_multilaunch(port, stop_event):
    """
    Ping every running launch from a single manager during multiprocess launches.

    Loops until ``stop_event`` is set. On each pass, every (pid, launch id)
    entry of the shared Running_IDs mapping with a truthy launch id is probed
    and its launch pinged; an OSError clears the entry instead.

    Args:
        port (int): Listening port number of the DataServer
        stop_event (Thread.Event): stop event
    """
    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()
    shared = FWData()

    pad = server.LaunchPad()
    while not stop_event.is_set():
        for proc_id, launch_id in shared.Running_IDs.items():
            if not launch_id:
                continue
            try:
                # Signal 0 only probes the pid; OSError means the process is dead.
                os.kill(proc_id, 0)
                pad.ping_launch(launch_id)
            except OSError:
                # Dead process: drop its launch id and mark it as not firing.
                shared.Running_IDs[proc_id] = None
                shared.FiringState[proc_id] = False

        # Sleeps between rounds, waking early if the stop event is set.
        stop_event.wait(PING_TIME_SECS)
Example #3
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()

    # NOTE(review): FWData() is assumed to return one shared object per
    # process (otherwise these assignments would be discarded), so it is
    # bound once rather than re-invoked for every attribute.
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc

    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout)
Example #4
0
def ping_multilaunch(port, stop_event):
    """
    Ping every running launch from a single manager during multiprocess launches

    Loops until ``stop_event`` is set; on each pass every (pid, launch id)
    entry served by the DataServer with a truthy launch id is probed and pinged.

    :param port: (int) Listening port number of the DataServer
    :param stop_event: (Thread.Event) stop event
    """

    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()

    pad = server.LaunchPad()
    while not stop_event.is_set():
        for proc_id, launch_id in server.Running_IDs().items():
            if not launch_id:
                continue
            try:
                # Signal 0 only probes the pid; OSError means the process is dead.
                os.kill(proc_id, 0)
                pad.ping_launch(launch_id)
            except OSError:
                # Dead process: nothing to ping, move on to the next entry.
                continue

        # Sleeps between rounds, waking early if the stop event is set.
        stop_event.wait(PING_TIME_SECS)
Example #5
0
def rapidfire_process(fworker,
                      nlaunches,
                      sleep,
                      loglvl,
                      port,
                      node_list,
                      sub_nproc,
                      timeout,
                      running_ids_dict,
                      local_redirect,
                      firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When nlaunches is 0, the rapidfire is re-run for as long as any entry of
    the shared firing-state mapping is still truthy. This process's own entry
    is set to True while its rapidfire runs and reset to False afterwards,
    including when rapidfire raises.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared mapping stored on FWData as Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared mapping of process id to a flag telling
            whether that process is currently inside rapidfire; stored on
            FWData as FiringState
        macro_sleep_time (int): secs to sleep between sub job resubmit; when
            falsy it is derived once as the rapidfire sleep time multiplied by
            the number of entries in the firing-state mapping
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad

    # Only the resubmit pause falls back to the default; rapidfire itself
    # still receives the caller's raw `sleep` value.
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    pid = os.getpid()

    def _fire():
        # Advertise this process as firing for the duration of the rapidfire.
        # The finally clause keeps a failed rapidfire from leaving the flag
        # stuck at True, which would make the other sub jobs keep waiting.
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            # Derived on first use only; later iterations reuse this value.
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(
            l_logger,
            'Sleeping for {} secs before resubmit sub job'.format(
                macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')