def ping_multilaunch(port, stop_event):
    """
    Keep every running launch alive by pinging it from one place while a
    multiprocess launch is in progress.

    Loops until ``stop_event`` is set, waiting up to PING_TIME_SECS on the
    event between rounds.

    Args:
        port (int): Listening port number of the DataServer
        stop_event (Thread.Event): stop event
    """
    data_server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    data_server.connect()
    shared = FWData()
    launchpad = data_server.LaunchPad()

    while not stop_event.is_set():
        for pid, launch_id in shared.Running_IDs.items():
            if not launch_id:
                # nothing is running under this pid
                continue
            try:
                os.kill(pid, 0)  # throws OSError if the process is dead
                launchpad.ping_launch(launch_id)
            except OSError:
                # means this process is dead: clear its launch id and its
                # firing flag in the shared state
                shared.Running_IDs[pid] = None
                shared.FiringState[pid] = False
        stop_event.wait(PING_TIME_SECS)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is run again for as long as the
    shared running-ids mapping still holds a value other than None.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData as Running_IDs; any
            value other than None is treated as a sub job that is still running
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    # only the pause before a resubmit uses the fallback; rapidfire itself
    # still receives the caller's raw ``sleep`` value
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    # the initial run and every resubmit use exactly the same arguments
    rapidfire_kwargs = dict(fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
                            sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)

    rapidfire(launchpad, **rapidfire_kwargs)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        live_ids = set(FWData().Running_IDs.values()) - {None}
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        rapidfire(launchpad, **rapidfire_kwargs)
    log_multi(l_logger, 'Sub job finished')
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout):
    """
    Connects to the DataServer on 127.0.0.1, records the multiprocessing
    settings on FWData and then runs a rapidfire against the shared LaunchPad

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    data_server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    data_server.connect()
    shared_launchpad = data_server.LaunchPad()

    # publish the multiprocessing settings on FWData, one attribute at a time
    shared_settings = (
        ('DATASERVER', data_server),
        ('MULTIPROCESSING', True),
        ('NODE_LIST', node_list),
        ('SUB_NPROCS', sub_nproc),
    )
    for attr_name, attr_value in shared_settings:
        setattr(FWData(), attr_name, attr_value)

    rapidfire_options = {
        'fworker': fworker,
        'm_dir': None,
        'nlaunches': nlaunches,
        'max_loops': -1,
        'sleep_time': sleep,
        'strm_lvl': loglvl,
        'timeout': timeout,
    }
    rapidfire(shared_launchpad, **rapidfire_options)
def rapidfire_process(
    fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict
):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is run again for as long as the
    shared running-ids mapping still holds a value other than None.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData as Running_IDs; any
            value other than None is treated as a sub job that is still running
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    # only the pause before a resubmit uses the fallback; rapidfire itself
    # still receives the caller's raw ``sleep`` value
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)

    # the initial run and every resubmit use exactly the same arguments
    rapidfire_kwargs = dict(
        fworker=fworker,
        m_dir=None,
        nlaunches=nlaunches,
        max_loops=-1,
        sleep_time=sleep,
        strm_lvl=loglvl,
        timeout=timeout,
    )

    rapidfire(launchpad, **rapidfire_kwargs)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        live_ids = set(FWData().Running_IDs.values()) - {None}
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, "Resubmit sub job")
        rapidfire(launchpad, **rapidfire_kwargs)
    log_multi(l_logger, "Sub job finished")
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is run again for as long as any entry
    of the shared firing-state mapping is still flagged as firing.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData as Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): mapping stored on FWData as FiringState; this
            process records under its own pid whether it is inside a rapidfire
        macro_sleep_time (int): secs to sleep between sub job resubmit; when not
            given it becomes the sleep time multiplied by the number of entries
            in the firing-state mapping
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    # only the pause before a resubmit uses the fallback; rapidfire itself
    # still receives the caller's raw ``sleep`` value
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # Run one rapidfire pass with this process flagged as firing. The flag
        # is cleared in ``finally`` so an exception raised by rapidfire cannot
        # leave it stuck at True; the resubmit loop below keeps going for as
        # long as any flag in the shared mapping is true.
        fw_data.FiringState[os.getpid()] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[os.getpid()] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc):
    """
    Connects to the DataServer on 127.0.0.1, records the multiprocessing
    settings on FWData and then runs a rapidfire against the shared LaunchPad

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    """
    data_server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    data_server.connect()
    shared_launchpad = data_server.LaunchPad()

    # publish the multiprocessing settings on FWData, one attribute at a time
    shared_settings = (
        ('DATASERVER', data_server),
        ('MULTIPROCESSING', True),
        ('NODE_LIST', node_list),
        ('SUB_NPROCS', sub_nproc),
    )
    for attr_name, attr_value in shared_settings:
        setattr(FWData(), attr_name, attr_value)

    # rapidfire is called positionally here, so the argument order matters
    positional_args = (shared_launchpad, fworker, None, nlaunches, -1, sleep, loglvl)
    rapidfire(*positional_args)
def ping_multilaunch(port, stop_event):
    """
    Pings every running launch from a single place while a multiprocess
    launch is in progress, until the stop event is set

    :param port: (int) Listening port number of the DataServer
    :param stop_event: (Thread.Event) stop event
    """
    data_server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    data_server.connect()
    launchpad = data_server.LaunchPad()

    while not stop_event.is_set():
        for pid, launch_id in data_server.Running_IDs().items():
            if not launch_id:
                # nothing is running under this pid
                continue
            try:
                os.kill(pid, 0)  # throws OSError if the process is dead
                launchpad.ping_launch(launch_id)
            except OSError:
                # means this process is dead, so there is nothing to ping
                continue
        stop_event.wait(PING_TIME_SECS)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is run again for as long as any entry
    of the shared firing-state mapping is still flagged as firing.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData as Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): mapping stored on FWData as FiringState; this
            process records under its own pid whether it is inside a rapidfire
        macro_sleep_time (int): secs to sleep between sub job resubmit; when not
            given it becomes the sleep time multiplied by the number of entries
            in the firing-state mapping
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    # only the pause before a resubmit uses the fallback; rapidfire itself
    # still receives the caller's raw ``sleep`` value
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # Run one rapidfire pass with this process flagged as firing. The flag
        # is cleared in ``finally`` so an exception raised by rapidfire cannot
        # leave it stuck at True; the resubmit loop below keeps going for as
        # long as any flag in the shared mapping is true.
        fw_data.FiringState[os.getpid()] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[os.getpid()] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')