def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Repeatedly launch Rockets from m_dir, creating a datestamped 'launcher_*' sub-directory for
    each launch, until the launch quota, loop limit or timeout is reached, or (for
    nlaunches == 0) until no FireWork is ready to run.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running (defaults to the current
            working directory)
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until
            max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations (a falsy value
            selects RAPIDFIRE_SLEEP_SECS)
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process (a falsy value
            disables the time limit)
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # True while the optional time budget has not been used up.
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation

        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)

            # Only a positive quota can be "reached"; with nlaunches == 0 a rocket that failed
            # to run (num_launched still 0) must not end the run prematurely.
            if nlaunches > 0 and num_launched == nlaunches:
                break

            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we
                # have a dynamic WF
                time.sleep(0.15)
                skip_check = False

        if num_launched == nlaunches or nlaunches == 0:
            break

        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')

    # leave the process in the base directory rather than the last launcher directory
    os.chdir(curdir)
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO', pdb_on_exception=False):
    """
    Build a Rocket for the given LaunchPad/FWorker and run it in the current directory,
    logging before and after the run.

    Args:
        launchpad (LaunchPad): may be falsy, in which case no log directory is used
        fworker (FWorker): resolved through get_fworker()
        fw_id (int): if set, a particular Firework to run
        strm_lvl (str): level at which to output logs to stdout
        pdb_on_exception (bool): forwarded to Rocket.run(); if set to True, python will start
            the debugger on a firework exception

    Returns:
        bool: the value returned by Rocket.run()
    """
    worker = get_fworker(fworker)

    if launchpad:
        log_dir = launchpad.get_logdir()
    else:
        log_dir = None
    logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    did_run = Rocket(launchpad, worker, fw_id).run(pdb_on_exception=pdb_on_exception)
    log_multi(logger, 'Rocket finished')

    return did_run
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution); may be None
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; guard the membership test so that combination
        # is reported instead of raising TypeError.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger,
                      "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect,
                              firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    try:
        # wait for completion
        for p in processes:
            p.join()
    finally:
        # Always stop the ping thread and the data server, even if waiting was interrupted;
        # otherwise the ping thread is never told to stop.
        ping_stop.set()
        ping_thread.join()
        ds.shutdown()
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO'):
    """
    Repeatedly launch Rockets from m_dir, creating a datestamped 'launcher_*' sub-directory for
    each launch, until the launch quota or loop limit is reached, or (for nlaunches == 0) until
    no FireWork is ready to run. The working directory is set back to the base directory on
    normal return.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value selects a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running (defaults to the current
        working directory)
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations (a falsy value
        selects RAPIDFIRE_SLEEP_SECS)
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    while num_loops != max_loops:
        while launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)

            # Only a positive quota can be "reached"; with nlaunches == 0 a rocket that failed
            # to run (num_launched still 0) must not end the run prematurely.
            if nlaunches > 0 and num_launched == nlaunches:
                break

            # add a small amount of buffer breathing time for DB to refresh, etc.
            time.sleep(0.15)

        if num_launched == nlaunches or nlaunches == 0:
            break

        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')

    # leave the process in the base directory rather than the last launcher directory
    os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Repeatedly launch Rockets from m_dir, creating a datestamped 'launcher_*' sub-directory for
    each launch, until the launch quota, loop limit or timeout is reached, or (for
    nlaunches == 0) until no FireWork is ready to run.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value selects a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running (defaults to the current
        working directory)
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until
        max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations (a falsy value
        selects RAPIDFIRE_SLEEP_SECS)
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process (a falsy value
        disables the time limit)
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # True while the optional time budget has not been used up.
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation

        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)

            # Only a positive quota can be "reached"; with nlaunches == 0 a rocket that failed
            # to run (num_launched still 0) must not end the run prematurely.
            if nlaunches > 0 and num_launched == nlaunches:
                break

            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we
                # have a dynamic WF
                time.sleep(0.15)
                skip_check = False

        if num_launched == nlaunches or nlaunches == 0:
            break

        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')

    # leave the process in the base directory rather than the last launcher directory
    os.chdir(curdir)
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution); may be None
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; guard the membership test so that combination
        # is reported instead of raising TypeError.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger,
                      "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    try:
        # wait for completion
        for p in processes:
            p.join()
    finally:
        # Always stop the ping thread and the data server, even if waiting was interrupted;
        # otherwise the ping thread is never told to stop.
        ping_stop.set()
        ping_thread.join()
        ds.shutdown()
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO'):
    """
    Repeatedly launch Rockets from m_dir, creating a datestamped 'launcher_*' sub-directory for
    each launch, until the launch quota or loop limit is reached, or (for nlaunches == 0) until
    no FireWork is ready to run.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value selects a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running (defaults to the current
        working directory)
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations (a falsy value
        selects RAPIDFIRE_SLEEP_SECS)
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    while num_loops != max_loops:
        skip_check = False  # this is used to speed operation

        while skip_check or launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)

            # Only a positive quota can be "reached"; with nlaunches == 0 a rocket that failed
            # to run (num_launched still 0) must not end the run prematurely.
            if nlaunches > 0 and num_launched == nlaunches:
                break

            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we
                # have a dynamic WF
                time.sleep(0.15)
                skip_check = False

        if num_launched == nlaunches or nlaunches == 0:
            break

        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')

    # leave the process in the base directory rather than the last launcher directory
    os.chdir(curdir)
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Build a Rocket for the given LaunchPad/FWorker and run it in the current directory,
    logging before and after the run. Returns whatever Rocket.run() returns.

    :param launchpad: (LaunchPad) may be falsy, in which case no log directory is used
    :param fworker: (FWorker) a falsy value selects a default FWorker()
    :param fw_id: (int) if set, a particular Firework to run
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    worker = fworker or FWorker()

    if launchpad:
        log_dir = launchpad.get_logdir()
    else:
        log_dir = None
    logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    did_run = Rocket(launchpad, worker, fw_id).run()
    log_multi(logger, 'Rocket finished')

    return did_run
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Connects to the shared DataServer on 127.0.0.1:port (authenticated with DS_PASSWORD),
    runs rapidfire once and, when nlaunches == 0, keeps re-running it for as long as any
    entry of the shared Running_IDs dict is not None.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict stored as FWData().Running_IDs
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire pass with the parameters of this sub job.
        rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
                  sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire_process(
    fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict
):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Connects to the shared DataServer on 127.0.0.1:port (authenticated with DS_PASSWORD),
    runs rapidfire once and, when nlaunches == 0, keeps re-running it for as long as any
    entry of the shared Running_IDs dict is not None.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict stored as FWData().Running_IDs
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire pass with the parameters of this sub job.
        rapidfire(
            launchpad,
            fworker=fworker,
            m_dir=None,
            nlaunches=nlaunches,
            max_loops=-1,
            sleep_time=sleep,
            strm_lvl=loglvl,
            timeout=timeout,
        )

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, "Resubmit sub job")
        _fire()
    log_multi(l_logger, "Sub job finished")
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Connects to the shared DataServer on 127.0.0.1:port (authenticated with DS_PASSWORD),
    runs rapidfire once and, when nlaunches == 0, keeps re-running it for as long as any
    process is flagged as firing in the shared FiringState dict.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict stored as FWData().Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping a process id to a bool that is True
            while that process is inside rapidfire
        macro_sleep_time (int): secs to sleep between sub job resubmit; a falsy value selects
            the rapidfire sleep time multiplied by the number of FiringState entries
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire pass, with this process flagged as firing for its duration. The flag
        # is cleared even if rapidfire raises, so that sibling sub jobs do not keep waiting
        # on a process that is no longer firing.
        pid = os.getpid()
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items() if is_firing]
        if not firing_pids:
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False, pdb_on_exception=False):
    """
    Repeatedly launch Rockets from m_dir, creating a datestamped 'launcher_*' sub-directory for
    each launch, until the launch quota, loop limit or timeout is reached, or (for
    nlaunches == 0) until the LaunchPad reports no future run for this worker.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running (defaults to the current
            working directory)
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until
            max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations (a falsy value
            selects RAPIDFIRE_SLEEP_SECS)
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process (None disables
            the time limit)
        local_redirect (bool): redirect standard input and output to local file
        pdb_on_exception (bool): forwarded unchanged to launch_rocket()
    """
    if not sleep_time:
        sleep_time = RAPIDFIRE_SLEEP_SECS
    base_dir = m_dir or os.getcwd()
    logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                           stream_level=strm_lvl)
    if nlaunches == 'infinite':
        nlaunches = -1
    else:
        nlaunches = int(nlaunches)
    fworker = get_fworker(fworker)

    launched = 0
    began_at = datetime.now()
    loops_done = 0

    def within_time_limit():
        # has the rapidfire run timed out?
        if timeout is None:
            return True
        elapsed = (datetime.now() - began_at).total_seconds()
        return elapsed < timeout

    def fire_one():
        # Launch a single rocket, optionally inside the local redirection context.
        if local_redirect:
            with redirect_local():
                return launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                     pdb_on_exception=pdb_on_exception)
        return launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                             pdb_on_exception=pdb_on_exception)

    while loops_done != max_loops and within_time_limit():
        more_ready = False  # lets the inner loop skip the run_exists() query

        while (more_ready or launchpad.run_exists(fworker)) and within_time_limit():
            os.chdir(base_dir)
            launch_dir = create_datestamp_dir(base_dir, logger, prefix='launcher_')
            os.chdir(launch_dir)

            if fire_one():
                launched += 1
            elif not os.listdir(launch_dir):
                # nothing ran and nothing was written: drop the empty directory
                os.chdir(base_dir)
                os.rmdir(launch_dir)

            if nlaunches > 0 and launched == nlaunches:
                break

            more_ready = bool(launchpad.run_exists(fworker))
            if not more_ready:
                # short pause so the DB can refresh in case we have a dynamic WF
                time.sleep(0.15)

        if nlaunches != 0:
            finished = launched == nlaunches
        else:
            finished = not launchpad.future_run_exists(fworker)
        if finished:
            break

        log_multi(logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        loops_done += 1
        log_multi(logger, 'Checking for FWs to run...')

    os.chdir(base_dir)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Connects to the shared DataServer on 127.0.0.1:port (authenticated with DS_PASSWORD),
    runs rapidfire once and, when nlaunches == 0, keeps re-running it for as long as any
    process is flagged as firing in the shared FiringState dict.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict stored as FWData().Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping a process id to a bool that is True
            while that process is inside rapidfire
        macro_sleep_time (int): secs to sleep between sub job resubmit; a falsy value selects
            the rapidfire sleep time multiplied by the number of FiringState entries
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire pass, with this process flagged as firing for its duration. The flag
        # is cleared even if rapidfire raises, so that sibling sub jobs do not keep waiting
        # on a process that is no longer firing.
        pid = os.getpid()
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [
            pid for pid, is_firing in fw_data.FiringState.items() if is_firing
        ]
        if not firing_pids:
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(
            l_logger,
            'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')