def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the sub job keeps resubmitting itself for as long as
    any process recorded in the shared firing-state mapping is still firing.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared mapping stored on FWData as ``Running_IDs``
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared mapping of process id -> bool telling
            whether that process is currently firing
        macro_sleep_time (int): secs to sleep between sub job resubmit; when falsy
            it is derived once as ``sleep_time * len(firing_state_dict)``
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()

    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad

    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    pid = os.getpid()

    def fire():
        # Flag this process as firing only for the duration of the rapidfire
        # call. The flag is cleared in ``finally`` so that a crash here does not
        # leave the other sub jobs waiting forever on a stale "firing" entry.
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep_time, strm_lvl=loglvl,
                      timeout=timeout, local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            # Nobody is firing any more: nothing can create new work for us.
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the sub job keeps resubmitting itself for as long as
    any process recorded in the shared firing-state mapping is still firing.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared mapping stored on FWData as ``Running_IDs``
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared mapping of process id -> bool telling
            whether that process is currently firing
        macro_sleep_time (int): secs to sleep between sub job resubmit; when falsy
            it is derived once as ``sleep_time * len(firing_state_dict)``
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()

    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad

    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    pid = os.getpid()

    def fire():
        # Flag this process as firing only for the duration of the rapidfire
        # call. The flag is cleared in ``finally`` so that a crash here does not
        # leave the other sub jobs waiting forever on a stale "firing" entry.
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep_time, strm_lvl=loglvl,
                      timeout=timeout, local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            # Nobody is firing any more: nothing can create new work for us.
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # True while no timeout is configured or the timeout has not elapsed yet.
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            else:
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            # Only a positive nlaunches is a launch quota. Without the "> 0"
            # guard, nlaunches == 0 ('until completion') would stop as soon as
            # the very first launch attempt did not run (0 == 0).
            if nlaunches > 0 and num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh
                # in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        fd = FWData()
        # NOTE(review): assumes FWData() hands back the shared state configured
        # by rapidfire_process, where FiringState is a mapping keyed by process
        # id. Update only this process' entry; rebinding the attribute to a bare
        # bool would break every later FiringState[pid] access.
        pid = os.getpid()
        if fd.MULTIPROCESSING:
            # sleeping time is not firing
            fd.FiringState[pid] = False
        time.sleep(sleep_time)
        if fd.MULTIPROCESSING:
            fd.FiringState[pid] = True
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # True while no timeout is configured or the timeout has not elapsed yet.
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            else:
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            # Only a positive nlaunches is a launch quota. Without the "> 0"
            # guard, nlaunches == 0 ('until completion') would stop as soon as
            # the very first launch attempt did not run (0 == 0).
            if nlaunches > 0 and num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh
                # in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        fd = FWData()
        # NOTE(review): assumes FWData() hands back the shared state configured
        # by rapidfire_process, where FiringState is a mapping keyed by process
        # id. Update only this process' entry; rebinding the attribute to a bare
        # bool would break every later FiringState[pid] access.
        pid = os.getpid()
        if fd.MULTIPROCESSING:
            # sleeping time is not firing
            fd.FiringState[pid] = False
        time.sleep(sleep_time)
        if fd.MULTIPROCESSING:
            fd.FiringState[pid] = True
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)