def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)
    num_launched = 0
    start_time = datetime.now()
    num_loops = 0
    while num_loops != max_loops and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        # fixed: message has no placeholder, so the stray .format(sleep_time) call was a no-op
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
def __init__(self, launchpad, opt_label):
    """Bind a logger and a handle to the optimization collection named by opt_label."""
    self.logger = get_fw_logger("rocketsled")
    self.launchpad = launchpad
    self.opt_label = opt_label
    # handle to the named collection on the LaunchPad's database
    self.c = getattr(launchpad.db, opt_label)
    self.config = None
    self.is_configured = False
def __init__(self, host='localhost', port=27017, name='fireworks', username=None, password=None,
             logdir=None, strm_lvl=None):
    """
    Args:
        host (str): MongoDB hostname
        port (int): MongoDB port
        name (str): database name
        username (str): Mongo username, if authentication is needed
        password (str): Mongo password, if authentication is needed
        logdir (str): log directory
        strm_lvl (str): stream log level; defaults to 'INFO'
    """
    self.host = host
    self.port = port
    self.name = name
    self.username = username
    self.password = password

    # logger setup
    self.logdir = logdir
    self.strm_lvl = strm_lvl or 'INFO'
    self.m_logger = get_fw_logger('launchpad', l_dir=self.logdir, stream_level=self.strm_lvl)

    # connect (journaling enabled) and authenticate if credentials were given
    self.connection = MongoClient(host, port, j=True)
    self.database = self.connection[name]
    if username:
        self.database.authenticate(username, password)

    # collection handles
    self.fireworks = self.database.fireworks
    self.launches = self.database.launches
    self.fw_id_assigner = self.database.fw_id_assigner
    self.links = self.database.links
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict of launch ids currently running
    """
    # connect to the shared DataServer that proxies the LaunchPad
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
              sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
            time.sleep(sleep_time)
            # fixed: message has no placeholder, so .format(sleep_time) was a no-op
            log_multi(l_logger, 'Resubmit sub job')
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
                      sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)
        else:
            break
    log_multi(l_logger, 'Sub job finished')
def __init__(self, host='localhost', port=27017, name='fireworks', username=None, password=None,
             logdir=None, strm_lvl=None, user_indices=None, wf_user_indices=None):
    """
    Args:
        host (str): MongoDB hostname
        port (int): MongoDB port
        name (str): database name
        username (str): Mongo username, if authentication is needed
        password (str): Mongo password, if authentication is needed
        logdir (str): log directory
        strm_lvl (str): stream log level; defaults to 'INFO'
        user_indices (list): extra indices, stored as given (empty list if None)
        wf_user_indices (list): extra workflow indices, stored as given (empty list if None)
    """
    self.host = host
    self.port = port
    self.name = name
    self.username = username
    self.password = password

    # logger setup
    self.logdir = logdir
    self.strm_lvl = strm_lvl or 'INFO'
    self.m_logger = get_fw_logger('launchpad', l_dir=self.logdir, stream_level=self.strm_lvl)

    self.user_indices = user_indices or []
    self.wf_user_indices = wf_user_indices or []

    # get connection (journaling enabled)
    self.client = MongoClient(host, port, j=True)
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO', pdb_on_exception=False):
    """
    Run a single rocket in the current directory.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        fw_id (int): if set, a particular Firework to run
        strm_lvl (str): level at which to output logs to stdout
        pdb_on_exception (bool): if set to True, python will start the debugger on a firework exception

    Returns:
        bool
    """
    fworker = get_fworker(fworker)
    # offline mode has no LaunchPad, hence no log directory
    log_dir = launchpad.get_logdir() if launchpad else None
    logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    result = Rocket(launchpad, fworker, fw_id).run(pdb_on_exception=pdb_on_exception)
    log_multi(logger, 'Rocket finished')
    return result
def rlaunch():
    """Command-line entry point: parse args and run either a single Rocket or a rapidfire loop."""
    m_description = 'This program launches one or more Rockets. A Rocket grabs a job from the central database and ' \
                    'runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'
    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot', help='launch a single Rocket')
    rapid_parser = subparsers.add_parser('rapidfire',
                                         help='launch multiple Rockets (loop until all FireWorks complete)')
    single_parser.add_argument('-f', '--fw_id', help='specific fw_id to run', default=None, type=int)
    single_parser.add_argument('--offline', help='run in offline mode (FW.json required)', action='store_true')
    rapid_parser.add_argument('--nlaunches', help='num_launches (int or "infinite"; default 0 is all jobs in DB)',
                              default=0)
    rapid_parser.add_argument('--sleep', help='sleep time between loops (secs)', default=None, type=int)
    parser.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=LAUNCHPAD_LOC)
    parser.add_argument('-w', '--fworker_file', help='path to fworker file', default=FWORKER_LOC)
    parser.add_argument('-c', '--config_dir',
                        help='path to a directory containing the config file (used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)
    parser.add_argument('--loglvl', help='level to print log messages', default='INFO')
    parser.add_argument('-s', '--silencer', help='shortcut to mute log messages', action='store_true')
    args = parser.parse_args()

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C (fixed typo: "exist")

    # fall back to config-dir files when -l / -w were not given
    if not args.launchpad_file and os.path.exists(os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir, 'my_launchpad.yaml')
    if not args.fworker_file and os.path.exists(os.path.join(args.config_dir, 'my_fworker.yaml')):
        args.fworker_file = os.path.join(args.config_dir, 'my_fworker.yaml')

    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    # offline singleshot runs without a LaunchPad
    if args.command == 'singleshot' and args.offline:
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(args.launchpad_file) if args.launchpad_file else LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        # keyword args guard against positional mix-ups with rapidfire's long signature
        rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=args.nlaunches,
                  max_loops=-1, sleep_time=args.sleep, strm_lvl=args.loglvl)
    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict of launch ids currently running
    """
    # connect to the shared DataServer that proxies the LaunchPad
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)
    rapidfire(
        launchpad,
        fworker=fworker,
        m_dir=None,
        nlaunches=nlaunches,
        max_loops=-1,
        sleep_time=sleep,
        strm_lvl=loglvl,
        timeout=timeout,
    )
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
            time.sleep(sleep_time)
            # fixed: message has no placeholder, so .format(sleep_time) was a no-op
            log_multi(l_logger, "Resubmit sub job")
            rapidfire(
                launchpad,
                fworker=fworker,
                m_dir=None,
                nlaunches=nlaunches,
                max_loops=-1,
                sleep_time=sleep,
                strm_lvl=loglvl,
                timeout=timeout,
            )
        else:
            break
    log_multi(l_logger, "Sub job finished")
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # optionally drop the launching host from the compute node list
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")

    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # shared dataserver plus Manager-backed state dicts visible to all sub jobs
    ds = DataServer.setup(launchpad)
    port = ds.address[1]
    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # spawn one rapidfire process per sub job
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect, firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # keep the DataServer pinged while the sub jobs run
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for every sub job, then stop the pinger and the dataserver
    for proc in processes:
        proc.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()
    num_launched = 0
    num_loops = 0
    while num_loops != max_loops:
        while launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            time.sleep(0.15)  # add a small amount of buffer breathing time for DB to refresh, etc.
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        # fixed: message has no placeholder, so the stray .format(sleep_time) call was a no-op
        log_multi(l_logger, 'Checking for FWs to run...')
def __init__(self, host='localhost', port=27017, database='fireworks', username=None, password=None,
             filepad_coll_name="filepad", gridfs_coll_name="filepad_gfs", logdir=None, strm_lvl=None,
             text_mode=False):
    """
    Args:
        host (str): hostname
        port (int): port number
        database (str): database name
        username (str)
        password (str)
        filepad_coll_name (str): filepad collection name
        gridfs_coll_name (str): gridfs collection name
        logdir (str): path to the log directory
        strm_lvl (str): the logger stream level
        text_mode (bool): whether to use text_mode for file read/write (instead of binary).
            Might be useful if working only with text files between Windows and Unix systems
    """
    self.host = host
    self.port = int(port)
    self.database = database
    self.username = username
    self.password = password
    self.gridfs_coll_name = gridfs_coll_name
    self.text_mode = text_mode
    try:
        self.connection = MongoClient(self.host, self.port)
        self.db = self.connection[database]
    except Exception as e:
        # chain the cause so the original connection error is not lost
        raise Exception("connection failed") from e
    try:
        if self.username:
            self.db.authenticate(self.username, self.password)
    except Exception as e:
        raise Exception("authentication failed") from e

    # set collections: filepad and gridfs
    self.filepad = self.db[filepad_coll_name]
    self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

    # logging
    self.logdir = logdir
    self.strm_lvl = strm_lvl if strm_lvl else 'INFO'
    self.logger = get_fw_logger('filepad', l_dir=self.logdir, stream_level=self.strm_lvl)

    # build indexes
    self.build_indexes()
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict of launch ids currently running
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping process id -> firing flag
        macro_sleep_time (int): secs to sleep between sub job resubmit
    """
    # connect to the shared DataServer that proxies the LaunchPad
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
              sleep_time=sleep, strm_lvl=loglvl, timeout=timeout, local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items() if is_firing]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
            time.sleep(macro_sleep_time)
            # fixed: message has no placeholder, so .format(macro_sleep_time) was a no-op
            log_multi(l_logger, 'Resubmit sub job')
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
                      sleep_time=sleep, strm_lvl=loglvl, timeout=timeout, local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()
    num_launched = 0
    start_time = datetime.now()
    num_loops = 0
    while num_loops != max_loops and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        # fixed: message has no placeholder, so the stray .format(sleep_time) call was a no-op
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
def launch_rocket(launchpad, fworker=None, logdir=None, strm_lvl=None, fw_id=None):
    """
    Run a single rocket in the current directory.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object
    """
    if not fworker:
        fworker = FWorker()
    logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)

    logger.info('Launching Rocket')
    Rocket(launchpad, fworker, fw_id).run()
    logger.info('Rocket finished')
def __init__(self, host='localhost', port=27017, database='fireworks', username=None, password=None,
             filepad_coll_name="filepad", gridfs_coll_name="filepad_gfs", logdir=None, strm_lvl=None):
    """
    Args:
        host (str): hostname
        port (int): port number
        database (str): database name
        username (str)
        password (str)
        filepad_coll_name (str): filepad collection name
        gridfs_coll_name (str): gridfs collection name
        logdir (str): path to the log directory
        strm_lvl (str): the logger stream level
    """
    self.host = host
    self.port = int(port)
    self.database = database
    self.username = username
    self.password = password
    self.gridfs_coll_name = gridfs_coll_name
    try:
        self.connection = MongoClient(self.host, self.port)
        self.db = self.connection[database]
    except Exception as e:
        # narrowed from a bare `except:` (which also swallowed SystemExit/KeyboardInterrupt)
        # and chained so the original connection error is not lost
        raise Exception("connection failed") from e
    try:
        if self.username:
            self.db.authenticate(self.username, self.password)
    except Exception as e:
        raise Exception("authentication failed") from e

    # set collections: filepad and gridfs
    self.filepad = self.db[filepad_coll_name]
    self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

    # logging
    self.logdir = logdir
    self.strm_lvl = strm_lvl if strm_lvl else 'INFO'
    self.logger = get_fw_logger('filepad', l_dir=self.logdir, stream_level=self.strm_lvl)

    # build indexes
    self.build_indexes()
def __init__(
    self,
    host="localhost",
    port=27017,
    database="fireworks",
    username=None,
    password=None,
    filepad_coll_name="filepad",
    gridfs_coll_name="filepad_gfs",
    logdir=None,
    strm_lvl=None,
):
    """
    Args:
        host (str): hostname
        port (int): port number
        database (str): database name
        username (str)
        password (str)
        filepad_coll_name (str): filepad collection name
        gridfs_coll_name (str): gridfs collection name
        logdir (str): path to the log directory
        strm_lvl (str): the logger stream level
    """
    self.host = host
    self.port = int(port)
    self.database = database
    self.username = username
    self.password = password
    self.gridfs_coll_name = gridfs_coll_name
    try:
        self.connection = MongoClient(self.host, self.port)
        self.db = self.connection[database]
    except Exception as e:
        # narrowed from a bare `except:` (which also swallowed SystemExit/KeyboardInterrupt)
        # and chained so the original connection error is not lost
        raise Exception("connection failed") from e
    try:
        if self.username:
            self.db.authenticate(self.username, self.password)
    except Exception as e:
        raise Exception("authentication failed") from e

    # set collections: filepad and gridfs
    self.filepad = self.db[filepad_coll_name]
    self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

    # logging
    self.logdir = logdir
    self.strm_lvl = strm_lvl if strm_lvl else "INFO"
    self.logger = get_fw_logger("filepad", l_dir=self.logdir, stream_level=self.strm_lvl)

    # build indexes
    self.build_indexes()
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # optionally drop the launching host from the compute node list
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")

    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # shared dataserver plus a Manager-backed dict of running launch ids
    ds = DataServer.setup(launchpad)
    port = ds.address[1]
    manager = Manager()
    running_ids_dict = manager.dict()

    # spawn one rapidfire process per sub job
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # keep the DataServer pinged while the sub jobs run
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for every sub job, then stop the pinger and the dataserver
    for proc in processes:
        proc.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()
    num_launched = 0
    num_loops = 0
    while num_loops != max_loops:
        skip_check = False  # this is used to speed operation
        while skip_check or launchpad.run_exists(fworker):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        # fixed: message has no placeholder, so the stray .format(sleep_time) call was a no-op
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, logdir=None, strm_lvl=None, nlaunches=0,
              sleep_time=60, max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object
    :param m_dir: the directory in which to loop Rocket running
    :param nlaunches: 0 means 'until completion', -1 means 'infinity'
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()
    # initialize logger
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    # TODO: always chdir() back to curdir when finished...then delete cruft from MongoTests
    num_launched = 0
    num_loops = 0
    while num_loops != max_loops:
        while launchpad.run_exists():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            launch_rocket(launchpad, fworker, logdir, strm_lvl)
            num_launched += 1
            if num_launched == nlaunches:
                break
            time.sleep(0.1)  # add a small amount of buffer breathing time for DB to refresh, etc.
        if num_launched == nlaunches or nlaunches == 0:
            break
        l_logger.info('Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        # fixed: message has no placeholder, so the stray .format(sleep_time) call was a no-op
        l_logger.info('Checking for FWs to run...')
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Run a single rocket in the current directory.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param fw_id: (int) if set, a particular FireWork to run
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    fworker = fworker if fworker else FWorker()
    # offline mode (no LaunchPad) gets no log directory
    if launchpad:
        log_dir = launchpad.get_logdir()
    else:
        log_dir = None
    l_logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(l_logger, 'Launching Rocket')
    rocket_ran = Rocket(launchpad, fworker, fw_id).run()
    log_multi(l_logger, 'Rocket finished')
    return rocket_ran
def submit_to_queue(self, queue_params, script_file):
    """
    for documentation, see parent object
    """
    if not os.path.exists(script_file):
        raise ValueError('Cannot find script file located at: {}'.format(script_file))

    # initialize logger
    slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

    # submit the job
    try:
        cmd = ['sbatch', script_file]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.wait()

        # grab the returncode. SLURM returns 0 if the job was successful
        if p.returncode == 0:
            try:
                # the job id is taken from the 4th whitespace token of the sbatch output
                # (fixed comment: previous note described grabbing "the first part")
                job_id = int(p.stdout.read().split()[3])
                slurm_logger.info('Job submission was successful and job_id is {}'.format(job_id))
                return job_id
            except Exception:  # narrowed from bare except; probably error parsing job code
                log_exception(slurm_logger, 'Could not parse job id following slurm...')
        else:
            # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = ['Error in job submission with SLURM file {f} and cmd {c}'.format(f=script_file, c=cmd)]
            msgs.append('The error response reads: {}'.format(p.stderr.read()))
            log_fancy(slurm_logger, 'error', msgs)
    except Exception:  # narrowed from bare except; random error, e.g. no sbatch on machine!
        log_exception(slurm_logger, 'Running slurm caused an error...')
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Run a single rocket in the current directory.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param fw_id: (int) if set, a particular Firework to run
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    if not fworker:
        fworker = FWorker()
    logger = get_fw_logger('rocket.launcher',
                           l_dir=launchpad.get_logdir() if launchpad else None,
                           stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    ran = Rocket(launchpad, fworker, fw_id).run()
    log_multi(logger, 'Rocket finished')
    return ran
def get_njobs_in_queue(self, rocket_params, username=None):
    """
    for documentation, see parent object

    Counts this user's jobs via ``qstat -a -u <username>``; returns the
    count (int) on success or None if qstat failed.
    """
    # TODO: (low-priority) parse the qstat -x output as an alternate way to get this working
    # tmp_file_name = 'tmp_qstat.xml'
    # cmd = ['qstat', '-x']

    # initialize logger
    pbs_logger = get_fw_logger('rocket.pbs', rocket_params.logging_dir)

    # initialize username
    if username is None:
        username = getpass.getuser()

    # run qstat; stderr must be piped because the error branch below reads
    # p.stderr (previously it was None -> AttributeError on qstat failure)
    cmd = ['qstat', '-a', '-u', username]
    p = subprocess.Popen(cmd, shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    p.wait()

    # parse the result
    if p.returncode == 0:
        # lines should have this form
        # '1339044.sdb username queuename 2012-02-29-16-43 20460 -- -- -- 00:20 C 00:09'
        # count lines that include the username in it
        # TODO: only count running or queued jobs. or rather, *don't* count jobs that are 'C'.
        # decode so the str regex works on Python 3 byte output
        outs = p.stdout.read().decode().splitlines()
        rx = re.compile(username)
        njobs = sum(1 for line in outs if rx.search(line) is not None)
        pbs_logger.info(
            'The number of jobs currently in the queue is: {}'.format(
                njobs))
        return njobs

    # there's a problem talking to qstat server?
    msgs = [
        'Error trying to get the number of jobs in the queue using qstat service'
    ]
    msgs.append('The error response reads: {}'.format(p.stderr.read()))
    log_fancy(pbs_logger, 'error', msgs)
    return None
def get_njobs_in_queue(self, queue_params, username=None):
    """
    for documentation, see parent object

    Counts this user's jobs via ``squeue``; returns the count (int) on
    success or None if squeue failed.
    """
    # initialize logger
    slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

    # initialize username
    if username is None:
        username = getpass.getuser()

    # run squeue; with shell=False each argv element is passed verbatim, so
    # the format string must be its own element *without* embedded quotes
    # (previously '-o "%u"' sent literal quote characters to squeue).
    # stderr must be piped because the error branch below reads p.stderr.
    cmd = ['squeue', '-o', '%u', '-u', username]
    p = subprocess.Popen(cmd, shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    p.wait()

    # parse the result
    if p.returncode == 0:
        # lines should have this form
        # username
        # count lines that include the username in it
        # decode so the str regex works on Python 3 byte output
        outs = p.stdout.read().decode().splitlines()
        rx = re.compile(username)
        njobs = sum(1 for line in outs if rx.search(line) is not None)
        slurm_logger.info(
            'The number of jobs currently in the queue is: {}'.format(
                njobs))
        return njobs

    # there's a problem talking to the squeue server?
    msgs = [
        'Error trying to get the number of jobs in the queue using squeue service'
    ]
    msgs.append('The error response reads: {}'.format(p.stderr.read()))
    log_fancy(slurm_logger, 'error', msgs)
    return None
def __init__(self, host='localhost', port=27017, name='fireworks',
             username=None, password=None, logdir=None, strm_lvl=None,
             user_indices=None, wf_user_indices=None):
    """
    Create a LaunchPad: connect to the MongoDB backing store and bind the
    collections used by the workflow engine.

    :param host: (str) MongoDB hostname
    :param port: (int) MongoDB port
    :param name: (str) database name
    :param username: (str) DB username; authentication is skipped if None
    :param password: (str) DB password
    :param logdir: (str) directory for log files (None -> stream only)
    :param strm_lvl: (str) stream log level; defaults to 'INFO'
    :param user_indices: (list) extra indices on the fireworks collection
    :param wf_user_indices: (list) extra indices on the workflows collection
    """
    self.host = host
    self.port = port
    self.name = name
    self.username = username
    self.password = password

    # set up logger
    self.logdir = logdir
    self.strm_lvl = strm_lvl if strm_lvl else 'INFO'
    self.m_logger = get_fw_logger('launchpad', l_dir=self.logdir,
                                  stream_level=self.strm_lvl)

    # avoid shared mutable defaults by normalizing None to fresh lists
    self.user_indices = user_indices if user_indices else []
    self.wf_user_indices = wf_user_indices if wf_user_indices else []

    # get connection; j=True requests journaled write acknowledgement
    self.connection = MongoClient(host, port, j=True)
    self.db = self.connection[name]
    # authenticate only when credentials were supplied
    if username:
        self.db.authenticate(username, password)

    # bind the collections the engine reads/writes
    self.fireworks = self.db.fireworks
    self.launches = self.db.launches
    self.offline_runs = self.db.offline_runs
    self.fw_id_assigner = self.db.fw_id_assigner
    self.workflows = self.db.workflows
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1,
              sleep_time=None, strm_lvl='INFO', timeout=None,
              local_redirect=False, pdb_on_exception=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
        pdb_on_exception (bool): forwarded to launch_rocket for each launch
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # has the rapidfire run timed out?
        return (timeout is None or
                (datetime.now() - start_time).total_seconds() < timeout)

    # outer loop: one iteration per "round" of launches, separated by sleeps
    while num_loops != max_loops and time_ok():
        skip_check = False  # this is used to speed operation
        # inner loop: keep launching while work exists (skip_check short-circuits
        # the DB query when we already know another FW is ready)
        while (skip_check or launchpad.run_exists(fworker)) and time_ok():
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger,
                                                prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                # capture stdout/stderr into files inside the launcher dir
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker,
                                               strm_lvl=strm_lvl,
                                               pdb_on_exception=pdb_on_exception)
            else:
                rocket_ran = launch_rocket(launchpad, fworker,
                                           strm_lvl=strm_lvl,
                                           pdb_on_exception=pdb_on_exception)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)

            if nlaunches > 0 and num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh
                # in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False

        # decide whether another round is needed
        if nlaunches == 0:
            # "until completion": stop only when no future runs can appear
            if not launchpad.future_run_exists(fworker):
                break
        elif num_launched == nlaunches:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
def __init__(
        self,
        host="localhost",
        port=27017,
        database="fireworks",
        username=None,
        password=None,
        authsource=None,
        uri_mode=False,
        mongoclient_kwargs=None,
        filepad_coll_name="filepad",
        gridfs_coll_name="filepad_gfs",
        logdir=None,
        strm_lvl=None,
        text_mode=False,
):
    """
    Create a FilePad: connect to MongoDB and bind the filepad and GridFS
    collections used for file storage.

    Args:
        host (str): hostname
        port (int): port number
        database (str): database name
        username (str)
        password (str)
        authsource (str): authSource parameter for MongoDB authentication; defaults to "name" (i.e., db name) if not set
        uri_mode (bool): if set True, all Mongo connection parameters occur through a MongoDB URI string (set as the host).
        mongoclient_kwargs (dict): extra kwargs forwarded to MongoClient
        filepad_coll_name (str): filepad collection name
        gridfs_coll_name (str): gridfs collection name
        logdir (str): path to the log directory
        strm_lvl (str): the logger stream level
        text_mode (bool): whether to use text_mode for file read/write (instead of binary). Might be useful if working
            only with text files between Windows and Unix systems
    """
    self.host = host
    self.port = int(port)
    self.database = database
    self.username = username
    self.password = password
    self.authsource = authsource or self.database
    self.mongoclient_kwargs = mongoclient_kwargs or {}
    self.uri_mode = uri_mode
    self.gridfs_coll_name = gridfs_coll_name
    self.text_mode = text_mode

    # get connection
    if uri_mode:
        # in URI mode, credentials/db are embedded in the host string
        self.connection = MongoClient(host)
        dbname = host.split("/")[-1].split("?")[
            0]  # parse URI to extract dbname
        self.db = self.connection[dbname]
    else:
        self.connection = MongoClient(
            self.host,
            self.port,
            socketTimeoutMS=MONGO_SOCKET_TIMEOUT_MS,
            username=self.username,
            password=self.password,
            authSource=self.authsource,
            **self.mongoclient_kwargs,
        )
        self.db = self.connection[self.database]
    # except Exception:
    #     raise Exception("connection failed")
    # try:
    #     if self.username:
    #         self.db.authenticate(self.username, self.password)
    # except Exception:
    #     raise Exception("authentication failed")

    # set collections: filepad and gridfs
    self.filepad = self.db[filepad_coll_name]
    self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

    # logging
    self.logdir = logdir
    self.strm_lvl = strm_lvl if strm_lvl else "INFO"
    self.logger = get_fw_logger("filepad", l_dir=self.logdir,
                                stream_level=self.strm_lvl)

    # build indexes
    self.build_indexes()
def arlaunch():
    """
    Function rapid-fire job launching

    CLI entry point: parses the ``singleshot`` / ``rapidfire`` / ``multi``
    sub-commands, builds the LaunchPad and FWorker from config files, then
    dispatches to the matching launcher.
    """
    m_description = 'This program launches one or more Rockets. A Rocket retrieves a job from the ' \
                    'central database and runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot',
                                          help='launch a single Rocket')
    rapid_parser = subparsers.add_parser(
        'rapidfire',
        help='launch multiple Rockets (loop until all FireWorks complete)')
    multi_parser = subparsers.add_parser(
        'multi', help='launches multiple Rockets simultaneously')

    # singleshot options
    single_parser.add_argument('-f', '--fw_id',
                               help='specific fw_id to run',
                               default=None, type=int)
    single_parser.add_argument('--offline',
                               help='run in offline mode (FW.json required)',
                               action='store_true')
    single_parser.add_argument('--pdb',
                               help='shortcut to invoke debugger on error',
                               action='store_true')

    # rapidfire options
    rapid_parser.add_argument('--nlaunches',
                              help='num_launches (int or "infinite"; '
                              'default 0 is all jobs in DB)',
                              default=0)
    rapid_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None, type=int)
    rapid_parser.add_argument(
        '--max_loops',
        help='after this many sleep loops, quit even in '
        'infinite nlaunches mode (default -1 is infinite loops)',
        default=-1, type=int)
    rapid_parser.add_argument('--sleep',
                              help='sleep time between loops (secs)',
                              default=None, type=int)
    rapid_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # multi (parallel) options
    multi_parser.add_argument('num_jobs',
                              help='the number of jobs to run in parallel',
                              type=int)
    multi_parser.add_argument('--nlaunches',
                              help='number of FireWorks to run in series per '
                              'parallel job (int or "infinite"; default 0 is '
                              'all jobs in DB)',
                              default=0)
    multi_parser.add_argument(
        '--sleep',
        help='sleep time between loops in infinite launch mode'
        '(secs)',
        default=None, type=int)
    multi_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None, type=int)
    multi_parser.add_argument(
        '--nodefile',
        help='nodefile name or environment variable name '
        'containing the node file name (for populating'
        ' FWData only)',
        default=None, type=str)
    multi_parser.add_argument(
        '--ppn',
        help='processors per node (for populating FWData only)',
        default=1, type=int)
    multi_parser.add_argument('--exclude_current_node',
                              help="Don't use the script launching node"
                              "as compute node",
                              action="store_true")
    multi_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # options shared by all sub-commands
    parser.add_argument('-l', '--launchpad_file',
                        help='path to launchpad file')
    parser.add_argument('-w', '--fworker_file', required=True,
                        help='path to fworker file')
    parser.add_argument('-c', '--config_dir',
                        help='path to a directory containing the config file '
                        '(used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)
    parser.add_argument('--loglvl', help='level to print log messages',
                        default='INFO')
    parser.add_argument('-s', '--silencer',
                        help='shortcut to mute log messages',
                        action='store_true')

    try:
        import argcomplete
        argcomplete.autocomplete(parser)
        # This supports bash autocompletion. To enable this, pip install
        # argcomplete, activate global completion, or add
        #      eval "$(register-python-argcomplete rlaunch)"
        # into your .bash_profile or .bashrc
    except ImportError:
        pass

    args = parser.parse_args()

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # resolve the launchpad file: explicit flag > config dir > global default
    if not args.launchpad_file and os.path.exists(
            os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir,
                                           'my_launchpad.yaml')
    elif not args.launchpad_file:
        args.launchpad_file = LAUNCHPAD_LOC

    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    # offline singleshot runs without a launchpad connection
    if args.command == 'singleshot' and args.offline:
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(
            args.launchpad_file) if args.launchpad_file else LaunchPad(
                strm_lvl=args.loglvl)

    fworker = AiiDAFWorker.from_file(args.fworker_file)

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker=fworker, m_dir=None,
                  nlaunches=args.nlaunches, max_loops=args.max_loops,
                  sleep_time=args.sleep, strm_lvl=args.loglvl,
                  timeout=args.timeout, local_redirect=args.local_redirect)
    elif args.command == 'multi':
        total_node_list = None
        if args.nodefile:
            # the flag may name an env var that holds the actual nodefile path
            if args.nodefile in os.environ:
                args.nodefile = os.environ[args.nodefile]
            with open(args.nodefile, 'r') as fhandle:
                total_node_list = [
                    line.strip() for line in fhandle.readlines()
                ]
        launch_multiprocess(launchpad, fworker, args.loglvl, args.nlaunches,
                            args.num_jobs, args.sleep, total_node_list,
                            args.ppn, timeout=args.timeout,
                            exclude_current_node=args.exclude_current_node,
                            local_redirect=args.local_redirect)
    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl,
                      pdb_on_exception=args.pdb)
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0,
              njobs_queue=10, njobs_block=500, sleep_time=None, reserve=False,
              strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block_* dir unless config forces a new one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        # outer loop: one round of submissions per iteration
        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            # inner loop: submit while under the queue cap and work exists
            while jobs_in_queue < njobs_queue and launchpad.run_exists(
                    fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter,
                                              block_dir, reserve, strm_lvl,
                                              True):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # optimistic local count; re-query qstat every QSTAT_FREQUENCY jobs
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            # nlaunches == 0 means a single round only
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...'.format(sleep_time))
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit here
    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
from fireworks.utilities.fw_serializers import DATETIME_HANDLER
from fireworks.utilities.fw_utilities import get_fw_logger
from fireworks.core.launchpad import LaunchPad
from fireworks.fw_config import WEBSERVER_PERFWARNINGS
import fireworks.flask_site.helpers as fwapp_util
from fireworks.flask_site.util import jsonify

# Flask application serving the FireWorks web GUI
app = Flask(__name__)
app.use_reloader = True
# secret key for session signing; falls back to a random per-process key
app.secret_key = os.environ.get("FWAPP_SECRET_KEY", os.urandom(24))

hello = __name__

# base Mongo query filters shared across views (fireworks / workflows)
app.BASE_Q = {}
app.BASE_Q_WF = {}

logger = get_fw_logger('app')

PER_PAGE = 20  # pagination size for list views
# all FW state names, ordered by their rank
STATES = sorted(Firework.STATE_RANKS, key=Firework.STATE_RANKS.get)


def check_auth(username, password):
    """
    This function is called to check if a username / password combination is
    valid.  Access is open (True) when no credentials are configured.
    """
    AUTH_USER = app.config.get("WEBGUI_USERNAME")
    AUTH_PASSWD = app.config.get("WEBGUI_PASSWORD")
    # no credentials configured -> allow everyone
    if (AUTH_USER is None) or (AUTH_PASSWD is None):
        return True
    # NOTE(review): the comparison against configured credentials is not
    # visible in this chunk — presumably follows here; confirm against the
    # full file before relying on this function.
app = Flask(__name__) # Allow application to run under a service prefix url if os.environ.get("FW_APPLICATION_ROOT"): app.config["APPLICATION_ROOT"] = os.environ.get("FW_APPLICATION_ROOT") app.use_reloader = True app.secret_key = os.environ.get("FWAPP_SECRET_KEY", os.urandom(24)) hello = __name__ app.BASE_Q = {} app.BASE_Q_WF = {} logger = get_fw_logger("app") PER_PAGE = 20 STATES = sorted(Firework.STATE_RANKS, key=Firework.STATE_RANKS.get) def check_auth(username, password): """ This function is called to check if a username / password combination is valid. """ AUTH_USER = app.config.get("WEBGUI_USERNAME") AUTH_PASSWD = app.config.get("WEBGUI_PASSWORD") if (AUTH_USER is None) or (AUTH_PASSWD is None): return True
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.',
                           reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed

    Returns the queue reservation/job id on success, False on failure.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict(
    ))  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    # offline rlaunch needs the fw_id up front, which only reservation provides
    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if launchpad.run_exists(fworker):
        try:
            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir,
                                                     fw_launch_dir)
                    launcher_dir = fw_launch_dir
                    # thread-safe "mkdir -p": tolerate a pre-existing dir
                    try:
                        os.makedirs(launcher_dir)
                    except OSError as exception:
                        if exception.errno != errno.EEXIST:
                            raise
                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir,
                                                        l_logger,
                                                        prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir (non-reservation mode: no launch to update)
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger,
                                                    prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            with cd(launcher_dir):
                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    # failed submission: release the reservation before raising
                    if reserve:
                        l_logger.info(
                            'Un-reserving FW with fw_id, launch_id: {}, {}'.
                            format(fw.fw_id, launch_id))
                        launchpad.cancel_reservation(launch_id)
                    raise RuntimeError(
                        'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                    )
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
        except:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False
    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False
def get_qlogger(self, name):
    """Return a logger for this queue adapter: file-backed when a 'logdir'
    key is configured, otherwise a stream-only CRITICAL logger."""
    if "logdir" not in self:
        return get_fw_logger(name, stream_level="CRITICAL")
    return get_fw_logger(name, self["logdir"])
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.',
                           reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False, fill_mode=False,
                           fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        reservation id on success, False on failure, None when there was
        nothing to submit (soft failure for rapidfire).
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict(
    ))  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    # offline rlaunch needs the fw_id up front, which only reservation provides
    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if fill_mode and reserve:
        raise ValueError(
            "Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError(
            "qlaunch for specific fireworks may only be used in reservation mode."
        )

    if fill_mode or launchpad.run_exists(fworker):
        launch_id = None
        try:
            if reserve:
                if fw_id:
                    l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir,
                                                     fw_id=fw_id)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir,
                                                     fw_launch_dir)
                    launcher_dir = fw_launch_dir
                    makedirs_p(launcher_dir)
                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir,
                                                        l_logger,
                                                        prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir (non-reservation mode: no launch to update)
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger,
                                                    prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            with cd(launcher_dir):
                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    raise RuntimeError(
                        'queue script could not be submitted, check queue '
                        'script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
        except:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            # best-effort cleanup: release the reservation so the FW can run later
            if reserve and launch_id is not None:
                try:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                            fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                    launchpad.forget_offline(launch_id)
                except:
                    log_exception(
                        l_logger,
                        'Error unreserving FW with fw_id {}'.format(fw.fw_id))
            return False
    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.',
                           reserve=False, strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages

    Returns the queue reservation/job id on success, False on failure.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # get the queue adapter
    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict(
    ))  # make a defensive copy, mainly for reservation mode

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if launchpad.run_exists(fworker):
        # initialize BEFORE the try block: the finally clause below reads this
        # name, and the original assigned it only after os.chdir(), so a chdir
        # failure raised UnboundLocalError in finally and masked the real error
        oldlaunch_dir = None
        try:
            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            os.chdir(launcher_dir)

            if '--offline' in qadapter['rocket_launch'] and not reserve:
                raise ValueError(
                    "Must use reservation mode (-r option) of qlaunch when using offline mode (--offline option) of rlaunch!!"
                )
            elif reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # set job name to the FW name (truncated to 20 chars)
                job_name = get_slug(fw.name)
                job_name = job_name[0:20] if len(job_name) > 20 else job_name
                qadapter.update({'job_name': job_name})  # set the job name to FW name

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using FireWork spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # update the exe to include the FW_id
                if 'singleshot' not in qadapter.get('rocket_launch', ''):
                    raise ValueError(
                        'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
                    )
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                if '--offline' in qadapter['rocket_launch']:
                    # handle _launch_dir parameter early...
                    if '_launch_dir' in fw.spec:
                        os.chdir(fw.spec['_launch_dir'])
                        oldlaunch_dir = launcher_dir
                        launcher_dir = os.path.abspath(os.getcwd())
                        launchpad._change_launch_dir(launch_id, launcher_dir)

                    # write FW.json
                    fw.to_file("FW.json")
                    # write Launchid
                    with open('FW_offline.json', 'w') as f:
                        f.write('{"launch_id":%s}' % launch_id)
                    launchpad.add_offline_run(launch_id, fw.fw_id, fw.name)

            # write and submit the queue script using the queue adapter
            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                queue_script = qadapter.get_script_str(launcher_dir)
                f.write(queue_script)
            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError(
                    'queue script could not be submitted, check queue adapter and queue server status!'
                )
            elif reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except Exception:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False
        finally:
            if oldlaunch_dir:
                os.chdir(
                    oldlaunch_dir
                )  # this only matters in --offline mode with _launch_dir!
    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False
def get_qlogger(self, name):
    """Return a logger for this queue adapter: file-backed when a 'logdir'
    key is configured, otherwise a stream-only CRITICAL logger."""
    has_logdir = 'logdir' in self
    return (get_fw_logger(name, self['logdir']) if has_logdir
            else get_fw_logger(name, stream_level='CRITICAL'))
def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it)

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on
            a caught exception.  Default False.

    Returns:
        bool: True if a Firework was checked out and executed (even if it
            FIZZLED); False if no Firework matching the query was ready.
    """
    # Accumulators merged across *all* Firetasks of this Firework.
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode: read the serialized Firework from the current dir
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time by rewriting FW_offline.json in place
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
            d['started_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    ping_stop = None   # event used to stop the heartbeat thread
    btask_stops = []   # events used to stop background-task threads

    try:
        # honor a per-Firework launch directory, if requested (DB mode only)
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            # NOTE(review): os.path.abspath() always returns a non-empty
            # (truthy) string, so this branch can never fire; this looks
            # like it was meant to be os.path.isabs() — confirm upstream.
            if not os.path.abspath(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            # best-effort cleanup of the now-unused original launch dir
            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        # resume a previously interrupted run, if checkpoint data is present
        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task, recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                # 'cp' mode: copy the partial results into the new launch dir
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recovery_dir, launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)
        else:
            starting_task = 0
            # pull declared input files from the previous Firework's outputs
            files_in = m_fw.spec.get("_files_in", {})
            prev_files = m_fw.spec.get("_files_prev", {})
            for f in set(files_in.keys()).intersection(prev_files.keys()):
                # We use zopen for the file objects for transparent handling
                # of zipped files. shutil.copyfileobj does the actual copy
                # in chunks that avoid memory issues.
                with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                    shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            # persist a checkpoint before each task so an interrupted run
            # can be recovered via the '_recovery' spec handled above
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                # a task blew up: record the traceback, stop heartbeats and
                # background tasks, mark the launch FIZZLED, and bail out
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                if pdb_on_exception:
                    pdb.post_mortem()
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    # offline mode: record the FIZZLED result in FW_offline.json
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        d['completed_on'] = datetime.utcnow().isoformat()
                        with zopen(fpath, "wt") as f_out:
                            f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return
            # it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this
            # particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        # the final action carries the merged results of all tasks
        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            # offline mode: record the COMPLETED result in FW_offline.json
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        # the run finished but the DB-side workflow update could not be made;
        # still return True so the caller counts this launch as executed
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except Exception:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the database."
                             " Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(
                                 self.fw_id, final_state, e, self.fw_id))
                return True
        else:
            # offline mode: record the FIZZLED result in FW_offline.json
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0,
              njobs_queue=0, njobs_block=500, sleep_time=None, reserve=False,
              strm_lvl='INFO', timeout=None, fill_mode=False):
    """
    Submit many jobs to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()

    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block dir unless configured to always start fresh
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            # inner loop: submit while (a) the queue has room (njobs_queue=0
            # means unlimited), (b) there is work or fill_mode is on, and
            # (c) we have not hit the timeout
            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker,
                                                     qadapter, block_dir,
                                                     reserve, strm_lvl, True,
                                                     fill_mode)
                if return_code is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)

                # assume our submission occupies one queue slot; only re-poll
                # the queue every QSTAT_FREQUENCY launches to limit qstat load
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or \
                    (timeout and (datetime.now() - start_time).total_seconds() >= timeout):
                break

            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            # fixed: dropped a no-op .format(sleep_time) on a placeholder-free string
            l_logger.info('Checking for Rockets to run...')

    # fixed: was a bare "except:", which also swallowed SystemExit and
    # KeyboardInterrupt, making the launcher impossible to stop with Ctrl-C;
    # narrowed to Exception so those propagate while real errors are logged
    except Exception:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')