def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO',
              timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The process working directory is changed while Rockets run; it is always switched back to
    the base directory (m_dir, or the cwd at call time) before returning, even if a launch raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # a falsy timeout (None or 0) means "no time limit"
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and time_ok():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and time_ok():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)

                if num_launched == nlaunches:
                    break

                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False

            if num_launched == nlaunches or nlaunches == 0:
                break

            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # never leave the caller stranded inside a launcher directory
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The process working directory is changed while Rockets run; it is switched back to the
    base directory (m_dir, or the cwd at call time) before returning, even if a launch raises.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            while launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)

                if num_launched == nlaunches:
                    break

                time.sleep(0.15)  # add a small amount of buffer breathing time for DB to refresh, etc.

            if num_launched == nlaunches or nlaunches == 0:
                break

            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # previously the process was left inside the last launcher directory
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO',
              timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The process working directory is changed while Rockets run; it is always switched back to
    the base directory (m_dir, or the cwd at call time) before returning, even if a launch raises.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def within_time_budget():
        # a falsy timeout (None or 0) disables the time limit
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and within_time_budget():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and within_time_budget():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)

                if num_launched == nlaunches:
                    break

                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False

            if num_launched == nlaunches or nlaunches == 0:
                break

            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # restore the base directory on every exit path, including exceptions
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The process working directory is changed while Rockets run; it is always switched back to
    the base directory (m_dir, or the cwd at call time) before returning, even if a launch raises.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            skip_check = False  # this is used to speed operation
            while skip_check or launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)

                if num_launched == nlaunches:
                    break

                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False

            if num_launched == nlaunches or nlaunches == 0:
                break

            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # restore the base directory on every exit path, including exceptions
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, logdir=None, strm_lvl=None, nlaunches=0, sleep_time=60,
              max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The process working directory is changed while Rockets run; it is switched back to the
    base directory (m_dir, or the cwd at call time) before returning, even if a launch raises.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object
    :param m_dir: the directory in which to loop Rocket running
    :param logdir: directory handed to the logger and to each launched Rocket
    :param strm_lvl: stream level handed to the logger and to each launched Rocket
    :param nlaunches: 0 means 'until completion', -1 means 'infinity'
    :param sleep_time: secs to sleep between rapidfire loop iterations
    :param max_loops: number of loop iterations after which to stop (the default -1 never matches)
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()

    # initialize logger
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    # TODO: delete cruft from MongoTests now that we always chdir() back to curdir

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            while launchpad.run_exists():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                launch_rocket(launchpad, fworker, logdir, strm_lvl)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                time.sleep(0.1)  # add a small amount of buffer breathing time for DB to refresh, etc.

            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info('Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            l_logger.info('Checking for FWs to run...')
    finally:
        # always return to the base directory, whether we finished or failed
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, logdir=None, strm_lvl=None, nlaunches=0, sleep_time=60,
              max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    While Rockets run the process sits inside per-launch subdirectories; on every exit path
    (normal completion or an exception) it is moved back to m_dir, or to the cwd at call time
    when m_dir is not given.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object
    :param m_dir: the directory in which to loop Rocket running
    :param logdir: directory handed to the logger and to each launched Rocket
    :param strm_lvl: stream level handed to the logger and to each launched Rocket
    :param nlaunches: 0 means 'until completion', -1 means 'infinity'
    :param sleep_time: secs to sleep between rapidfire loop iterations
    :param max_loops: number of loop iterations after which to stop (the default -1 never matches)
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()

    # initialize logger
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    # TODO: delete cruft from MongoTests now that we always chdir() back to curdir

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            while launchpad.run_exists():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                launch_rocket(launchpad, fworker, logdir, strm_lvl)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                time.sleep(0.1)  # add a small amount of buffer breathing time for DB to refresh, etc.

            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info('Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            l_logger.info('Checking for FWs to run...')
    finally:
        # the loop leaves us inside the last launcher directory; undo that
        os.chdir(curdir)
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None, nlaunches=0,
              sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: stream level for the logger; also forwarded to launch_rocket_to_queue
    :param nlaunches: stop once this many jobs were launched; 0 stops after a single round of
        launches, "infinite" (or -1) keeps looping
    :param sleep_time: secs to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, jobs are only submitted while
        launchpad.run_exists() is true, otherwise jobs are assumed to always exist
    :param fworker: forwarded to launch_rocket_to_queue
    :param reserve: forwarded to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """
    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)

                # count the launch first so that we do not sleep after the final one
                num_launched += 1
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)

                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    # NOTE(review): the bare except also catches KeyboardInterrupt/SystemExit; kept as-is because
    # callers may rely on this function logging and returning instead of raising.
    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
'--push-wf', dest='push_wf', action='store_true', help='Push demo workflow to DB.') parser.add_argument('-b', '--block', action='store_true', help='Create a new output block.') parser.add_argument('-u', '--update-only', action='store_true', help='Update the DB, but do not process jobs.') parser.add_argument('-d', '--daemon-mode', action='store_true', help='Update the DB, but do not process jobs.') args = parser.parse_args() if args.reset: launchpad.reset('', require_password=False) if args.push_wf: launchpad.add_wf(workflow) if args.block: from fireworks.utilities.fw_utilities import create_datestamp_dir create_datestamp_dir(out_dir, launchpad.m_logger) if not args.update_only: process_offline(launchpad, launcher_args, args.daemon_mode)
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the value returned by qadapter.submit_to_queue() on success; False if there was
        nothing to run/reserve or if writing/submitting the queue script failed
    :raises ValueError: if launcher_dir does not exist or the reserve/offline/singleshot
        options are combined in an unsupported way
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    # use .get() for both checks so a queue adapter without 'rocket_launch' does not raise KeyError
    if '--offline' in qadapter.get('rocket_launch', '') and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if not launchpad.run_exists(fworker):
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return False

    try:
        if reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # update qadapter job_name based on FW name
            job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            # update launcher_dir if _launch_dir is selected in reserved fw
            if '_launch_dir' in fw.spec:
                fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                if not os.path.isabs(fw_launch_dir):
                    fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                launcher_dir = fw_launch_dir

                try:
                    os.makedirs(launcher_dir)
                except OSError as exception:
                    # an already existing directory is fine; anything else is a real error
                    if exception.errno != errno.EEXIST:
                        raise

                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            # create launcher_dir
            launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            # re-read rocket_launch: the FW spec may have updated the queue adapter above
            if '--offline' in qadapter.get('rocket_launch', ''):
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                queue_script = qadapter.get_script_str(launcher_dir)
                f.write(queue_script)

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError(
                    'queue script could not be submitted, check queue script/queue adapter/queue server status!')
            elif reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    # NOTE(review): bare except kept from the original so that every failure is logged and
    # reported to the caller as False.
    except:
        log_exception(l_logger, 'Error writing/submitting queue script!')
        # release the reservation on ANY failure after reserving, not only on a falsy
        # reservation id, so the FW does not stay reserved without a queued job
        if reserve and launch_id is not None:
            reserved_fw_id = getattr(fw, 'fw_id', None)
            try:
                l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(reserved_fw_id, launch_id))
                launchpad.cancel_reservation(launch_id)
            except Exception:
                log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(reserved_fw_id))
        return False
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None, nlaunches=0,
              sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: stream level for the logger; also forwarded to launch_rocket_to_queue
    :param nlaunches: stop once this many jobs were launched; 0 stops after a single round of
        launches, "infinite" (or -1) keeps looping
    :param sleep_time: secs to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, jobs are only submitted while
        launchpad.run_exists() is true, otherwise jobs are assumed to always exist
    :param fworker: forwarded to launch_rocket_to_queue
    :param reserve: forwarded to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """
    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)

                # stop right away once the requested number of launches is reached;
                # sleeping first would only delay the return
                num_launched += 1
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update (read the interval once so the log
                # message and the actual sleep cannot disagree)
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)

                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    # NOTE(review): the bare except also catches KeyboardInterrupt/SystemExit; kept as-is because
    # callers may rely on this function logging and returning instead of raising.
    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0, njobs_block=500,
              sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None, fill_mode=False):
    """
    Repeatedly submit jobs to the queue, organised in block directories under launch_dir.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory in which the block directories are written
        nlaunches (int): total number of launches wanted; "infinite" to keep looping,
            0 to do a single round
        njobs_queue (int): submission pauses once this many jobs are in the queue; 0 = no limit
        njobs_block (int): a new block is started once a block holds this many jobs
        sleep_time (int): secs to sleep between rounds of launches
        reserve (bool): whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which the rapidfire process stops
        fill_mode (bool): submit jobs even when nothing is ready to run
            (non-reservation mode only)
    """
    if not sleep_time:
        sleep_time = RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    if nlaunches == 'infinite':
        nlaunches = -1
    else:
        nlaunches = int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # the launch directory has to be there already
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    launched = 0
    began = datetime.now()

    def out_of_time():
        # truthy only when a timeout was requested and has elapsed
        return timeout and (datetime.now() - began).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        existing_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if not existing_blocks or ALWAYS_CREATE_NEW_BLOCK:
            block_dir = create_datestamp_dir(launch_dir, l_logger)
        else:
            # reuse the most recent block (names sort by datestamp)
            block_dir = os.path.abspath(os.path.join(launch_dir, existing_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))

        while True:
            # poll the queue once per round
            queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            since_poll = 0  # launches since the last poll, for the QSTAT_FREQUENCY option

            while True:
                # same three stop conditions as a single `while` test, checked in order
                if njobs_queue and queued >= njobs_queue:
                    break
                if not (launchpad.run_exists(fworker) or (fill_mode and not reserve)):
                    break
                if out_of_time():
                    break

                l_logger.info('Launching a rocket!')

                # roll over to a fresh block when the current one is full
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # submit exactly one job
                status = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True,
                                                fill_mode)
                if status is None:
                    l_logger.info('No READY jobs detected...')
                    break
                if not status:
                    raise RuntimeError("Launch unsuccessful!")

                launched += 1
                if launched == nlaunches:
                    break

                # give the queue system time to register the submission
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                queued += 1
                since_poll += 1
                if since_poll % QSTAT_FREQUENCY == 0:
                    since_poll = 0
                    queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if launched == nlaunches or nlaunches == 0 or out_of_time():
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False, fill_mode=False, fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The value returned by qadapter.submit_to_queue() on success; False if no FW could be
        reserved or if writing/submitting the queue script failed; None if there was nothing
        to run (a soft failure, see the note at the end of the function).

    Raises:
        ValueError: if launcher_dir does not exist or the reserve/offline/singleshot/
            fill_mode/fw_id options are combined in an unsupported way
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    # use .get() for both checks so a queue adapter without 'rocket_launch' does not raise KeyError
    if '--offline' in qadapter.get('rocket_launch', '') and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    if fill_mode or launchpad.run_exists(fworker):
        launch_id = None
        try:
            if reserve:
                # log for every reservation attempt, not only when a specific fw_id was requested
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
                if not fw:
                    l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug('updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    makedirs_p(launcher_dir)

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):
                # re-read rocket_launch: the FW spec may have updated the queue adapter above
                if '--offline' in qadapter.get('rocket_launch', ''):
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    raise RuntimeError('queue script could not be submitted, check queue '
                                       'script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        # NOTE(review): bare excepts kept from the original so that every failure is logged,
        # the reservation is released, and the caller gets False.
        except:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            if reserve and launch_id is not None:
                try:
                    l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                    launchpad.forget_offline(launch_id)
                except:
                    log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))

            return False

    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0, njobs_block=500,
              sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None, fill_mode=False):
    """
    Submit many jobs to the queue, one at a time, grouping launch directories into blocks.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): where the block directories live
        nlaunches (int): how many launches to perform in total; "infinite" loops,
            0 performs one round
        njobs_queue (int): upper bound on jobs in the queue before submission pauses,
            0 for no limit
        njobs_block (int): number of jobs in a block after which a new block is written
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): queue in reservation mode
        strm_lvl (str): level at which log messages are streamed
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): keep submitting even if nothing is ready to run
            (only in non-reservation mode)
    """
    sleep_time = sleep_time or RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # refuse to run against a launch directory that is not there
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    n_done = 0
    t0 = datetime.now()

    def elapsed_secs():
        return (datetime.now() - t0).total_seconds()

    def queue_has_room(n_queued):
        # njobs_queue == 0 means "no limit"
        return not njobs_queue or n_queued < njobs_queue

    def something_to_submit():
        return launchpad.run_exists(fworker) or (fill_mode and not reserve)

    def time_left():
        return not timeout or elapsed_secs() < timeout

    try:
        l_logger.info('getting queue adapter')

        block_pattern = os.path.join(launch_dir, 'block_*')
        prev_blocks = sorted(glob.glob(block_pattern), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            newest_block = prev_blocks[0]
            block_dir = os.path.abspath(os.path.join(launch_dir, newest_block))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # refresh our view of the queue at the start of each round
            n_queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            n_since_qstat = 0  # this is for QSTAT_FREQUENCY option

            while queue_has_room(n_queued) and something_to_submit() and time_left():
                l_logger.info('Launching a rocket!')

                # start a new block once the current one holds njobs_block jobs
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # one job per iteration
                rc = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True,
                                            fill_mode)
                if rc is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not rc:
                    raise RuntimeError("Launch unsuccessful!")

                n_done += 1
                if n_done == nlaunches:
                    break

                # let the queue system catch up before the next submission
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)

                n_queued += 1
                n_since_qstat += 1
                if n_since_qstat % QSTAT_FREQUENCY == 0:
                    n_since_qstat = 0
                    n_queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if n_done == nlaunches or nlaunches == 0:
                break
            if timeout and elapsed_secs() >= timeout:
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
#==============================================================================
if __name__ == '__main__':
    # Command-line entry point: optionally reset the DB, push the demo workflow and/or
    # create a new output block, then process jobs unless --update-only was given.
    import argparse

    parser = argparse.ArgumentParser(description='Demonstrates Fireworks workflows on OLCF resources.')
    parser.add_argument('--reset', action='store_true', help='Reset the database.')
    parser.add_argument('-p', '--push-wf', dest='push_wf', action='store_true', help='Push demo workflow to DB.')
    parser.add_argument('-b', '--block', action='store_true', help='Create a new output block.')
    parser.add_argument('-u', '--update-only', action='store_true', help='Update the DB, but do not process jobs.')
    # the help text used to be a copy of the --update-only one; the flag is only forwarded
    # to process_offline() below
    parser.add_argument('-d', '--daemon-mode', action='store_true', help='Process jobs in daemon mode.')

    args = parser.parse_args()

    if args.reset:
        launchpad.reset('', require_password=False)

    if args.push_wf:
        launchpad.add_wf(workflow)

    if args.block:
        from fireworks.utilities.fw_utilities import create_datestamp_dir
        create_datestamp_dir(out_dir, launchpad.m_logger)

    if not args.update_only:
        process_offline(launchpad, launcher_args, args.daemon_mode)
def rapidfire(
    launchpad,
    fworker,
    qadapter,
    launch_dir=".",
    block_dir=None,
    nlaunches=0,
    njobs_queue=0,
    njobs_block=500,
    sleep_time=None,
    reserve=False,
    strm_lvl="INFO",
    timeout=None,
    fill_mode=False,
):
    """
    Submit many jobs to the queue.

    Jobs are submitted one at a time through launch_rocket_to_queue() into a
    "block" directory below launch_dir. Any Exception raised after the launch
    directory check (including the RuntimeError for a failed launch and the
    ValueError for a badly named block_dir) is logged through log_exception()
    and not re-raised.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        block_dir (str): directory to use as block dir. Can be a new or existing block.
            Dirname must start with 'block_'.
        nlaunches (int): total number of launches desired; "infinite" (or a negative
            value) keeps looping, 0 keeps starting rounds until
            launchpad.future_run_exists(fworker) is false.
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit.
            If 0 skips the check on the number of jobs in the queue.
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
            (RAPIDFIRE_SLEEP_SECS when falsy)
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist (raised before the logged section).
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == "infinite" else int(nlaunches)
    l_logger = get_fw_logger("queue.launcher", l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(f"Desired launch directory {launch_dir} does not exist!")

    num_launched = 0
    start_time = datetime.now()
    try:
        l_logger.info("getting queue adapter")

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, "block_*")), reverse=True)
        if block_dir is not None:
            if not block_dir.startswith("block_"):
                raise ValueError(f"Invalid name {block_dir}, block dirs must start with 'block_'")
            block_dir = os.path.abspath(os.path.join(launch_dir, block_dir))
            # os.mkdir() has no exist_ok parameter (it raised TypeError here);
            # os.makedirs() accepts it and tolerates an existing block.
            os.makedirs(block_dir, exist_ok=True)
        elif prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            # reverse-sorted, so the first entry is the lexicographically last block
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info(f"Found previous block, using {block_dir}")
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue if a maximum has been set.
            jobs_in_queue = 0
            if njobs_queue:
                jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while launchpad.run_exists(fworker) or (fill_mode and not reserve):
                if timeout and (datetime.now() - start_time).total_seconds() >= timeout:
                    l_logger.info("Timeout reached.")
                    break

                if njobs_queue and jobs_in_queue >= njobs_queue:
                    l_logger.info(f"Jobs in queue ({jobs_in_queue}) meets/exceeds maximum allowed ({njobs_queue})")
                    break

                l_logger.info("Launching a rocket!")

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(f"Block got bigger than {njobs_block} jobs.")
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(
                    launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True, fill_mode
                )
                if return_code is None:
                    # None is the "nothing to run" signal, as opposed to a failed launch
                    l_logger.info("No READY jobs detected...")
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if nlaunches > 0 and num_launched == nlaunches:
                    l_logger.info(f"Launched allowed number of jobs: {num_launched}")
                    break
                # wait for the queue system to update
                l_logger.info(f"Sleeping for {QUEUE_UPDATE_INTERVAL} seconds...zzz...")
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the submission locally; the queue itself is only
                # re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0 and njobs_queue:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if (
                (nlaunches > 0 and num_launched == nlaunches)
                or (timeout and (datetime.now() - start_time).total_seconds() >= timeout)
                or (nlaunches == 0 and not launchpad.future_run_exists(fworker))
            ):
                break

            l_logger.info(f"Finished a round of launches, sleeping for {sleep_time} secs")
            time.sleep(sleep_time)
            l_logger.info("Checking for Rockets to run...")
    except Exception:
        log_exception(l_logger, "Error with queue launcher rapid fire!")
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False, pdb_on_exception=False):
    """
    Run Rockets back to back inside m_dir, each in its own datestamped
    'launcher_' subdirectory, until a stop condition is met. The working
    directory is switched back to m_dir when the loop ends normally.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): directory in which the launcher subdirectories are created
            (current working directory when falsy)
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of outer loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
            (RAPIDFIRE_SLEEP_SECS when falsy)
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): number of seconds after which to stop the rapidfire process
        local_redirect (bool): run each launch inside the redirect_local() context
        pdb_on_exception (bool): forwarded unchanged to launch_rocket()
    """
    sleep_time = sleep_time or RAPIDFIRE_SLEEP_SECS
    base_dir = m_dir or os.getcwd()
    logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    launched = 0
    began = datetime.now()
    loops_done = 0

    def within_timeout():
        # True while no timeout is set or the elapsed time is still below it
        if timeout is None:
            return True
        elapsed = (datetime.now() - began).total_seconds()
        return elapsed < timeout

    while loops_done != max_loops and within_timeout():
        # when True, the inner loop condition short-circuits and skips a
        # second run_exists() query for a FW already known to be ready
        fw_known_ready = False

        while (fw_known_ready or launchpad.run_exists(fworker)) and within_timeout():
            os.chdir(base_dir)
            run_dir = create_datestamp_dir(base_dir, logger, prefix='launcher_')
            os.chdir(run_dir)

            if not local_redirect:
                ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                    pdb_on_exception=pdb_on_exception)
            else:
                with redirect_local():
                    ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                        pdb_on_exception=pdb_on_exception)

            if ran:
                launched += 1
            elif not os.listdir(run_dir):
                # nothing ran and nothing was written: drop the empty directory
                os.chdir(base_dir)
                os.rmdir(run_dir)

            if nlaunches > 0 and launched == nlaunches:
                break

            fw_known_ready = bool(launchpad.run_exists(fworker))
            if not fw_known_ready:
                # short pause so the DB can refresh in case of a dynamic WF
                time.sleep(0.15)

        # decide whether another outer round is needed
        if nlaunches == 0:
            finished = not launchpad.future_run_exists(fworker)
        else:
            finished = launched == nlaunches
        if finished:
            break

        log_multi(logger, 'Sleeping for {} secs'.format(sleep_time))
        time.sleep(sleep_time)
        loops_done += 1
        log_multi(logger, 'Checking for FWs to run...')

    os.chdir(base_dir)
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=10,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Each job gets its own datestamped 'launcher_' directory inside the current
    block directory. Any Exception raised after the launch directory check
    (including the RuntimeError for a failed launch) is logged through
    log_exception() and not re-raised; KeyboardInterrupt and SystemExit
    propagate to the caller.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
        (RAPIDFIRE_SLEEP_SECS when falsy)
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            # reverse-sorted, so the first entry is the lexicographically last block
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir, reserve, strm_lvl):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the submission locally; the queue itself is only
                # re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit can
        # still stop this potentially endless loop.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=10,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    The block directory is handed to launch_rocket_to_queue() with
    create_launcher_dir=True, so the per-job launcher directory is created
    there. Any Exception raised after the launch directory check (including
    the RuntimeError for a failed launch) is logged through log_exception()
    and not re-raised; KeyboardInterrupt and SystemExit propagate to the
    caller.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
        (RAPIDFIRE_SLEEP_SECS when falsy)
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            # reverse-sorted, so the first entry is the lexicographically last block
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(
                    fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter,
                                              block_dir, reserve, strm_lvl,
                                              True):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the submission locally; the queue itself is only
                # re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit can
        # still stop this potentially endless loop.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False, fill_mode=False, fw_id=None):
    """
    Submit a single job to the queue.

    Writes the queue script produced by the queue adapter into the launch
    directory and submits it. In reservation mode a Firework is reserved
    first and the queue adapter and launch directory are adjusted from its
    spec.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): an FWorker() is created when this is falsy
        qadapter (QueueAdapterBase)
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The value returned by qadapter.submit_to_queue() on success.
        False when reservation mode finds no Firework or when anything fails
        while writing/submitting the script (the error is logged).
        None when fill_mode is off and launchpad.run_exists(fworker) is false.

    Raises:
        ValueError: if launcher_dir does not exist, or if the combination of
            reserve, fill_mode, fw_id and the adapter's 'rocket_launch'
            command is rejected by the checks below.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    # NOTE(review): this lookup uses [] while the next check uses .get();
    # a missing 'rocket_launch' key presumably raises here - confirm.
    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    if fill_mode or launchpad.run_exists(fworker):
        launch_id = None
        try:
            if reserve:
                if fw_id:
                    l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
                if not fw:
                    l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug('updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # a relative _launch_dir is resolved against launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    makedirs_p(launcher_dir)

                    # record the directory actually used on the reserved launch
                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    # raised inside the try on purpose: the handler below
                    # logs it, un-reserves the FW and returns False
                    raise RuntimeError('queue script could not be submitted, check queue '
                                       'script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit;
        # the un-reserve cleanup below runs for those too, so narrowing it
        # would change which interruptions release the reservation.
        except:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            # launch_id is only set once reserve_fw() has returned
            if reserve and launch_id is not None:
                try:
                    l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                        fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                    launchpad.forget_offline(launch_id)
                except:
                    log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))

            return False

    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False,
                           strm_lvl='INFO', create_launcher_dir=False):
    """
    Submit a single job to the queue.

    Writes the queue script produced by the queue adapter into the launch
    directory and submits it. In reservation mode a Firework is reserved
    first; if anything fails after the reservation was made, the reservation
    is cancelled again before returning.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) an FWorker() is created when this is falsy
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the value returned by qadapter.submit_to_queue() on success;
        False when there is nothing to run, when no Firework could be
        reserved, or when writing/submitting the script failed (logged)
    :raises ValueError: if launcher_dir does not exist, or if reserve is
        inconsistent with the adapter's 'rocket_launch' command
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if launchpad.run_exists(fworker):
        try:
            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # a relative _launch_dir is resolved against launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    # create the directory, tolerating one that already exists
                    try:
                        os.makedirs(launcher_dir)
                    except OSError as exception:
                        if exception.errno != errno.EEXIST:
                            raise

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    # raised inside the try on purpose: the handler below
                    # logs it, un-reserves the FW and returns False
                    raise RuntimeError(
                        'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                    )
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except:
            # Kept as broad as the original handler: every failure is logged
            # and reported to the caller as False.
            log_exception(l_logger, 'Error writing/submitting queue script!')
            # Un-reserve on ANY failure after reserve_fw() succeeded, not only
            # when submit_to_queue() returned an empty id; otherwise an error
            # while preparing or writing the script left the FW reserved.
            if reserve and launch_id is not None:
                try:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                except Exception:
                    log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))
            return False

    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False