def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Ask the queue adapter how many jobs are currently queued, retrying on failure.

    Up to QUEUE_RETRY_ATTEMPTS attempts are made. After every attempt that
    either raises or yields None the function sleeps, and the sleep doubles
    each time (30 s, 60 s, 120 s, ...).

    Args:
        qadapter (QueueAdapter): adapter whose ``get_njobs_in_queue()`` is polled
        njobs_queue (int): The desired maximum number of jobs in the queue
            (only used in the log message)
        l_logger (logger): A logger to put errors/info/warnings/etc.

    Return:
        (int) number of jobs reported by the queue adapter

    Raises:
        RuntimeError: if no attempt produced a non-None job count
    """
    wait_secs = 30  # first back-off in seconds; doubled after each unsuccessful attempt

    for _attempt in range(QUEUE_RETRY_ATTEMPTS):
        try:
            n_queued = qadapter.get_njobs_in_queue()
            if n_queued is not None:
                l_logger.info(f"{n_queued} jobs in queue. Maximum allowed by user: {njobs_queue}")
                return n_queued
        except Exception:
            log_exception(l_logger, f"Could not get number of jobs in queue! Sleeping {wait_secs} secs...zzz...")
        # reached both after an exception and after a None answer
        time.sleep(wait_secs)
        wait_secs *= 2

    raise RuntimeError("Unable to determine number of jobs in queue, check queue adapter and queue server status!")
def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Internal method to get the number of jobs in the queue using the given job params.
    In case of failure, automatically retries at certain intervals...

    Up to QUEUE_RETRY_ATTEMPTS attempts are made; the wait between attempts
    starts at 30 seconds and doubles every time.

    Args:
        qadapter (QueueAdapter): adapter whose ``get_njobs_in_queue()`` is polled
        njobs_queue (int): The desired maximum number of jobs in the queue
            (only used in the log message)
        l_logger (logger): A logger to put errors/info/warnings/etc.

    Return:
        (int) number of jobs reported by the queue adapter

    Raises:
        RuntimeError: if no attempt produced a non-None job count
    """
    RETRY_INTERVAL = 30  # initial retry in 30 sec upon failure

    for _ in range(QUEUE_RETRY_ATTEMPTS):
        try:
            jobs_in_queue = qadapter.get_njobs_in_queue()
            if jobs_in_queue is not None:
                l_logger.info('{} jobs in queue. '
                              'Maximum allowed by user: {}'.format(jobs_in_queue, njobs_queue))
                return jobs_in_queue
        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit still
            # abort the potentially very long back-off loop.
            log_exception(l_logger, 'Could not get number of jobs in queue! '
                                    'Sleeping {} secs...zzz...'.format(RETRY_INTERVAL))
        # back off after an exception as well as after a None answer
        time.sleep(RETRY_INTERVAL)
        RETRY_INTERVAL *= 2

    raise RuntimeError('Unable to determine number of jobs in queue, '
                       'check queue adapter and queue server status!')
def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Internal method to get the number of jobs in the queue using the given job params.
    In case of failure, automatically retries at certain intervals...

    Up to QUEUE_RETRY_ATTEMPTS attempts are made; the wait between attempts
    starts at 30 seconds and doubles every time.

    :param qadapter: (QueueAdapter) adapter whose get_njobs_in_queue() is polled
    :param njobs_queue: (int) The desired maximum number of jobs in the queue
        (only used in the log message)
    :param l_logger: (logger) A logger to put errors/info/warnings/etc.
    :return: (int) number of jobs reported by the queue adapter
    :raises RuntimeError: if no attempt produced a non-None job count
    """
    RETRY_INTERVAL = 30  # initial retry in 30 sec upon failure

    for _ in range(QUEUE_RETRY_ATTEMPTS):
        try:
            jobs_in_queue = qadapter.get_njobs_in_queue()
            if jobs_in_queue is not None:
                l_logger.info(
                    '{} jobs in queue. Maximum allowed by user: {}'.format(
                        jobs_in_queue, njobs_queue))
                return jobs_in_queue
        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit still
            # abort the potentially very long back-off loop.
            log_exception(
                l_logger,
                'Could not get number of jobs in queue! Sleeping {} secs...zzz...'
                .format(RETRY_INTERVAL))
        # back off after an exception as well as after a None answer
        time.sleep(RETRY_INTERVAL)
        RETRY_INTERVAL *= 2

    raise RuntimeError(
        'Unable to determine number of jobs in queue, check queue adapter and queue server status!'
    )
def submit_to_queue(self, script_file):
    """
    Submit the job script to the queue and return the job id.

    The submit command is looked up in ``self.q_commands[self.q_type]``.
    Failures after the existence check are logged, not raised, and the
    method then returns None.

    :param script_file: (str) name of the script file to use
    :return: (int) job_id as produced by ``self._parse_jobid``, or None if the
        submission failed or the job id could not be parsed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError(
            'Cannot find script file located at: {}'.format(script_file))

    queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))
    submit_cmd = self.q_commands[self.q_type]["submit_cmd"]

    # submit the job
    try:
        if self.q_type == "Cobalt":
            # Cobalt requires scripts to be executable
            os.chmod(script_file, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
        cmd = [submit_cmd, script_file]

        # For most of the queues handled by common_adapter, it's best to simply submit the file name
        # as an argument. LoadSharingFacility doesn't handle the header section (queue name, nodes, etc)
        # when taking file arguments, so the file needs to be passed as stdin to make it work correctly.
        if self.q_type == 'LoadSharingFacility':
            with open(script_file, 'r') as inputFile:
                p = subprocess.Popen([submit_cmd], stdin=inputFile,
                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        else:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # retrieve the returncode. PBS returns 0 if the job was successful
        if p.returncode == 0:
            try:
                job_id = self._parse_jobid(stdout.decode())
                queue_logger.info(
                    'Job submission was successful and job_id is {}'.format(job_id))
                return job_id
            except Exception as ex:
                # probably error parsing job code
                log_exception(queue_logger,
                              'Could not parse job id following {} due to error {}...'
                              .format(submit_cmd, str(ex)))
        else:
            # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = [
                'Error in job submission with {n} file {f} and cmd {c}'.format(
                    n=self.q_name, f=script_file, c=cmd),
                # decoded so the log shows text rather than a b'...' repr
                'The error response reads: {}'.format(stderr.decode(errors='replace'))]
            log_fancy(queue_logger, msgs, 'error')

    except Exception:
        # random error, e.g. no qsub on machine!
        log_exception(queue_logger,
                      'Running the command: {} caused an error...'
                      .format(submit_cmd))
def submit_to_queue(self, script_file):
    """
    Submit the job script to the queue and return the job id.

    The submit command is looked up in ``self.q_commands[self.q_type]``.
    Failures after the existence check are logged, not raised, and the
    method then returns None.

    :param script_file: (str) name of the script file to use
    :return: (int) job_id as produced by ``self._parse_jobid``, or None if the
        submission failed or the job id could not be parsed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError(
            'Cannot find script file located at: {}'.format(script_file))

    queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))
    submit_cmd = self.q_commands[self.q_type]["submit_cmd"]

    # submit the job
    try:
        if self.q_type == "Cobalt":
            # Cobalt requires scripts to be executable
            os.chmod(script_file, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
        cmd = [submit_cmd, script_file]

        # For most of the queues handled by common_adapter, it's best to simply submit the file name
        # as an argument. LoadSharingFacility doesn't handle the header section (queue name, nodes, etc)
        # when taking file arguments, so the file needs to be passed as stdin to make it work correctly.
        if self.q_type == 'LoadSharingFacility':
            with open(script_file, 'r') as inputFile:
                p = subprocess.Popen([submit_cmd], stdin=inputFile,
                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        else:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # retrieve the returncode. PBS returns 0 if the job was successful
        if p.returncode == 0:
            try:
                job_id = self._parse_jobid(stdout.decode())
                queue_logger.info(
                    'Job submission was successful and job_id is {}'.format(job_id))
                return job_id
            except Exception as ex:
                # probably error parsing job code
                log_exception(queue_logger,
                              'Could not parse job id following {} due to error {}...'
                              .format(submit_cmd, str(ex)))
        else:
            # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = [
                'Error in job submission with {n} file {f} and cmd {c}'.format(
                    n=self.q_name, f=script_file, c=cmd),
                # decoded so the log shows text rather than a b'...' repr
                'The error response reads: {}'.format(stderr.decode(errors='replace'))]
            log_fancy(queue_logger, msgs, 'error')

    except Exception:
        # random error, e.g. no qsub on machine!
        log_exception(queue_logger,
                      'Running the command: {} caused an error...'
                      .format(submit_cmd))
def submit_to_queue(self, script_file):
    """
    Submit the job script to the queue and return the job id.

    Runs ``[self.submit_cmd, script_file]``. Failures after the existence
    check are logged, not raised, and the method then returns None.

    :param script_file: (str) name of the script file to use
    :return: (int) job_id as produced by ``self._parse_jobid``, or None if the
        submission failed or the job id could not be parsed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError(
            'Cannot find script file located at: {}'.format(script_file))

    queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))

    # submit the job
    try:
        cmd = [self.submit_cmd, script_file]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # grab the returncode. PBS returns 0 if the job was successful
        if p.returncode == 0:
            try:
                # the raw pipe contents are handed over unchanged, exactly
                # as p.stdout.read() returned them before
                job_id = self._parse_jobid(stdout)
                queue_logger.info(
                    'Job submission was successful and job_id is {}'.
                    format(job_id))
                return job_id
            except Exception:
                # probably error parsing job code
                log_exception(
                    queue_logger,
                    'Could not parse job id following {}...'.format(
                        self.submit_cmd))
        else:
            # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = [
                'Error in job submission with {n} file {f} and cmd {c}'.
                format(n=self.q_name, f=script_file, c=cmd),
                'The error response reads: {}'.format(stderr)
            ]
            log_fancy(queue_logger, msgs, 'error')

    except Exception:
        # random error, e.g. no qsub on machine!
        # (Exception rather than a bare except, so Ctrl-C is not swallowed)
        log_exception(
            queue_logger,
            'Running the command: {} caused an error...'.format(
                self.submit_cmd))
def submit_to_queue(self, script_file):
    """
    Submit the job script to the queue and return the job id.

    Runs ``[self.submit_cmd, script_file]``. Failures after the existence
    check are logged, not raised, and the method then returns None.

    :param script_file: (str) name of the script file to use
    :return: (int) job_id as produced by ``self._parse_jobid``, or None if the
        submission failed or the job id could not be parsed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError(
            'Cannot find script file located at: {}'.format(
                script_file))

    queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))

    # submit the job
    try:
        cmd = [self.submit_cmd, script_file]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # grab the returncode. PBS returns 0 if the job was successful
        if p.returncode == 0:
            try:
                # the raw pipe contents are handed over unchanged, exactly
                # as p.stdout.read() returned them before
                job_id = self._parse_jobid(stdout)
                queue_logger.info(
                    'Job submission was successful and job_id is {}'.format(
                        job_id))
                return job_id
            except Exception:
                # probably error parsing job code
                log_exception(queue_logger,
                              'Could not parse job id following {}...'.format(
                                  self.submit_cmd))
        else:
            # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = [
                'Error in job submission with {n} file {f} and cmd {c}'.format(
                    n=self.q_name, f=script_file, c=cmd),
                'The error response reads: {}'.format(stderr)]
            log_fancy(queue_logger, msgs, 'error')

    except Exception:
        # random error, e.g. no qsub on machine!
        # (Exception rather than a bare except, so Ctrl-C is not swallowed)
        log_exception(queue_logger,
                      'Running the command: {} caused an error...'.format(
                          self.submit_cmd))
def submit_to_queue(self, queue_params, script_file):
    """
    Submit a script with ``sbatch`` and return the job id.

    For the general contract, see the parent object. Failures after the
    existence check are logged, not raised, and the method then returns None.

    :param queue_params: object providing ``logging_dir`` for the logger
    :param script_file: (str) name of the script file to submit
    :return: (int) job_id, or None if submission or job-id parsing failed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError(
            'Cannot find script file located at: {}'.format(script_file))

    # initialize logger
    slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

    # submit the job
    try:
        cmd = ['sbatch', script_file]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # grab the returncode. SLURM returns 0 if the job was successful
        if p.returncode == 0:
            try:
                # the job id is the fourth whitespace-separated token of the output
                # (presumably 'Submitted batch job <id>' - confirm against sbatch)
                job_id = int(stdout.split()[3])
                slurm_logger.info(
                    'Job submission was successful and job_id is {}'.
                    format(job_id))
                return job_id
            except Exception:
                # probably error parsing job code
                log_exception(slurm_logger,
                              'Could not parse job id following slurm...')
        else:
            # some sbatch error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = [
                'Error in job submission with SLURM file {f} and cmd {c}'.
                format(f=script_file, c=cmd)
            ]
            msgs.append('The error response reads: {}'.format(stderr))
            log_fancy(slurm_logger, 'error', msgs)

    except Exception:
        # random error, e.g. no sbatch on machine!
        # (Exception rather than a bare except, so Ctrl-C is not swallowed)
        log_exception(slurm_logger, 'Running slurm caused an error...')
def submit_to_queue(self, queue_params, script_file):
    """
    Submit a script with ``sbatch`` and return the job id.

    For the general contract, see the parent object. Failures after the
    existence check are logged, not raised, and the method then returns None.

    :param queue_params: object providing ``logging_dir`` for the logger
    :param script_file: (str) name of the script file to submit
    :return: (int) job_id, or None if submission or job-id parsing failed
    :raises ValueError: if script_file does not exist
    """
    if not os.path.exists(script_file):
        raise ValueError('Cannot find script file located at: {}'.format(script_file))

    # initialize logger
    slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

    # submit the job
    try:
        cmd = ['sbatch', script_file]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() followed by
        # read() can deadlock once the child fills an OS pipe buffer.
        stdout, stderr = p.communicate()

        # grab the returncode. SLURM returns 0 if the job was successful
        if p.returncode == 0:
            try:
                # the job id is the fourth whitespace-separated token of the output
                # (presumably 'Submitted batch job <id>' - confirm against sbatch)
                job_id = int(stdout.split()[3])
                slurm_logger.info('Job submission was successful and job_id is {}'.format(job_id))
                return job_id
            except Exception:
                # probably error parsing job code
                log_exception(slurm_logger, 'Could not parse job id following slurm...')
        else:
            # some sbatch error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
            msgs = ['Error in job submission with SLURM file {f} and cmd {c}'.format(f=script_file, c=cmd)]
            msgs.append('The error response reads: {}'.format(stderr))
            log_fancy(slurm_logger, 'error', msgs)

    except Exception:
        # random error, e.g. no sbatch on machine!
        # (Exception rather than a bare except, so Ctrl-C is not swallowed)
        log_exception(slurm_logger, 'Running slurm caused an error...')
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO',
              timeout=None, fill_mode=False):
    """
    Submit many jobs to the queue.

    Jobs are written into ``block_*`` directories under launch_dir. Errors
    raised while launching are logged, not propagated.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
            (falls back to RAPIDFIRE_SLEEP_SECS when falsy)
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()
    try:
        l_logger.info('getting queue adapter')

        # reuse the newest existing block unless configured to always start a fresh one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir,
                                                     reserve, strm_lvl, True, fill_mode)
                if return_code is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the job locally; only re-query the queue every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or \
                    (timeout and (datetime.now() - start_time).total_seconds() >= timeout):
                break

            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception rather than a bare except: Ctrl-C / SystemExit must be
        # able to stop a loop that may otherwise run forever.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=10,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Jobs are written into ``launcher_*`` directories inside ``block_*``
    directories under launch_dir. Errors raised while launching are logged,
    not propagated.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
        (falls back to RAPIDFIRE_SLEEP_SECS when falsy)
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the newest existing block unless configured to always start a fresh one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir,
                                              reserve, strm_lvl):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the job locally; only re-query the queue every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception rather than a bare except: Ctrl-C / SystemExit must be
        # able to stop a loop that may otherwise run forever.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False,
                           strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when falsy
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :return: the reservation id returned by the queue adapter on success,
        False if nothing could be submitted or an error was logged
    :raises ValueError: if launcher_dir does not exist, if the offline option
        is used without reservation mode, or if reservation mode is used
        with a non-singleshot rocket launch
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    oldlaunch_dir = None  # only needed in --offline mode with _launch_dir option

    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if launchpad.run_exists(fworker):
        try:
            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            with cd(launcher_dir):
                if reserve:
                    l_logger.debug('finding a FW to reserve...')
                    fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                    if not fw:
                        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                        return False
                    l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                    # update qadapter job_name based on FW name
                    job_name = get_slug(fw.name)[0:20]
                    qadapter.update({'job_name': job_name})

                    if '_queueadapter' in fw.spec:
                        l_logger.debug('updating queue params using Firework spec..')
                        qadapter.update(fw.spec['_queueadapter'])

                    # reservation mode includes --fw_id in rocket launch
                    qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # offline implies reserve (enforced by the ValueError above),
                # so fw and launch_id are always set when this branch runs
                if '--offline' in qadapter['rocket_launch']:
                    # handle _launch_dir parameter now b/c we can't call
                    # launchpad.change_launch_dir() later on in offline mode
                    if '_launch_dir' in fw.spec:
                        os.chdir(fw.spec['_launch_dir'])
                        oldlaunch_dir = launcher_dir
                        launcher_dir = os.path.abspath(os.getcwd())
                        launchpad.change_launch_dir(launch_id, launcher_dir)

                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    if reserve:
                        # give the FW back so another launch can pick it up
                        l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                        launchpad.cancel_reservation(launch_id)
                    raise RuntimeError('queue script could not be submitted, check queue script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
                return reservation_id

        except Exception:
            # Exception rather than a bare except, so Ctrl-C / SystemExit propagate
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False

        finally:
            if oldlaunch_dir:
                os.chdir(oldlaunch_dir)  # this only matters in --offline mode with _launch_dir!

    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return False
def launch_rocket_to_queue(queue_params, launcher_dir='.', strm_lvl=None, launchpad=None,
                           fworker=None, reserve=False):
    """
    Submit a single job to the queue.

    The process changes its working directory to launcher_dir and does not
    change back. Errors raised while writing or submitting the script are
    logged, not propagated.

    :param queue_params: A QueueParams instance
    :param launcher_dir: The directory where to submit the job
    :param strm_lvl: level at which to stream log messages
    :param launchpad: LaunchPad used to check for and reserve jobs; when
        falsy, a job is submitted without checking
    :param fworker: FWorker; a default FWorker() is used when falsy
    :param reserve: (bool) Whether to queue in reservation mode
    :raises ValueError: if launcher_dir does not exist
    """
    # TODO: move the jobs_exist code here, so the singleshot() also knows if a job exists before submitting to the queue!
    fworker = fworker if fworker else FWorker()

    # convert launch_dir to absolute path
    launcher_dir = os.path.abspath(launcher_dir)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    jobs_exist = not launchpad or launchpad.run_exists()

    if jobs_exist:
        try:
            # get the queue adapter
            l_logger.debug('getting queue adapter')
            qa = queue_params.qa

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            os.chdir(launcher_dir)

            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
                l_logger.debug('reserved FW with fw_id: {}'.format(fw.fw_id))
                if '_queueparams' in fw.spec:
                    l_logger.debug('updating queue params using FireWork spec..')
                    # TODO: make sure this does not affect future FireWorks!!
                    queue_params.params.update(fw.spec['_queueparams'])
                # update the exe to include the FW_id
                # (checked after the spec update because the spec may replace 'exe')
                if 'singleshot' not in queue_params.params['exe']:
                    raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')
                queue_params.params['exe'] += ' --fw_id {}'.format(fw.fw_id)

            # write and submit the queue script using the queue adapter
            l_logger.debug('writing queue script')
            # build the script before opening the file, so a failed generation
            # does not leave an empty submit script behind
            queue_script = qa.get_script_str(queue_params, launcher_dir)
            if not queue_script:
                raise RuntimeError('queue script could not be written, check job params and queue adapter!')
            with open(FWConfig().SUBMIT_SCRIPT_NAME, 'w') as f:
                f.write(queue_script)

            l_logger.info('submitting queue script')
            reservation_id = qa.submit_to_queue(queue_params, FWConfig().SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError('queue script could not be submitted, check queue adapter and queue server status!')
            elif reserve:
                launchpad._set_reservation_id(launch_id, reservation_id)

        except Exception:
            # Exception rather than a bare except, so Ctrl-C / SystemExit propagate
            log_exception(l_logger, 'Error writing/submitting queue script!')
    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None,
              nlaunches=0, sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    Jobs are written into ``launcher_*`` directories inside a freshly created
    block directory under launch_dir. Errors raised while launching are
    logged, not propagated.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: level at which to stream log messages
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param sleep_time: secs to sleep between rapidfire loop iterations
    :param launchpad: LaunchPad used to check whether jobs exist; when falsy,
        jobs are assumed to exist
    :param fworker: FWorker passed through to launch_rocket_to_queue
    :param reserve: (bool) Whether to queue in reservation mode
    :raises ValueError: if launch_dir does not exist
    """
    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)
                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # refresh both loop conditions after every launch
                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception rather than a bare except: Ctrl-C / SystemExit must be
        # able to stop a loop that may otherwise run forever.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False,
                           strm_lvl='INFO', create_launcher_dir=False):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when falsy
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the reservation id returned by the queue adapter on success,
        False if nothing could be submitted or an error was logged
    :raises ValueError: if launcher_dir does not exist, if the offline option
        is used without reservation mode, or if reservation mode is used
        with a non-singleshot rocket launch
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if launchpad.run_exists(fworker):
        try:
            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # relative paths are resolved against the requested launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    try:
                        os.makedirs(launcher_dir)
                    except OSError as exception:
                        # an already existing directory is fine, anything else is not
                        if exception.errno != errno.EEXIST:
                            raise

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):
                # offline implies reserve (enforced by the ValueError above),
                # so fw and launch_id are always set when this branch runs
                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    if reserve:
                        # give the FW back so another launch can pick it up
                        l_logger.info(
                            'Un-reserving FW with fw_id, launch_id: {}, {}'.
                            format(fw.fw_id, launch_id))
                        launchpad.cancel_reservation(launch_id)
                    raise RuntimeError(
                        'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                    )
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
                return reservation_id

        except Exception:
            # Exception rather than a bare except, so Ctrl-C / SystemExit propagate
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False

    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False
def launch_rocket_to_queue(queue_params, launcher_dir='.', strm_lvl=None, launchpad=None, fworker=None, reserve=False):
    """
    Submit a single job to the queue.

    The process changes its working directory to ``launcher_dir``, writes the
    queue script there and hands it to the queue adapter. Errors raised while
    reserving, writing or submitting are logged via ``log_exception`` and are
    not propagated; the function has no return value.

    :param queue_params: A QueueParams instance
    :param launcher_dir: The directory where to submit the job
    :param strm_lvl: stream level handed to the logger
    :param launchpad: LaunchPad consulted for runnable jobs (and used to
        reserve one in reservation mode); if falsy, a job is assumed to exist
    :param fworker: FWorker used when reserving; a default FWorker() is
        created when falsy
    :param reserve: whether to reserve a FireWork and pin the queue job to it
        through a ``--fw_id`` argument
    :raises ValueError: if ``launcher_dir`` does not exist
    """
    # TODO: move the jobs_exist code here, so the singleshot() also knows if a job exists before submitting to the queue!
    fworker = fworker if fworker else FWorker()

    # convert launch_dir to absolute path
    launcher_dir = os.path.abspath(launcher_dir)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    jobs_exist = not launchpad or launchpad.run_exists()
    if not jobs_exist:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return

    # Snapshot of queue_params.params taken before reservation mode edits it.
    # The same queue_params object is reused for later launches, so the
    # per-FireWork overrides and the appended --fw_id must not leak into them.
    # NOTE(review): assumes params is a plain mutable mapping - confirm.
    saved_params = None
    try:
        # get the queue adapter
        l_logger.debug('getting queue adapter')
        qa = queue_params.qa

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))
        os.chdir(launcher_dir)

        if reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
            if not fw:
                # nothing could be reserved, so there is nothing to submit
                l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                return
            l_logger.debug('reserved FW with fw_id: {}'.format(fw.fw_id))

            saved_params = dict(queue_params.params)
            if '_queueparams' in fw.spec:
                l_logger.debug('updating queue params using FireWork spec..')
                queue_params.params.update(fw.spec['_queueparams'])

            # update the exe to include the FW_id
            if 'singleshot' not in queue_params.params['exe']:
                raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')
            queue_params.params['exe'] += ' --fw_id {}'.format(fw.fw_id)

        # write and submit the queue script using the queue adapter
        l_logger.debug('writing queue script')
        submit_script_name = FWConfig().SUBMIT_SCRIPT_NAME
        # build the script before opening the file so that a failed
        # generation does not leave an empty submit script behind
        queue_script = qa.get_script_str(queue_params, launcher_dir)
        if not queue_script:
            raise RuntimeError('queue script could not be written, check job params and queue adapter!')
        with open(submit_script_name, 'w') as f:
            f.write(queue_script)

        l_logger.info('submitting queue script')
        reservation_id = qa.submit_to_queue(queue_params, submit_script_name)
        if not reservation_id:
            raise RuntimeError('queue script could not be submitted, check queue adapter and queue server status!')
        elif reserve:
            launchpad._set_reservation_id(launch_id, reservation_id)

    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        log_exception(l_logger, 'Error writing/submitting queue script!')
    finally:
        if saved_params is not None:
            queue_params.params.clear()
            queue_params.params.update(saved_params)
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None, nlaunches=0, sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    Jobs are submitted one at a time, each in its own ``launcher_`` directory
    inside a datestamped block directory, while fewer than ``njobs_queue``
    jobs are in the queue and jobs exist. Errors raised inside the loop are
    logged via ``log_exception`` and end the function without propagating.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: stream level handed to the logger
    :param nlaunches: total number of launches desired; 'infinite' to keep
        looping, 0 for a single round
    :param sleep_time: seconds to sleep between rounds of launches
    :param launchpad: LaunchPad consulted for runnable jobs; if falsy, jobs
        are assumed to exist
    :param fworker: passed through to launch_rocket_to_queue
    :param reserve: passed through to launch_rocket_to_queue
    :raises ValueError: if ``launch_dir`` does not exist
    """
    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)
                num_launched += 1
                # stop before sleeping: there is no point waiting for the
                # queue to update once the requested number was launched
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)

                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            # nlaunches == 0 means "one round only"
            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def rapidfire(
    launchpad,
    fworker,
    qadapter,
    launch_dir=".",
    block_dir=None,
    nlaunches=0,
    njobs_queue=0,
    njobs_block=500,
    sleep_time=None,
    reserve=False,
    strm_lvl="INFO",
    timeout=None,
    fill_mode=False,
):
    """
    Submit many jobs to the queue.

    Errors raised once the launch loop has been entered (including an invalid
    ``block_dir`` name) are logged via ``log_exception`` and end the function
    without propagating.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        block_dir (str): directory to use as block dir. Can be a new or existing block.
            Dirname must start with 'block_'.
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit.
            If 0 skips the check on the number of jobs in the queue.
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == "infinite" else int(nlaunches)
    l_logger = get_fw_logger("queue.launcher", l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(f"Desired launch directory {launch_dir} does not exist!")

    num_launched = 0
    start_time = datetime.now()
    try:
        l_logger.info("getting queue adapter")

        # newest block first (block names sort lexicographically)
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, "block_*")), reverse=True)
        if block_dir is not None:
            if not block_dir.startswith("block_"):
                raise ValueError(f"Invalid name {block_dir}, block dirs must start with 'block_")
            block_dir = os.path.abspath(os.path.join(launch_dir, block_dir))
            # os.mkdir() does not accept exist_ok; makedirs() does, which lets
            # block_dir name either a new or an already existing block
            os.makedirs(block_dir, exist_ok=True)
        elif prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info(f"Found previous block, using {block_dir}")
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue if a maximum has been set.
            jobs_in_queue = 0
            if njobs_queue:
                jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while launchpad.run_exists(fworker) or (fill_mode and not reserve):
                if timeout and (datetime.now() - start_time).total_seconds() >= timeout:
                    l_logger.info("Timeout reached.")
                    break

                if njobs_queue and jobs_in_queue >= njobs_queue:
                    l_logger.info(f"Jobs in queue ({jobs_in_queue}) meets/exceeds maximum allowed ({njobs_queue})")
                    break

                l_logger.info("Launching a rocket!")

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(f"Block got bigger than {njobs_block} jobs.")
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(
                    launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True, fill_mode
                )
                if return_code is None:
                    # None is the "soft failure" signal: nothing was ready to run
                    l_logger.info("No READY jobs detected...")
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")

                num_launched += 1
                if nlaunches > 0 and num_launched == nlaunches:
                    l_logger.info(f"Launched allowed number of jobs: {num_launched}")
                    break

                # wait for the queue system to update
                l_logger.info(f"Sleeping for {QUEUE_UPDATE_INTERVAL} seconds...zzz...")
                time.sleep(QUEUE_UPDATE_INTERVAL)

                # count the submission locally and only re-query the queue
                # every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0 and njobs_queue:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if (
                (nlaunches > 0 and num_launched == nlaunches)
                or (timeout and (datetime.now() - start_time).total_seconds() >= timeout)
                or (nlaunches == 0 and not launchpad.future_run_exists(fworker))
            ):
                break

            l_logger.info(f"Finished a round of launches, sleeping for {sleep_time} secs")
            time.sleep(sleep_time)
            l_logger.info("Checking for Rockets to run...")
    except Exception:
        log_exception(l_logger, "Error with queue launcher rapid fire!")
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    Errors raised while reserving, writing or submitting (including the
    offline/reservation-mode consistency checks) are logged via
    ``log_exception`` and reported through a False return value.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :return: the reservation id returned by the queue adapter on success,
        False if nothing could be submitted
    :raises ValueError: if ``launcher_dir`` does not exist
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # get the queue adapter
    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if not launchpad.run_exists(fworker):
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return False

    # Bound before the try block: the finally clause reads it, and it must
    # exist even when the very first statement inside the try raises.
    oldlaunch_dir = None
    try:
        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))
        os.chdir(launcher_dir)

        if '--offline' in qadapter['rocket_launch'] and not reserve:
            raise ValueError(
                "Must use reservation mode (-r option) of qlaunch when using offline mode (--offline option) of rlaunch!!")
        elif reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # set the job name to the FW name, truncated to 20 characters
            # (slicing already leaves shorter names untouched)
            job_name = get_slug(fw.name)[0:20]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using FireWork spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # update the exe to include the FW_id
            if 'singleshot' not in qadapter.get('rocket_launch', ''):
                raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '--offline' in qadapter['rocket_launch']:
                # handle _launch_dir parameter early...
                if '_launch_dir' in fw.spec:
                    os.chdir(fw.spec['_launch_dir'])
                    oldlaunch_dir = launcher_dir
                    launcher_dir = os.path.abspath(os.getcwd())
                    launchpad._change_launch_dir(launch_id, launcher_dir)

                # write FW.json
                fw.to_file("FW.json")
                # write Launchid
                with open('FW_offline.json', 'w') as f:
                    f.write('{"launch_id":%s}' % launch_id)

                launchpad.add_offline_run(launch_id, fw.fw_id, fw.name)

        # write and submit the queue script using the queue adapter
        l_logger.debug('writing queue script')
        with open(SUBMIT_SCRIPT_NAME, 'w') as f:
            queue_script = qadapter.get_script_str(launcher_dir)
            f.write(queue_script)

        l_logger.info('submitting queue script')
        reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
        if not reservation_id:
            raise RuntimeError('queue script could not be submitted, check queue adapter and queue server status!')
        elif reserve:
            launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        log_exception(l_logger, 'Error writing/submitting queue script!')
        return False

    finally:
        if oldlaunch_dir:
            os.chdir(oldlaunch_dir)  # this only matters in --offline mode with _launch_dir!
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0, njobs_block=500,
              sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None, fill_mode=False):
    """
    Submit many jobs to the queue.

    Errors raised once the launch loop has been entered are logged via
    ``log_exception`` and end the function without propagating.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit.
            With 0 the number of jobs in the queue is not queried at all.
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()
    try:
        l_logger.info('getting queue adapter')

        # newest block first (block names sort lexicographically)
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue; skipped when no maximum is set,
            # since the value is never compared against anything in that
            # case and a failing queue query would abort the launches
            jobs_in_queue = 0
            if njobs_queue:
                jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir, reserve, strm_lvl,
                                                     True, fill_mode)
                if return_code is None:
                    # None is the "soft failure" signal: nothing was ready to run
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")

                num_launched += 1
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)

                # count the submission locally and only re-query the queue
                # every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    if njobs_queue:
                        jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            # nlaunches == 0 means "one round only"
            if num_launched == nlaunches or nlaunches == 0 or \
                    (timeout and (datetime.now() - start_time).total_seconds() >= timeout):
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.',
                           reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False, fill_mode=False,
                           fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The reservation id returned by the queue adapter on success; False
        if reserving/writing/submitting failed (the error is logged and a
        reserved Firework is un-reserved again); None if there was nothing
        to run.

    Raises:
        ValueError: if launcher_dir does not exist or the combination of
            options is invalid
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    # make a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError(
            "Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError(
            "qlaunch for specific fireworks may only be used in reservation mode.")

    if not (fill_mode or launchpad.run_exists(fworker)):
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
        return None

    try:
        if reserve:
            if fw_id:
                l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir,
                                                 fw_id=fw_id)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # update qadapter job_name based on FW name
            job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            # update launcher_dir if _launch_dir is selected in reserved fw
            if '_launch_dir' in fw.spec:
                fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                # relative _launch_dir values are resolved against launcher_dir
                if not os.path.isabs(fw_launch_dir):
                    fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                launcher_dir = fw_launch_dir
                makedirs_p(launcher_dir)
                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger,
                                                    prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            # create launcher_dir
            launcher_dir = create_datestamp_dir(launcher_dir, l_logger,
                                                prefix='launcher_')

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                queue_script = qadapter.get_script_str(launcher_dir)
                f.write(queue_script)

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError('queue script could not be submitted, check queue '
                                   'script/queue adapter/queue server status!')
            elif reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

    except BaseException as exc:
        # Ordinary errors are logged and reported as False. Interrupts
        # (KeyboardInterrupt, SystemExit, ...) are re-raised below, but only
        # after the reservation has been released, so that the Firework is
        # not left reserved.
        is_error = isinstance(exc, Exception)
        if is_error:
            log_exception(l_logger, 'Error writing/submitting queue script!')
        if reserve and launch_id is not None:
            try:
                l_logger.info(
                    'Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                        fw.fw_id, launch_id))
                launchpad.cancel_reservation(launch_id)
                launchpad.forget_offline(launch_id)
            except Exception:
                log_exception(
                    l_logger,
                    'Error unreserving FW with fw_id {}'.format(fw.fw_id))
        if not is_error:
            raise
        return False
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False, fill_mode=False, fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The reservation id returned by the queue adapter on success; False if
        reserving/writing/submitting failed (the error is logged and a reserved
        Firework is un-reserved again); None if there was nothing to run.

    Raises:
        ValueError: if launcher_dir does not exist or the combination of options is invalid
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    if not (fill_mode or launchpad.run_exists(fworker)):
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()

    try:
        if reserve:
            if fw_id:
                l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
            if not fw:
                l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # update qadapter job_name based on FW name
            job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            # update launcher_dir if _launch_dir is selected in reserved fw
            if '_launch_dir' in fw.spec:
                fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                # relative _launch_dir values are resolved against launcher_dir
                if not os.path.isabs(fw_launch_dir):
                    fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                launcher_dir = fw_launch_dir
                makedirs_p(launcher_dir)
                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            # create launcher_dir
            launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                queue_script = qadapter.get_script_str(launcher_dir)
                f.write(queue_script)

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError('queue script could not be submitted, check queue '
                                   'script/queue adapter/queue server status!')
            elif reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

    except BaseException as exc:
        # Ordinary errors are logged and reported as False. Interrupts
        # (KeyboardInterrupt, SystemExit, ...) are re-raised below, but only after
        # the reservation has been released, so the Firework is not left reserved.
        is_error = isinstance(exc, Exception)
        if is_error:
            log_exception(l_logger, 'Error writing/submitting queue script!')
        if reserve and launch_id is not None:
            try:
                l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                    fw.fw_id, launch_id))
                launchpad.cancel_reservation(launch_id)
                launchpad.forget_offline(launch_id)
            except Exception:
                log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))
        if not is_error:
            raise
        return False
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0,
              njobs_queue=10, njobs_block=500, sleep_time=None, reserve=False,
              strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Errors raised once the launch loop has been entered are logged via
    ``log_exception`` and end the function without propagating.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if ``launch_dir`` does not exist
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # newest block first (block names sort lexicographically)
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter,
                                              block_dir, reserve, strm_lvl,
                                              True):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break

                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)

                # count the submission locally and only re-query the queue
                # every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            # nlaunches == 0 means "one round only"
            if num_launched == nlaunches or nlaunches == 0:
                break

            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')
    except Exception:
        # KeyboardInterrupt / SystemExit are deliberately not caught here
        log_exception(l_logger, 'Error with queue launcher rapid fire!')