Exemple #1
0
def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Ask the queue adapter how many jobs are queued, retrying when the query fails.

    At most QUEUE_RETRY_ATTEMPTS queries are made. After every attempt that
    raises or returns None the function sleeps and then doubles the sleep
    interval (the first sleep is 30 seconds).

    Args:
        qadapter (QueueAdapter): object providing get_njobs_in_queue()
        njobs_queue (int): The desired maximum number of jobs in the queue
            (only used in the log message here)
        l_logger (logger): A logger to put errors/info/warnings/etc.

    Return:
        (int) the non-None value returned by qadapter.get_njobs_in_queue()

    Raises:
        RuntimeError: if no attempt produced a non-None value
    """
    backoff_secs = 30  # first wait; doubled after each unsuccessful attempt

    for _attempt in range(QUEUE_RETRY_ATTEMPTS):
        try:
            n_queued = qadapter.get_njobs_in_queue()
            if n_queued is not None:
                summary = f"{n_queued} jobs in queue. Maximum allowed by user: {njobs_queue}"
                l_logger.info(summary)
                return n_queued
        except Exception:
            failure = f"Could not get number of jobs in queue! Sleeping {backoff_secs} secs...zzz..."
            log_exception(l_logger, failure)
        # reached both after an exception and after a None result
        time.sleep(backoff_secs)
        backoff_secs *= 2

    raise RuntimeError("Unable to determine number of jobs in queue, check queue adapter and queue server status!")
def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Internal method to get the number of jobs in the queue using the given job params.
    In case of failure, automatically retries at certain intervals...

    Makes up to QUEUE_RETRY_ATTEMPTS queries. After each attempt that raises or
    returns None it sleeps, then doubles the sleep interval (initially 30 secs).

    Args:
        qadapter (QueueAdapter)
        njobs_queue (int): The desired maximum number of jobs in the queue
            (only used in the log message here)
        l_logger (logger): A logger to put errors/info/warnings/etc.

    Return:
        (int) the non-None value returned by qadapter.get_njobs_in_queue()

    Raises:
        RuntimeError: if every attempt failed or returned None
    """
    retry_interval = 30  # initial retry in 30 sec upon failure

    for _ in range(QUEUE_RETRY_ATTEMPTS):
        try:
            jobs_in_queue = qadapter.get_njobs_in_queue()
            if jobs_in_queue is not None:
                l_logger.info('{} jobs in queue. '
                              'Maximum allowed by user: {}'.format(jobs_in_queue, njobs_queue))
                return jobs_in_queue
        except Exception:
            # Catch Exception rather than everything, so that KeyboardInterrupt and
            # SystemExit abort the retry loop instead of being logged and slept through.
            log_exception(l_logger, 'Could not get number of jobs in queue! '
                                    'Sleeping {} secs...zzz...'.format(retry_interval))
        time.sleep(retry_interval)
        retry_interval *= 2

    raise RuntimeError('Unable to determine number of jobs in queue, '
                       'check queue adapter and queue server status!')
def _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger):
    """
    Internal method to get the number of jobs in the queue using the given job params.
    In case of failure, automatically retries at certain intervals...

    Makes up to QUEUE_RETRY_ATTEMPTS queries. After each attempt that raises or
    returns None it sleeps, then doubles the sleep interval (initially 30 secs).

    :param qadapter: (QueueAdapter)
    :param njobs_queue: (int) The desired maximum number of jobs in the queue
        (only used in the log message here)
    :param l_logger: (logger) A logger to put errors/info/warnings/etc.
    :return: the non-None value returned by qadapter.get_njobs_in_queue()
    :raises RuntimeError: if every attempt failed or returned None
    """

    retry_interval = 30  # initial retry in 30 sec upon failure

    for _ in range(QUEUE_RETRY_ATTEMPTS):
        try:
            jobs_in_queue = qadapter.get_njobs_in_queue()
            if jobs_in_queue is not None:
                l_logger.info(
                    '{} jobs in queue. Maximum allowed by user: {}'.format(
                        jobs_in_queue, njobs_queue))
                return jobs_in_queue
        except Exception:
            # Catch Exception rather than everything, so that KeyboardInterrupt and
            # SystemExit abort the retry loop instead of being logged and slept through.
            log_exception(
                l_logger,
                'Could not get number of jobs in queue! Sleeping {} secs...zzz...'
                .format(retry_interval))
        time.sleep(retry_interval)
        retry_interval *= 2

    raise RuntimeError(
        'Unable to determine number of jobs in queue, check queue adapter and queue server status!'
    )
    def submit_to_queue(self, script_file):
        """
        Submit the job script to the queue and return the job id.

        The submit command is looked up in self.q_commands[self.q_type]. For the
        'Cobalt' queue type the script is first made executable. For
        'LoadSharingFacility' the script is fed to the submit command on stdin;
        for every other queue type the script path is passed as an argument.

        :param script_file: (str) name of the script file to use
        :return: the job id returned by self._parse_jobid() on success; None when
            the submission failed or its output could not be parsed (the failure
            is logged, not raised)
        :raises ValueError: if script_file does not exist
        """
        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(
                    script_file))

        queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))
        submit_cmd = self.q_commands[self.q_type]["submit_cmd"]
        # submit the job
        try:
            if self.q_type == "Cobalt":
                # Cobalt requires scripts to be executable
                os.chmod(script_file, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
            cmd = [submit_cmd, script_file]
            # For most of the queues handled by common_adapter, it's best to simply submit the file name
            # as an argument.  LoadSharingFacility doesn't handle the header section (queue name, nodes, etc)
            # when taking file arguments, so the file needs to be passed as stdin to make it work correctly.
            if self.q_type == 'LoadSharingFacility':
                with open(script_file, 'r') as input_file:
                    p = subprocess.Popen([submit_cmd], stdin=input_file,
                                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            else:
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() followed by read()
            # can deadlock once the child fills a pipe buffer, and leaves the pipes open.
            out, err = p.communicate()

            # retrieve the returncode. PBS returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    job_id = self._parse_jobid(out.decode())
                    queue_logger.info(
                        'Job submission was successful and job_id is {}'.format(
                            job_id))
                    return job_id
                except Exception as ex:
                    # probably error parsing job code
                    log_exception(queue_logger,
                                  'Could not parse job id following {} due to error {}...'
                                  .format(submit_cmd, str(ex)))
            else:
                # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                # stderr is decoded so the log shows text rather than a bytes repr
                msgs = [
                    'Error in job submission with {n} file {f} and cmd {c}'.format(
                        n=self.q_name, f=script_file, c=cmd),
                    'The error response reads: {}'.format(err.decode('utf-8', 'replace'))]
                log_fancy(queue_logger, msgs, 'error')

        except Exception:
            # random error, e.g. no qsub on machine!
            log_exception(queue_logger,
                          'Running the command: {} caused an error...'
                          .format(submit_cmd))
        return None
    def submit_to_queue(self, script_file):
        """
        Submit the job script to the queue and return the job id.

        The submit command is looked up in self.q_commands[self.q_type]. For the
        'Cobalt' queue type the script is first made executable. For
        'LoadSharingFacility' the script is fed to the submit command on stdin;
        for every other queue type the script path is passed as an argument.

        :param script_file: (str) name of the script file to use
        :return: the job id returned by self._parse_jobid() on success; None when
            the submission failed or its output could not be parsed (the failure
            is logged, not raised)
        :raises ValueError: if script_file does not exist
        """
        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(
                    script_file))

        queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))
        submit_cmd = self.q_commands[self.q_type]["submit_cmd"]
        # submit the job
        try:
            if self.q_type == "Cobalt":
                # Cobalt requires scripts to be executable
                os.chmod(script_file, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
            cmd = [submit_cmd, script_file]
            # For most of the queues handled by common_adapter, it's best to simply submit the file name
            # as an argument.  LoadSharingFacility doesn't handle the header section (queue name, nodes, etc)
            # when taking file arguments, so the file needs to be passed as stdin to make it work correctly.
            if self.q_type == 'LoadSharingFacility':
                with open(script_file, 'r') as input_file:
                    p = subprocess.Popen([submit_cmd], stdin=input_file,
                                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            else:
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # Use communicate() instead of wait() + read(): it drains stdout/stderr while
            # waiting (no deadlock on a full pipe buffer) and closes the pipes afterwards.
            out, err = p.communicate()

            # retrieve the returncode. PBS returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    job_id = self._parse_jobid(out.decode())
                    queue_logger.info(
                        'Job submission was successful and job_id is {}'.format(
                            job_id))
                    return job_id
                except Exception as ex:
                    # probably error parsing job code
                    log_exception(queue_logger,
                                  'Could not parse job id following {} due to error {}...'
                                  .format(submit_cmd, str(ex)))
            else:
                # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                # decode stderr so the log carries text instead of a bytes repr
                msgs = [
                    'Error in job submission with {n} file {f} and cmd {c}'.format(
                        n=self.q_name, f=script_file, c=cmd),
                    'The error response reads: {}'.format(err.decode('utf-8', 'replace'))]
                log_fancy(queue_logger, msgs, 'error')

        except Exception:
            # random error, e.g. no qsub on machine!
            log_exception(queue_logger,
                          'Running the command: {} caused an error...'
                          .format(submit_cmd))
        return None
Exemple #6
0
    def submit_to_queue(self, script_file):
        """
        Submit the job script to the queue and return the job id.

        Runs ``self.submit_cmd script_file`` and hands the command's stdout to
        self._parse_jobid().

        :param script_file: (str) name of the script file to use
        :return: the job id returned by self._parse_jobid() on success; None when
            the submission failed or its output could not be parsed (the failure
            is logged, not raised)
        :raises ValueError: if script_file does not exist
        """
        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(script_file))

        queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))

        # submit the job
        try:
            cmd = [self.submit_cmd, script_file]
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() followed by read()
            # can deadlock once the child fills a pipe buffer, and leaves the pipes open.
            out, err = p.communicate()

            # grab the returncode. PBS returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    job_id = self._parse_jobid(out)
                    queue_logger.info(
                        'Job submission was successful and job_id is {}'.
                        format(job_id))
                    return job_id
                except Exception:
                    # probably error parsing job code
                    log_exception(
                        queue_logger,
                        'Could not parse job id following {}...'.format(
                            self.submit_cmd))

            else:
                # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                msgs = [
                    'Error in job submission with {n} file {f} and cmd {c}'.
                    format(n=self.q_name, f=script_file, c=cmd),
                    'The error response reads: {}'.format(err)
                ]
                log_fancy(queue_logger, msgs, 'error')

        except Exception:
            # random error, e.g. no qsub on machine!
            # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
            log_exception(
                queue_logger,
                'Running the command: {} caused an error...'.format(
                    self.submit_cmd))
        return None
Exemple #7
0
    def submit_to_queue(self, script_file):
        """
        Submit the job script to the queue and return the job id.

        Runs ``self.submit_cmd script_file`` and hands the command's stdout to
        self._parse_jobid().

        :param script_file: (str) name of the script file to use
        :return: the job id returned by self._parse_jobid() on success; None when
            the submission failed or its output could not be parsed (the failure
            is logged, not raised)
        :raises ValueError: if script_file does not exist
        """
        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(
                    script_file))

        queue_logger = self.get_qlogger('qadapter.{}'.format(self.q_name))

        # submit the job
        try:
            cmd = [self.submit_cmd, script_file]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            # Use communicate() instead of wait() + read(): it drains stdout/stderr while
            # waiting (no deadlock on a full pipe buffer) and closes the pipes afterwards.
            out, err = p.communicate()

            # grab the returncode. PBS returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    job_id = self._parse_jobid(out)
                    queue_logger.info(
                        'Job submission was successful and job_id is {}'.format(
                            job_id))
                    return job_id
                except Exception:
                    # probably error parsing job code
                    log_exception(queue_logger,
                                  'Could not parse job id following {}...'.format(
                                      self.submit_cmd))

            else:
                # some qsub error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                msgs = [
                    'Error in job submission with {n} file {f} and cmd {c}'.format(
                        n=self.q_name, f=script_file, c=cmd),
                    'The error response reads: {}'.format(err)]
                log_fancy(queue_logger, msgs, 'error')

        except Exception:
            # random error, e.g. no qsub on machine!
            # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
            log_exception(queue_logger,
                          'Running the command: {} caused an error...'.format(
                              self.submit_cmd))
        return None
Exemple #8
0
    def submit_to_queue(self, queue_params, script_file):
        """
        Submit script_file with ``sbatch`` and return the job id.

        For further documentation, see parent object.

        :param queue_params: object whose logging_dir is passed to get_fw_logger()
        :param script_file: path of the script handed to sbatch
        :return: (int) the job id parsed from sbatch's stdout; None when the
            submission failed or the output could not be parsed (the failure is
            logged, not raised)
        :raises ValueError: if script_file does not exist
        """

        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(script_file))

        # initialize logger
        slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

        # submit the job
        try:
            cmd = ['sbatch', script_file]
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() followed by read()
            # can deadlock once the child fills a pipe buffer, and leaves the pipes open.
            out, err = p.communicate()

            # grab the returncode. SLURM returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    # the job id is the 4th whitespace-separated token of the output
                    # (presumably 'Submitted batch job <id>' - confirm against sbatch docs)
                    job_id = int(out.split()[3])
                    slurm_logger.info(
                        'Job submission was successful and job_id is {}'.
                        format(job_id))
                    return job_id
                except Exception:
                    # probably error parsing job code
                    log_exception(slurm_logger,
                                  'Could not parse job id following slurm...')

            else:
                # some sbatch error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                msgs = [
                    'Error in job submission with SLURM file {f} and cmd {c}'.
                    format(f=script_file, c=cmd)
                ]
                msgs.append('The error response reads: {}'.format(err))
                log_fancy(slurm_logger, 'error', msgs)

        except Exception:
            # random error, e.g. no sbatch on machine!
            # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
            log_exception(slurm_logger, 'Running slurm caused an error...')
        return None
Exemple #9
0
    def submit_to_queue(self, queue_params, script_file):
        """
        Submit script_file with ``sbatch`` and return the job id.

        For further documentation, see parent object.

        :param queue_params: object whose logging_dir is passed to get_fw_logger()
        :param script_file: path of the script handed to sbatch
        :return: (int) the job id parsed from sbatch's stdout; None when the
            submission failed or the output could not be parsed (the failure is
            logged, not raised)
        :raises ValueError: if script_file does not exist
        """

        if not os.path.exists(script_file):
            raise ValueError('Cannot find script file located at: {}'.format(script_file))

        # initialize logger
        slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

        # submit the job
        try:
            cmd = ['sbatch', script_file]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # Use communicate() instead of wait() + read(): it drains stdout/stderr while
            # waiting (no deadlock on a full pipe buffer) and closes the pipes afterwards.
            out, err = p.communicate()

            # grab the returncode. SLURM returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    # the job id is the 4th whitespace-separated token of the output
                    # (presumably 'Submitted batch job <id>' - confirm against sbatch docs)
                    job_id = int(out.split()[3])
                    slurm_logger.info('Job submission was successful and job_id is {}'.format(job_id))
                    return job_id
                except Exception:
                    # probably error parsing job code
                    log_exception(slurm_logger, 'Could not parse job id following slurm...')

            else:
                # some sbatch error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                msgs = ['Error in job submission with SLURM file {f} and cmd {c}'.format(f=script_file, c=cmd)]
                msgs.append('The error response reads: {}'.format(err))
                log_fancy(slurm_logger, 'error', msgs)

        except Exception:
            # random error, e.g. no sbatch on machine!
            # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
            log_exception(slurm_logger, 'Running slurm caused an error...')
        return None
Exemple #10
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=0,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO',
              timeout=None,
              fill_mode=False):
    """
    Submit many jobs to the queue.

    Jobs are launched one at a time into a block directory under launch_dir.
    Errors raised while launching (including "Launch unsuccessful!") are
    logged through log_exception and end the function; they are not re-raised.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
            (RAPIDFIRE_SLEEP_SECS when falsy)
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()

    def _timed_out():
        # True once a (truthy) timeout has elapsed since start_time
        return bool(timeout) and (datetime.now() - start_time).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) \
                    and (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and not _timed_out():
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker,
                                                     qadapter, block_dir,
                                                     reserve, strm_lvl, True,
                                                     fill_mode)
                if return_code is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the job just launched locally; the real queue is only
                # re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or _timed_out():
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / SystemExit stop the
        # launcher instead of being logged as a launcher error and swallowed.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #11
0
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=10, njobs_block=500,
              sleep_time=None, reserve=False, strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Each job gets its own launcher directory inside a block directory under
    launch_dir. Errors raised while launching (including "Launch unsuccessful!")
    are logged through log_exception and end the function; they are not re-raised.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
        (RAPIDFIRE_SLEEP_SECS when falsy)
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir, reserve, strm_lvl):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the job just launched locally; the real queue is only
                # re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / SystemExit stop the
        # launcher instead of being logged as a launcher error and swallowed.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #12
0
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    Writes the queue script (SUBMIT_SCRIPT_NAME) inside launcher_dir and submits
    it through a copy of qadapter. In reservation mode a Firework is reserved
    first and the reservation is cancelled again if the submission fails.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when falsy
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :return: the reservation id returned by qadapter.submit_to_queue() on success;
        False when nothing is available to run or when writing/submitting the
        script failed (the failure is logged)
    :raises ValueError: if launcher_dir does not exist, if '--offline' is used
        without reservation mode, or if reservation mode is used without a
        'singleshot' rocket launch command
    """

    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode
    oldlaunch_dir = None  # only needed in --offline mode with _launch_dir option

    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    # .get() with a default, consistent with the singleshot check below: a queue
    # adapter without 'rocket_launch' must not fail here with a KeyError
    if '--offline' in qadapter.get('rocket_launch', '') and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if launchpad.run_exists(fworker):
        try:
            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            with cd(launcher_dir):
                if reserve:
                    l_logger.debug('finding a FW to reserve...')
                    fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                    if not fw:
                        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                        return False
                    l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                    # update qadapter job_name based on FW name
                    job_name = get_slug(fw.name)[0:20]
                    qadapter.update({'job_name': job_name})

                    if '_queueadapter' in fw.spec:
                        l_logger.debug('updating queue params using Firework spec..')
                        qadapter.update(fw.spec['_queueadapter'])

                    # reservation mode includes --fw_id in rocket launch
                    qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                    if '--offline' in qadapter['rocket_launch']:
                        # handle _launch_dir parameter now b/c we can't call
                        # launchpad.change_launch_dir() later on in offline mode
                        if '_launch_dir' in fw.spec:
                            os.chdir(fw.spec['_launch_dir'])
                            oldlaunch_dir = launcher_dir
                            launcher_dir = os.path.abspath(os.getcwd())
                            launchpad.change_launch_dir(launch_id, launcher_dir)

                        setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    if reserve:
                        l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                        launchpad.cancel_reservation(launch_id)
                    # caught by the handler below: logged, then False is returned
                    raise RuntimeError('queue script could not be submitted, check queue script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit propagate
            # instead of being reported as a failed submission.
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False

        finally:
            if oldlaunch_dir:
                os.chdir(oldlaunch_dir)  # this only matters in --offline mode with _launch_dir!
    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return False
Exemple #13
0
def launch_rocket_to_queue(queue_params, launcher_dir='.', strm_lvl=None, launchpad=None, fworker=None, reserve=False):
    """
    Submit a single job to the queue.

    Changes the working directory to launcher_dir, writes the queue script
    there and submits it through queue_params.qa. Errors raised while writing
    or submitting are logged through log_exception and not re-raised. Nothing
    is returned.

    :param queue_params: A QueueParams instance
    :param launcher_dir: The directory where to submit the job
    :param strm_lvl: level passed to get_fw_logger() as stream_level
    :param launchpad: optional LaunchPad; when given, a job is only submitted if
        launchpad.run_exists() is true, and it is used for reservations
    :param fworker: FWorker used when reserving; a default FWorker() when falsy
    :param reserve: (bool) whether to reserve a FireWork before submitting
    :raises ValueError: if launcher_dir does not exist
    """

    #TODO: move the jobs_exist code here, so the singleshot() also knows if a job exists before submitting to the queue!
    fworker = fworker if fworker else FWorker()

    # convert launch_dir to absolute path
    launcher_dir = os.path.abspath(launcher_dir)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    # without a launchpad there is nothing to ask, so a job is assumed to exist
    jobs_exist = not launchpad or launchpad.run_exists()

    if jobs_exist:
        try:
            # get the queue adapter
            l_logger.debug('getting queue adapter')
            qa = queue_params.qa

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))
            os.chdir(launcher_dir)

            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
                l_logger.debug('reserved FW with fw_id: {}'.format(fw.fw_id))
                if '_queueparams' in fw.spec:
                    l_logger.debug('updating queue params using FireWork spec..')
                    # TODO: make sure this does not affect future FireWorks!!
                    queue_params.params.update(fw.spec['_queueparams'])
                # update the exe to include the FW_id
                if 'singleshot' not in queue_params.params['exe']:
                    # caught and logged by the handler below
                    raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')
                queue_params.params['exe'] += ' --fw_id {}'.format(fw.fw_id)

            # write and submit the queue script using the queue adapter
            l_logger.debug('writing queue script')
            script_name = FWConfig().SUBMIT_SCRIPT_NAME  # looked up once, used for write and submit
            with open(script_name, 'w') as f:
                queue_script = qa.get_script_str(queue_params, launcher_dir)
                if not queue_script:
                    raise RuntimeError('queue script could not be written, check job params and queue adapter!')
                f.write(queue_script)
            l_logger.info('submitting queue script')
            reservation_id = qa.submit_to_queue(queue_params, script_name)
            if not reservation_id:
                raise RuntimeError('queue script could not be submitted, check queue adapter and queue server status!')
            elif reserve:
                launchpad._set_reservation_id(launch_id, reservation_id)

        except Exception:
            # Exception (not a bare except) so that Ctrl-C / SystemExit propagate
            # instead of being reported as a failed submission.
            log_exception(l_logger, 'Error writing/submitting queue script!')
    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
Exemple #14
0
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None, nlaunches=0,
              sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    Every job gets its own ``launcher_``-prefixed directory inside a block directory that is
    created under ``launch_dir``. Errors raised while launching are logged and end the call
    instead of propagating to the caller.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: stream level handed to the logger
    :param nlaunches: number of launches after which to stop; 'infinite' to keep looping, 0 for a single round
    :param sleep_time: seconds to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, jobs are only submitted while launchpad.run_exists() is true
    :param fworker: forwarded to launch_rocket_to_queue
    :param reserve: forwarded to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """

    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    # -1 can never equal num_launched, so 'infinite' never triggers the launch-count exits below
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)
                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # refresh both loop conditions before deciding on another launch
                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            # stop after the requested number of launches, or after one round when nlaunches == 0
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the launcher
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #15
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Write a queue submission script and submit it as a single queue job.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when this is falsy
    :param qadapter: (QueueAdapterBase) only a copy of it is modified
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the id returned by the queue adapter's submit_to_queue on success; False when there
        was nothing to submit or when an error occurred (the error is logged)
    :raises ValueError: if launcher_dir does not exist, if the offline option is used without
        reservation mode, or if reservation mode is used without a singleshot rocket launch
    """

    if not fworker:
        fworker = FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    # work on a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    # both stay None outside reservation mode
    fw = None
    launch_id = None

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    offline_requested = '--offline' in qadapter['rocket_launch']
    if offline_requested and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    # nothing runnable for this worker: report and bail out early
    if not launchpad.run_exists(fworker):
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False

    try:
        if reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!'
                )
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # the queue job is named after the (slugified, truncated) FW name
            slug = get_slug(fw.name)
            qadapter.update({'job_name': slug[:QUEUE_JOBNAME_MAXLEN]})

            # per-Firework overrides of the queue adapter settings
            if '_queueadapter' in fw.spec:
                l_logger.debug(
                    'updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '_launch_dir' in fw.spec:
                # the reserved FW selects its own launch directory
                spec_dir = os.path.expandvars(fw.spec['_launch_dir'])
                if not os.path.isabs(spec_dir):
                    spec_dir = os.path.join(launcher_dir, spec_dir)
                launcher_dir = spec_dir

                # an already existing directory is fine; anything else is an error
                try:
                    os.makedirs(launcher_dir)
                except OSError as err:
                    if err.errno != errno.EEXIST:
                        raise

                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                launcher_dir = create_datestamp_dir(launcher_dir,
                                                    l_logger,
                                                    prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            launcher_dir = create_datestamp_dir(launcher_dir,
                                                l_logger,
                                                prefix='launcher_')

        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):

            # re-read rocket_launch: it may have been changed above
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as script_file:
                script_file.write(qadapter.get_script_str(launcher_dir))

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if reservation_id:
                if reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            else:
                # submission failed: release the reservation before reporting the error
                if reserve:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.
                        format(fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                raise RuntimeError(
                    'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                )
        return reservation_id

    except:
        log_exception(l_logger, 'Error writing/submitting queue script!')
        return False
Exemple #16
0
def launch_rocket_to_queue(queue_params,
                           launcher_dir='.',
                           strm_lvl=None,
                           launchpad=None,
                           fworker=None,
                           reserve=False):
    """
    Write a queue script into launcher_dir and submit it as a single queue job.

    The process changes its working directory to launcher_dir. Errors raised while writing or
    submitting the script are logged, not propagated. Nothing is returned.

    :param queue_params: A QueueParams instance; in reservation mode its ``params`` are
        updated in place
    :param launcher_dir: The directory where to submit the job
    :param strm_lvl: stream level handed to the logger
    :param launchpad: optional LaunchPad; when given, nothing is submitted unless
        launchpad.run_exists() is true
    :param fworker: FWorker used when reserving; a default FWorker() is used when falsy
    :param reserve: whether to reserve a FW from the launchpad before submitting
    :raises ValueError: if launcher_dir does not exist
    """

    #TODO: move the jobs_exist code here, so the singleshot() also knows if a job exists before submitting to the queue!
    if not fworker:
        fworker = FWorker()

    # work with an absolute path from here on
    launcher_dir = os.path.abspath(launcher_dir)

    l_logger = get_fw_logger('queue.launcher',
                             l_dir=queue_params.logging_dir,
                             stream_level=strm_lvl)

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    # without a launchpad we always submit; with one, only when it reports a run
    if launchpad and not launchpad.run_exists():
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return

    try:
        l_logger.debug('getting queue adapter')
        qa = queue_params.qa

        l_logger.info('moving to launch_dir {}'.format(launcher_dir))
        os.chdir(launcher_dir)

        if reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
            l_logger.debug('reserved FW with fw_id: {}'.format(fw.fw_id))
            if '_queueparams' in fw.spec:
                l_logger.debug(
                    'updating queue params using FireWork spec..')
                # TODO: make sure this does not affect future FireWorks!!
                # NOTE(review): this mutates the caller's queue_params.params
                queue_params.params.update(fw.spec['_queueparams'])
            # the FW id can only be appended to a singleshot launch command
            if 'singleshot' not in queue_params.params['exe']:
                raise ValueError(
                    'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
                )
            queue_params.params['exe'] += ' --fw_id {}'.format(fw.fw_id)

        # the queue adapter renders the script; an empty result is treated as an error
        l_logger.debug('writing queue script')
        with open(FWConfig().SUBMIT_SCRIPT_NAME, 'w') as script_file:
            script_text = qa.get_script_str(queue_params, launcher_dir)
            if not script_text:
                raise RuntimeError(
                    'queue script could not be written, check job params and queue adapter!'
                )
            script_file.write(script_text)

        l_logger.info('submitting queue script')
        reservation_id = qa.submit_to_queue(queue_params,
                                            FWConfig().SUBMIT_SCRIPT_NAME)
        if reservation_id:
            if reserve:
                # remember which queue job belongs to the reserved launch
                launchpad._set_reservation_id(launch_id, reservation_id)
        else:
            raise RuntimeError(
                'queue script could not be submitted, check queue adapter and queue server status!'
            )

    except:
        log_exception(l_logger, 'Error writing/submitting queue script!')
Exemple #17
0
def rapidfire(queue_params,
              launch_dir='.',
              njobs_queue=10,
              njobs_block=500,
              strm_lvl=None,
              nlaunches=0,
              sleep_time=60,
              launchpad=None,
              fworker=None,
              reserve=False):
    """
    Submit many jobs to the queue.

    Every job gets its own ``launcher_``-prefixed directory inside a block directory that is
    created under ``launch_dir``. Errors raised while launching are logged and end the call
    instead of propagating to the caller.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: stream level handed to the logger
    :param nlaunches: number of launches after which to stop; 'infinite' to keep looping,
        0 for a single round
    :param sleep_time: seconds to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, jobs are only submitted while
        launchpad.run_exists() is true
    :param fworker: forwarded to launch_rocket_to_queue
    :param reserve: forwarded to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """

    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    # -1 can never equal num_launched, so 'infinite' never triggers the launch-count exits below
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=queue_params.logging_dir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                queue_params, njobs_queue, l_logger)
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir,
                                                    l_logger,
                                                    prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl,
                                       launchpad, fworker, reserve)
                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    update_interval))
                time.sleep(update_interval)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # refresh both loop conditions before deciding on another launch
                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(
                    queue_params, njobs_queue, l_logger)

            # stop after the requested number of launches, or after one round when nlaunches == 0
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the launcher
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #18
0
def rapidfire(
    launchpad,
    fworker,
    qadapter,
    launch_dir=".",
    block_dir=None,
    nlaunches=0,
    njobs_queue=0,
    njobs_block=500,
    sleep_time=None,
    reserve=False,
    strm_lvl="INFO",
    timeout=None,
    fill_mode=False,
):
    """
    Submit many jobs to the queue.

    Errors raised once launching has started (including an invalid ``block_dir`` name) are
    logged and end the call instead of propagating to the caller.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        block_dir (str): directory to use as block dir. Can be a new or existing block. Dirname must
            start with 'block_'.
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit.
            If 0 skips the check on the number of jobs in the queue.
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == "infinite" else int(nlaunches)
    l_logger = get_fw_logger("queue.launcher", l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(f"Desired launch directory {launch_dir} does not exist!")

    num_launched = 0
    start_time = datetime.now()

    try:
        l_logger.info("getting queue adapter")

        # newest block first, assuming block names sort chronologically
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, "block_*")), reverse=True)
        if block_dir is not None:
            if not block_dir.startswith("block_"):
                raise ValueError(f"Invalid name {block_dir}, block dirs must start with 'block_'")
            block_dir = os.path.abspath(os.path.join(launch_dir, block_dir))
            # os.mkdir() has no exist_ok parameter; makedirs accepts an existing block
            os.makedirs(block_dir, exist_ok=True)
        elif prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info(f"Found previous block, using {block_dir}")
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue if a maximum has been set.
            jobs_in_queue = 0
            if njobs_queue:
                jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while launchpad.run_exists(fworker) or (fill_mode and not reserve):

                if timeout and (datetime.now() - start_time).total_seconds() >= timeout:
                    l_logger.info("Timeout reached.")
                    break

                if njobs_queue and jobs_in_queue >= njobs_queue:
                    l_logger.info(f"Jobs in queue ({jobs_in_queue}) meets/exceeds maximum allowed ({njobs_queue})")
                    break

                l_logger.info("Launching a rocket!")

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(f"Block got bigger than {njobs_block} jobs.")
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(
                    launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True, fill_mode
                )
                # None means nothing was ready to launch; any other falsy value is a failure
                if return_code is None:
                    l_logger.info("No READY jobs detected...")
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if nlaunches > 0 and num_launched == nlaunches:
                    l_logger.info(f"Launched allowed number of jobs: {num_launched}")
                    break
                # wait for the queue system to update
                l_logger.info(f"Sleeping for {QUEUE_UPDATE_INTERVAL} seconds...zzz...")
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the new job locally; the queue is only re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0 and njobs_queue:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if (
                (nlaunches > 0 and num_launched == nlaunches)
                or (timeout and (datetime.now() - start_time).total_seconds() >= timeout)
                or (nlaunches == 0 and not launchpad.future_run_exists(fworker))
            ):
                break

            l_logger.info(f"Finished a round of launches, sleeping for {sleep_time} secs")
            time.sleep(sleep_time)
            l_logger.info("Checking for Rockets to run...")

    except Exception:
        log_exception(l_logger, "Error with queue launcher rapid fire!")
Exemple #19
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    The process changes its working directory to launcher_dir. Errors raised while reserving,
    writing or submitting are logged and reported through the return value.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when this is falsy
    :param qadapter: (QueueAdapterBase) only a copy of it is modified
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :return: the id returned by the queue adapter's submit_to_queue on success; False when
        there was nothing to submit or when an error occurred
    :raises ValueError: if launcher_dir does not exist
    """

    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)
    # get the queue adapter
    l_logger.debug('getting queue adapter')
    # make a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if not launchpad.run_exists(fworker):
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False

    # Bound before the try block so the finally clause can always read it, even when
    # os.chdir() below fails; otherwise an UnboundLocalError would mask the logged error.
    oldlaunch_dir = None
    try:
        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))
        os.chdir(launcher_dir)

        if '--offline' in qadapter['rocket_launch'] and not reserve:
            raise ValueError(
                "Must use reservation mode (-r option) of qlaunch when using offline mode (--offline option) of rlaunch!!"
            )
        elif reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!'
                )
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # set the job name to the FW name, truncated to 20 characters
            job_name = get_slug(fw.name)[:20]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug(
                    'updating queue params using FireWork spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # update the exe to include the FW_id
            if 'singleshot' not in qadapter.get('rocket_launch', ''):
                raise ValueError(
                    'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
                )
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '--offline' in qadapter['rocket_launch']:
                # handle _launch_dir parameter early...
                if '_launch_dir' in fw.spec:
                    os.chdir(fw.spec['_launch_dir'])
                    # remember where we came from so the finally clause can go back
                    oldlaunch_dir = launcher_dir
                    launcher_dir = os.path.abspath(os.getcwd())
                    launchpad._change_launch_dir(launch_id, launcher_dir)

                # write FW.json
                fw.to_file("FW.json")
                # write Launchid
                with open('FW_offline.json', 'w') as f:
                    f.write('{"launch_id":%s}' % launch_id)

                launchpad.add_offline_run(launch_id, fw.fw_id, fw.name)

        # write and submit the queue script using the queue adapter
        l_logger.debug('writing queue script')
        with open(SUBMIT_SCRIPT_NAME, 'w') as f:
            queue_script = qadapter.get_script_str(launcher_dir)
            f.write(queue_script)
        l_logger.info('submitting queue script')
        reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
        if not reservation_id:
            raise RuntimeError(
                'queue script could not be submitted, check queue adapter and queue server status!'
            )
        elif reserve:
            launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit are not turned into "False"
        log_exception(l_logger, 'Error writing/submitting queue script!')
        return False

    finally:
        # this only matters in --offline mode with _launch_dir!
        if oldlaunch_dir:
            os.chdir(oldlaunch_dir)
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None,
              fill_mode=False):
    """
    Submit many jobs to the queue.

    Errors raised once launching has started are logged and end the call instead of
    propagating to the caller.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()

    def _timed_out():
        # True once a timeout was requested and that many seconds have elapsed
        return bool(timeout) and (datetime.now() - start_time).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        # newest block first, assuming block names sort chronologically
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and not _timed_out():
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir, reserve,
                                                     strm_lvl, True, fill_mode)
                # None means nothing was ready to launch; any other falsy value is a failure
                if return_code is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the new job locally; the queue is only re-queried every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or _timed_out():
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the launcher
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #21
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False,
                           fill_mode=False,
                           fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): if falsy, a default ``FWorker()`` is used
        qadapter (QueueAdapterBase): a copy is made (``load_object(qadapter.to_dict())``)
            and all updates are applied to that copy
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The (truthy) id returned by ``qadapter.submit_to_queue`` on success.
        ``False`` if no Firework could be reserved or if an error occurred while
        preparing/writing/submitting the queue script (the error is passed to
        ``log_exception``). ``None`` if ``fill_mode`` is off and
        ``launchpad.run_exists(fworker)`` is falsy (soft failure for rapidfire()).

    Raises:
        ValueError: if ``launcher_dir`` does not exist or the combination of
            ``reserve``, ``fill_mode``, ``fw_id`` and the adapter's
            ``rocket_launch`` command is not allowed.
        KeyboardInterrupt, SystemExit: re-raised after the reservation clean-up
            instead of being converted into a ``False`` return value.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    # make a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if fill_mode and reserve:
        raise ValueError(
            "Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError(
            "qlaunch for specific fireworks may only be used in reservation mode."
        )

    if not (fill_mode or launchpad.run_exists(fworker)):
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
        return None

    try:
        if reserve:
            if fw_id:
                l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker,
                                                 launcher_dir,
                                                 fw_id=fw_id)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!'
                )
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # update qadapter job_name based on FW name
            job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug(
                    'updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            # update launcher_dir if _launch_dir is selected in reserved fw
            if '_launch_dir' in fw.spec:
                fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                # relative launch dirs are resolved against launcher_dir
                if not os.path.isabs(fw_launch_dir):
                    fw_launch_dir = os.path.join(launcher_dir,
                                                 fw_launch_dir)

                launcher_dir = fw_launch_dir

                makedirs_p(launcher_dir)

                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir,
                                                    l_logger,
                                                    prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            # create launcher_dir
            launcher_dir = create_datestamp_dir(launcher_dir,
                                                l_logger,
                                                prefix='launcher_')

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):

            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            # Build the script text before opening the file so that a failure
            # in get_script_str() does not leave an empty/truncated script.
            queue_script = qadapter.get_script_str(launcher_dir)
            with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                f.write(queue_script)

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError(
                    'queue script could not be submitted, check queue '
                    'script/queue adapter/queue server status!')
            elif reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    except BaseException as exc:
        # BaseException is caught so the reservation is released even on
        # Ctrl-C / sys.exit(); such non-Exception errors are re-raised below.
        log_exception(l_logger, 'Error writing/submitting queue script!')
        if reserve and launch_id is not None:
            try:
                l_logger.info(
                    'Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                        fw.fw_id, launch_id))
                launchpad.cancel_reservation(launch_id)
                launchpad.forget_offline(launch_id)
            except Exception:
                log_exception(
                    l_logger,
                    'Error unreserving FW with fw_id {}'.format(fw.fw_id))

        if not isinstance(exc, Exception):
            # do not swallow KeyboardInterrupt / SystemExit
            raise

        return False
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False,
                           strm_lvl='INFO', create_launcher_dir=False, fill_mode=False,
                           fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): if falsy, a default ``FWorker()`` is used
        qadapter (QueueAdapterBase): a copy is made (``load_object(qadapter.to_dict())``)
            and all updates are applied to that copy
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The (truthy) id returned by ``qadapter.submit_to_queue`` on success.
        ``False`` if no Firework could be reserved or if an error occurred while
        preparing/writing/submitting the queue script (the error is passed to
        ``log_exception``). ``None`` if ``fill_mode`` is off and
        ``launchpad.run_exists(fworker)`` is falsy (soft failure for rapidfire()).

    Raises:
        ValueError: if ``launcher_dir`` does not exist or the combination of
            ``reserve``, ``fill_mode``, ``fw_id`` and the adapter's
            ``rocket_launch`` command is not allowed.
        KeyboardInterrupt, SystemExit: re-raised after the reservation clean-up
            instead of being converted into a ``False`` return value.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    if fill_mode or launchpad.run_exists(fworker):
        try:
            if reserve:
                if fw_id:
                    l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
                if not fw:
                    l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug('updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # relative launch dirs are resolved against launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    makedirs_p(launcher_dir)

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                # Generate the script text first: if get_script_str() fails, no
                # empty/truncated submit script is left behind.
                queue_script = qadapter.get_script_str(launcher_dir)
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    raise RuntimeError('queue script could not be submitted, check queue '
                                       'script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except BaseException as exc:
            # Catch BaseException so a reserved Firework is released even on
            # Ctrl-C / sys.exit(); those are re-raised once clean-up is done.
            log_exception(l_logger, 'Error writing/submitting queue script!')
            if reserve and launch_id is not None:
                try:
                    l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                        fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                    launchpad.forget_offline(launch_id)
                except Exception:
                    log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))

            if not isinstance(exc, Exception):
                # do not swallow KeyboardInterrupt / SystemExit
                raise

            return False

    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
Exemple #23
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=10,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Jobs are submitted one at a time through launch_rocket_to_queue() into
    "block" directories below launch_dir. Errors raised inside the launch loop
    are handed to log_exception() rather than propagated directly;
    KeyboardInterrupt/SystemExit are not intercepted.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations;
        falls back to RAPIDFIRE_SLEEP_SECS when falsy
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    # -1 can never equal num_launched, so "infinite" keeps looping
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the last block (in reverse-sorted name order) unless a fresh
        # one is always requested
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(
                    fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker,
                                                     qadapter, block_dir,
                                                     reserve, strm_lvl, True)
                if return_code is None:
                    # soft failure: launch_rocket_to_queue() found nothing to
                    # run (e.g. the job was taken since the run_exists() check)
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # estimate the queue size locally and only re-query the queue
                # every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / sys.exit() can stop
        # an "infinite" rapidfire loop instead of being logged and swallowed.
        log_exception(l_logger, 'Error with queue launcher rapid fire!')