Exemple #1
1
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is reset to the base directory (m_dir, or the directory that was
    current on entry) when the function exits, including when a launch raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations; a falsy value
            falls back to RAPIDFIRE_SLEEP_SECS
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_remains():
        # a falsy timeout means "no time limit"
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and time_remains():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and time_remains():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we
                    # have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # never leave the process stranded inside a launcher directory, even on error
        os.chdir(curdir)
Exemple #2
0
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              nlaunches=0,
              max_loops=-1,
              sleep_time=None,
              strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    On exit (normal or via an exception) the working directory is reset to the base directory,
    i.e. m_dir if given, otherwise the directory that was current on entry.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value is replaced by a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations; a falsy value falls back to
        RAPIDFIRE_SLEEP_SECS
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            while launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir,
                                                    l_logger,
                                                    prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                # add a small amount of buffer breathing time for DB to refresh, etc.
                time.sleep(0.15)
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # do not leave the caller's process sitting inside the last launcher directory
        os.chdir(curdir)
Exemple #3
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1,
              sleep_time=None, strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is always reset to the base directory (m_dir, or the directory current on entry)
    before returning or propagating an exception.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value is replaced by a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def within_timeout():
        # no timeout configured -> never expires
        if not timeout:
            return True
        return (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and within_timeout():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and within_timeout():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # restore the base directory even if a launch raised
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is reset to the base directory (m_dir, or the directory current on entry) on every
    exit path, including when a launch raises.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a falsy value is replaced by a default FWorker()
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            # skip_check lets us reuse the run_exists() answer obtained at the end of the previous launch
            skip_check = False
            while skip_check or launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # restore the base directory even if a launch raised
        os.chdir(curdir)
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              logdir=None,
              strm_lvl=None,
              nlaunches=0,
              sleep_time=60,
              max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is always changed back to the base directory (m_dir, or the directory current on
    entry) when the function exits, whether normally or through an exception.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object; a falsy value is replaced by a default FWorker()
    :param m_dir: the directory in which to loop Rocket running
    :param logdir: log directory handed to the logger and to each launched Rocket
    :param strm_lvl: stream log level handed to the logger and to each launched Rocket
    :param nlaunches: 0 means 'until completion', -1 means 'infinity'
    :param sleep_time: secs to sleep between rapidfire loop iterations
    :param max_loops: maximum number of sleep/check loops (the default -1 never matches, i.e. no limit)
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()
    # initialize logger
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=logdir,
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    num_launched = 0
    num_loops = 0
    try:
        while num_loops != max_loops:
            while launchpad.run_exists():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir,
                                                    l_logger,
                                                    prefix='launcher_')
                os.chdir(launcher_dir)
                launch_rocket(launchpad, fworker, logdir, strm_lvl)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # add a small amount of buffer breathing time for DB to refresh, etc.
                time.sleep(0.1)
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            l_logger.info('Checking for FWs to run...')
    finally:
        # always return to the base directory so callers are not left inside a launcher dir
        os.chdir(curdir)
def rapidfire(launchpad, fworker=None, m_dir=None, logdir=None, strm_lvl=None, nlaunches=0, sleep_time=60, max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    Whatever happens inside the loop, the working directory is changed back to the base directory
    (m_dir, or the directory current on entry) before the function exits.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object; a falsy value is replaced by a default FWorker()
    :param m_dir: the directory in which to loop Rocket running
    :param logdir: log directory handed to the logger and to each launched Rocket
    :param strm_lvl: stream log level handed to the logger and to each launched Rocket
    :param nlaunches: 0 means 'until completion', -1 means 'infinity'
    :param sleep_time: secs to sleep between rapidfire loop iterations
    :param max_loops: maximum number of sleep/check loops (the default -1 never matches, i.e. no limit)
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()
    # initialize logger
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    num_launched = 0
    num_loops = 0
    try:
        while num_loops != max_loops:
            while launchpad.run_exists():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                launch_rocket(launchpad, fworker, logdir, strm_lvl)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                time.sleep(0.1)  # add a small amount of buffer breathing time for DB to refresh, etc.
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            l_logger.info('Checking for FWs to run...')
    finally:
        # always return to the base directory so callers are not left inside a launcher dir
        os.chdir(curdir)
def rapidfire(queue_params, launch_dir='.', njobs_queue=10, njobs_block=500, strm_lvl=None, nlaunches=0, sleep_time=60, launchpad=None, fworker=None, reserve=False):
    """
    Submit many jobs to the queue.

    Errors raised while launching are logged through log_exception and not re-raised; only
    KeyboardInterrupt/SystemExit-style exits propagate to the caller.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: level at which to stream log messages
    :param nlaunches: total number of launches desired; "infinite" (or -1) to keep looping, 0 for a single round
    :param sleep_time: secs to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, launching stops once it reports nothing to run
    :param fworker: passed through to launch_rocket_to_queue
    :param reserve: passed through to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """

    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher', l_dir=queue_params.logging_dir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)
            # without a LaunchPad we cannot check for work, so assume there is always something to run
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl, launchpad, fworker, reserve)
                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info('Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(queue_params, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # deliberately not a bare except: Ctrl-C / SystemExit must be able to stop an endless loop
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #8
0
                        '--push-wf',
                        dest='push_wf',
                        action='store_true',
                        help='Push demo workflow to DB.')
    parser.add_argument('-b',
                        '--block',
                        action='store_true',
                        help='Create a new output block.')
    parser.add_argument('-u',
                        '--update-only',
                        action='store_true',
                        help='Update the DB, but do not process jobs.')
    parser.add_argument('-d',
                        '--daemon-mode',
                        action='store_true',
                        help='Update the DB, but do not process jobs.')
    args = parser.parse_args()

    if args.reset:
        launchpad.reset('', require_password=False)

    if args.push_wf:
        launchpad.add_wf(workflow)

    if args.block:
        from fireworks.utilities.fw_utilities import create_datestamp_dir
        create_datestamp_dir(out_dir, launchpad.m_logger)

    if not args.update_only:
        process_offline(launchpad, launcher_args, args.daemon_mode)
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False, strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Submit a single job to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a falsy value is replaced by a default FWorker()
    :param qadapter: (QueueAdapterBase) copied before use, so the caller's object is not modified
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the reservation id returned by the queue adapter on success; False when there is nothing to
        run or when writing/submitting the queue script failed (the error is logged, not raised)
    :raises ValueError: if launcher_dir does not exist, if the offline option is used without reservation
        mode, or if reservation mode is used with a non-singleshot rocket launcher
    """

    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict())  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if launchpad.run_exists(fworker):
        try:
            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug('updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # a relative _launch_dir is interpreted relative to the requested launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir, fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    try:
                        os.makedirs(launcher_dir)
                    except OSError as exception:
                        # an already-existing directory is fine; anything else is a real error
                        if exception.errno != errno.EEXIST:
                            raise

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    if reserve:
                        l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                        launchpad.cancel_reservation(launch_id)
                    # raised inside the try on purpose: it is logged by the handler below and becomes False
                    raise RuntimeError('queue script could not be submitted, check queue script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except Exception:
            # not a bare except: KeyboardInterrupt/SystemExit must not be swallowed and turned into False
            log_exception(l_logger, 'Error writing/submitting queue script!')
            return False

    else:
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        return False
Exemple #10
0
def rapidfire(queue_params,
              launch_dir='.',
              njobs_queue=10,
              njobs_block=500,
              strm_lvl=None,
              nlaunches=0,
              sleep_time=60,
              launchpad=None,
              fworker=None,
              reserve=False):
    """
    Submit many jobs to the queue.

    Errors raised while launching are logged through log_exception and not re-raised; only
    KeyboardInterrupt/SystemExit-style exits propagate to the caller.

    :param queue_params: A QueueParams instance
    :param launch_dir: directory where we want to write the blocks
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param strm_lvl: level at which to stream log messages
    :param nlaunches: total number of launches desired; "infinite" (or -1) to keep looping, 0 for a single round
    :param sleep_time: secs to sleep between rounds of launches
    :param launchpad: optional LaunchPad; when given, launching stops once it reports nothing to run
    :param fworker: passed through to launch_rocket_to_queue
    :param reserve: passed through to launch_rocket_to_queue
    :raises ValueError: if launch_dir does not exist
    """

    # convert launch_dir to absolute path
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # initialize logger
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=queue_params.logging_dir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                queue_params, njobs_queue, l_logger)
            # without a LaunchPad we cannot check for work, so assume there is always something to run
            jobs_exist = not launchpad or launchpad.run_exists()

            while jobs_in_queue < njobs_queue and jobs_exist:
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir,
                                                    l_logger,
                                                    prefix='launcher_')
                # launch a single job
                launch_rocket_to_queue(queue_params, launcher_dir, strm_lvl,
                                       launchpad, fworker, reserve)
                # wait for the queue system to update
                update_interval = FWConfig().QUEUE_UPDATE_INTERVAL
                l_logger.info(
                    'Sleeping for {} seconds...zzz...'.format(update_interval))
                time.sleep(update_interval)
                num_launched += 1
                if num_launched == nlaunches:
                    break
                jobs_exist = not launchpad or launchpad.run_exists()
                jobs_in_queue = _get_number_of_jobs_in_queue(
                    queue_params, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # deliberately not a bare except: Ctrl-C / SystemExit must be able to stop an endless loop
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=0,
              njobs_block=500, sleep_time=None, reserve=False, strm_lvl='INFO', timeout=None,
              fill_mode=False):
    """
    Submit many jobs to the queue.

    Errors raised while launching (including an unsuccessful launch) are logged through
    log_exception and not re-raised; KeyboardInterrupt/SystemExit propagate to the caller.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations; a falsy value falls
            back to RAPIDFIRE_SLEEP_SECS
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()

    def timed_out():
        # a falsy timeout means "no time limit"
        return timeout and (datetime.now() - start_time).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block_* directory unless configured to always start a new one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and not timed_out():
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir, reserve,
                                                     strm_lvl, True, fill_mode)
                if return_code is None:
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count the job locally and only re-query the queue every QSTAT_FREQUENCY launches
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or timed_out():
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # deliberately not a bare except: Ctrl-C / SystemExit must be able to stop an endless loop
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #12
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False,
                           fill_mode=False,
                           fw_id=None):
    """
    Write a queue script into the launch directory and submit it as one job.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): a default FWorker() is used when this is falsy
        qadapter (QueueAdapterBase): copied before use, so the caller's
            object is not modified
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The value returned by ``qadapter.submit_to_queue`` on success.
        False when no FW could be reserved, or when any error was caught
        while preparing/submitting the script (the error is logged).
        None when fill_mode is off and the LaunchPad has nothing to run.

    Raises:
        ValueError: if launcher_dir does not exist or the combination of
            options is not allowed (see the checks below).
    """
    fworker = fworker or FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    # work on a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    # these two are only populated in reservation mode
    fw = None
    launch_id = None

    # ---- argument validation (order matters: the first failing check raises) ----
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    # ---- nothing to run: soft failure ----
    if not (fill_mode or launchpad.run_exists(fworker)):
        l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
        # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
        return None

    try:
        if reserve:
            if fw_id:
                l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
            if not fw:
                l_logger.info('No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # the queue job is named after the FW, truncated to the allowed length
            qadapter.update({'job_name': get_slug(fw.name)[:QUEUE_JOBNAME_MAXLEN]})

            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # reservation mode includes --fw_id in rocket launch
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '_launch_dir' in fw.spec:
                # the reserved FW selects its own directory; relative paths are
                # resolved against the launcher_dir passed in
                requested_dir = os.path.expandvars(fw.spec['_launch_dir'])
                if os.path.isabs(requested_dir):
                    launcher_dir = requested_dir
                else:
                    launcher_dir = os.path.join(launcher_dir, requested_dir)
                makedirs_p(launcher_dir)
                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)

        elif create_launcher_dir:
            launcher_dir = create_datestamp_dir(launcher_dir, l_logger, prefix='launcher_')

        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as script_file:
                script_file.write(qadapter.get_script_str(launcher_dir))

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not reservation_id:
                raise RuntimeError(
                    'queue script could not be submitted, check queue script/queue adapter/queue server status!')
            if reserve:
                launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    except:
        log_exception(l_logger, 'Error writing/submitting queue script!')
        # a FW reserved above must be released again, otherwise it stays reserved
        if reserve and launch_id is not None:
            try:
                l_logger.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(fw.fw_id, launch_id))
                launchpad.cancel_reservation(launch_id)
                launchpad.forget_offline(launch_id)
            except:
                log_exception(l_logger, 'Error unreserving FW with fw_id {}'.format(fw.fw_id))

        return False
Exemple #13
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=0,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO',
              timeout=None,
              fill_mode=False):
    """
    Submit many jobs to the queue.

    Jobs are written into "block" directories under launch_dir. Errors raised
    while launching are logged through log_exception and not re-raised;
    KeyboardInterrupt and SystemExit are allowed to propagate.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist.
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    start_time = datetime.now()

    def timed_out():
        # a falsy timeout (None or 0) disables the time limit
        return bool(timeout) and \
            (datetime.now() - start_time).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block unless configured to always start a new one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while (not njobs_queue or jobs_in_queue < njobs_queue) and \
                    (launchpad.run_exists(fworker) or (fill_mode and not reserve)) \
                    and not timed_out():
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(launchpad, fworker,
                                                     qadapter, block_dir,
                                                     reserve, strm_lvl, True,
                                                     fill_mode)
                if return_code is None:
                    # soft failure: nothing was ready, end this round
                    l_logger.info('No READY jobs detected...')
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count locally, and only re-query the queue every
                # QSTAT_FREQUENCY submissions
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0 or timed_out():
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / SystemExit still
        # stop the process instead of being logged as a launcher error
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #14
0
#==============================================================================
if __name__ == '__main__':
    # Command-line entry point: each flag enables one optional step, and the
    # steps run in the order reset -> push workflow -> new block -> process.
    import argparse
    parser = argparse.ArgumentParser(
            description='Demonstrates Fireworks workflows on OLCF resources.')
    parser.add_argument('--reset', action='store_true',
            help='Reset the database.')
    parser.add_argument('-p', '--push-wf', dest='push_wf', action='store_true',
            help='Push demo workflow to DB.')
    parser.add_argument('-b', '--block', action='store_true',
            help='Create a new output block.')
    parser.add_argument('-u', '--update-only', action='store_true',
            help='Update the DB, but do not process jobs.')
    # this flag is forwarded to process_offline, so it only has an effect
    # when jobs are processed (i.e. without --update-only)
    parser.add_argument('-d', '--daemon-mode', action='store_true',
            help='Run job processing in daemon mode.')
    args = parser.parse_args()

    if args.reset:
        launchpad.reset('', require_password=False)

    if args.push_wf:
        launchpad.add_wf(workflow)

    if args.block:
        from fireworks.utilities.fw_utilities import create_datestamp_dir
        create_datestamp_dir(out_dir, launchpad.m_logger)

    if not args.update_only:
        process_offline(launchpad, launcher_args, args.daemon_mode)

Exemple #15
0
def rapidfire(
    launchpad,
    fworker,
    qadapter,
    launch_dir=".",
    block_dir=None,
    nlaunches=0,
    njobs_queue=0,
    njobs_block=500,
    sleep_time=None,
    reserve=False,
    strm_lvl="INFO",
    timeout=None,
    fill_mode=False,
):
    """
    Submit many jobs to the queue.

    Errors raised once the launch directory has been validated (including an
    invalid block_dir name) are logged through log_exception and not re-raised.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory where we want to write the blocks
        block_dir (str): directory to use as block dir. Can be a new or existing block. Dirname must
            start with 'block_'.
        nlaunches (int): total number of launches desired; "infinite" for loop, 0 for one round
        njobs_queue (int): stops submitting jobs when njobs_queue jobs are in the queue, 0 for no limit.
            If 0 skips the check on the number of jobs in the queue.
        njobs_block (int): automatically write a new block when njobs_block jobs are in a single block
        sleep_time (int): secs to sleep between rapidfire loop iterations
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): # of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)

    Raises:
        ValueError: if launch_dir does not exist.
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == "infinite" else int(nlaunches)
    l_logger = get_fw_logger("queue.launcher", l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(f"Desired launch directory {launch_dir} does not exist!")

    num_launched = 0
    start_time = datetime.now()

    try:
        l_logger.info("getting queue adapter")

        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, "block_*")), reverse=True)
        if block_dir is not None:
            # an explicitly requested block takes precedence over previous blocks
            if not block_dir.startswith("block_"):
                raise ValueError(f"Invalid name {block_dir}, block dirs must start with 'block_'")
            block_dir = os.path.abspath(os.path.join(launch_dir, block_dir))
            # os.mkdir() has no exist_ok parameter; makedirs() accepts both a
            # new and an already existing block directory
            os.makedirs(block_dir, exist_ok=True)
        elif prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info(f"Found previous block, using {block_dir}")
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue if a maximum has been set.
            jobs_in_queue = 0
            if njobs_queue:
                jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while launchpad.run_exists(fworker) or (fill_mode and not reserve):

                if timeout and (datetime.now() - start_time).total_seconds() >= timeout:
                    l_logger.info("Timeout reached.")
                    break

                if njobs_queue and jobs_in_queue >= njobs_queue:
                    l_logger.info(f"Jobs in queue ({jobs_in_queue}) meets/exceeds maximum allowed ({njobs_queue})")
                    break

                l_logger.info("Launching a rocket!")

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(f"Block got bigger than {njobs_block} jobs.")
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                return_code = launch_rocket_to_queue(
                    launchpad, fworker, qadapter, block_dir, reserve, strm_lvl, True, fill_mode
                )
                if return_code is None:
                    # soft failure: nothing was ready, end this round
                    l_logger.info("No READY jobs detected...")
                    break
                elif not return_code:
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if nlaunches > 0 and num_launched == nlaunches:
                    l_logger.info(f"Launched allowed number of jobs: {num_launched}")
                    break
                # wait for the queue system to update
                l_logger.info(f"Sleeping for {QUEUE_UPDATE_INTERVAL} seconds...zzz...")
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count locally, and only re-query the queue every
                # QSTAT_FREQUENCY submissions (and only if a limit is set)
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0 and njobs_queue:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            # stop once the launch quota or the timeout is reached, or, for a
            # single round (nlaunches == 0), once future_run_exists is false
            if (
                (nlaunches > 0 and num_launched == nlaunches)
                or (timeout and (datetime.now() - start_time).total_seconds() >= timeout)
                or (nlaunches == 0 and not launchpad.future_run_exists(fworker))
            ):
                break

            l_logger.info(f"Finished a round of launches, sleeping for {sleep_time} secs")
            time.sleep(sleep_time)
            l_logger.info("Checking for Rockets to run...")

    except Exception:
        log_exception(l_logger, "Error with queue launcher rapid fire!")
Exemple #16
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False, pdb_on_exception=False):
    """
    Run Rockets one after another, each in its own freshly created
    'launcher_' subdirectory of m_dir, and return to m_dir when done.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
            (the current working directory when not given)
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
        pdb_on_exception (bool): passed through unchanged to launch_rocket
    """

    pause_secs = sleep_time or RAPIDFIRE_SLEEP_SECS
    base_dir = m_dir or os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    if nlaunches == 'infinite':
        nlaunches = -1
    else:
        nlaunches = int(nlaunches)
    fworker = get_fworker(fworker)

    launched = 0
    began = datetime.now()
    loops_done = 0

    def within_time_limit():
        # no timeout configured means the run never times out
        if timeout is None:
            return True
        elapsed = (datetime.now() - began).total_seconds()
        return elapsed < timeout

    def fire_one():
        # single launch, shared by the redirected and the plain code path
        return launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                             pdb_on_exception=pdb_on_exception)

    while loops_done != max_loops and within_time_limit():
        # when True, the DB check in the inner loop condition is skipped
        # because the previous iteration already saw a runnable FW
        pull_immediately = False
        while (pull_immediately or launchpad.run_exists(fworker)) and within_time_limit():
            os.chdir(base_dir)
            launcher_dir = create_datestamp_dir(base_dir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = fire_one()
            else:
                rocket_ran = fire_one()

            if rocket_ran:
                launched += 1
            elif not os.listdir(launcher_dir):
                # nothing ran and nothing was written: drop the empty directory
                os.chdir(base_dir)
                os.rmdir(launcher_dir)

            if nlaunches > 0 and launched == nlaunches:
                break

            pull_immediately = bool(launchpad.run_exists(fworker))
            if not pull_immediately:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)

        if nlaunches == 0:
            finished = not launchpad.future_run_exists(fworker)
        else:
            finished = launched == nlaunches
        if finished:
            break

        log_multi(l_logger, 'Sleeping for {} secs'.format(pause_secs))
        time.sleep(pause_secs)
        loops_done += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(base_dir)
Exemple #17
0
def rapidfire(launchpad, fworker, qadapter, launch_dir='.', nlaunches=0, njobs_queue=10, njobs_block=500,
              sleep_time=None, reserve=False, strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Each job gets its own 'launcher_' directory inside the current block
    directory. Errors raised while launching are logged through log_exception
    and not re-raised; KeyboardInterrupt and SystemExit are allowed to
    propagate.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block unless configured to always start a new one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')), reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # create launcher_dir
                launcher_dir = create_datestamp_dir(block_dir, l_logger, prefix='launcher_')
                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir, reserve, strm_lvl):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count locally, and only re-query the queue every
                # QSTAT_FREQUENCY submissions
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / SystemExit still
        # stop the process instead of being logged as a launcher error
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #18
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=10,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Jobs are submitted into the current block directory; launch_rocket_to_queue
    is asked to create the per-job launcher directory. Errors raised while
    launching are logged through log_exception and not re-raised;
    KeyboardInterrupt and SystemExit are allowed to propagate.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block unless configured to always start a new one
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(
                    fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter,
                                              block_dir, reserve, strm_lvl,
                                              True):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count locally, and only re-query the queue every
                # QSTAT_FREQUENCY submissions
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception (not a bare except) so that Ctrl-C / SystemExit still
        # stop the process instead of being logged as a launcher error
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
def launch_rocket_to_queue(launchpad, fworker, qadapter, launcher_dir='.', reserve=False,
                           strm_lvl='INFO', create_launcher_dir=False, fill_mode=False,
                           fw_id=None):
    """
    Write a queue script for one job and submit it through the queue adapter.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): a default FWorker() is used when this is falsy
        qadapter (QueueAdapterBase): copied before use, the caller's object is not modified
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The reservation id returned by qadapter.submit_to_queue() on success.
        False if no FW could be reserved or if anything failed while preparing, writing or
        submitting the queue script (the failure is logged, not raised).
        None if fill_mode is off and launchpad.run_exists(fworker) reports nothing to run.

    Raises:
        ValueError: if launcher_dir does not exist, or if the combination of reserve,
            fill_mode, fw_id and the adapter's rocket_launch command is not allowed.
    """
    if not fworker:
        fworker = FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    log = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    log.debug('getting queue adapter')
    # Work on a private copy: reservation mode edits the adapter below.
    qadapter = load_object(qadapter.to_dict())

    # These two are only filled in when reservation mode is active.
    reserved_fw = None
    reserved_launch_id = None

    # ---- argument validation (order matters: the first violated rule wins) ----
    if not os.path.exists(launcher_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError('Reservation mode of queue launcher only works for singleshot Rocket Launcher!')

    if fill_mode and reserve:
        raise ValueError("Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError("qlaunch for specific fireworks may only be used in reservation mode.")

    # ---- nothing to run and not asked to fill the queue anyway ----
    if not fill_mode and not launchpad.run_exists(fworker):
        log.info('No jobs exist in the LaunchPad for submission to queue!')
        # None rather than False: the original code notes this marks a soft failure for rapidfire().
        return None

    try:
        if reserve:
            if fw_id:
                log.debug('finding a FW to reserve...')
            reserved_fw, reserved_launch_id = launchpad.reserve_fw(fworker, launcher_dir, fw_id=fw_id)
            if not reserved_fw:
                log.info('No jobs exist in the LaunchPad for submission to queue!')
                return False
            log.info('reserved FW with fw_id: {}'.format(reserved_fw.fw_id))

            # Name the queue job after the FW, truncated to the allowed length.
            qadapter.update({'job_name': get_slug(reserved_fw.name)[:QUEUE_JOBNAME_MAXLEN]})

            # Per-FW queue settings override the adapter's own.
            if '_queueadapter' in reserved_fw.spec:
                log.debug('updating queue params using Firework spec..')
                qadapter.update(reserved_fw.spec['_queueadapter'])

            # Pin the submitted rocket to the FW that was just reserved.
            qadapter['rocket_launch'] += ' --fw_id {}'.format(reserved_fw.fw_id)

            if '_launch_dir' in reserved_fw.spec:
                # The FW dictates its own directory; relative paths hang off launcher_dir.
                requested_dir = os.path.expandvars(reserved_fw.spec['_launch_dir'])
                launcher_dir = (requested_dir if os.path.isabs(requested_dir)
                                else os.path.join(launcher_dir, requested_dir))
                makedirs_p(launcher_dir)
                launchpad.change_launch_dir(reserved_launch_id, launcher_dir)
            elif create_launcher_dir:
                launcher_dir = create_datestamp_dir(launcher_dir, log, prefix='launcher_')
                launchpad.change_launch_dir(reserved_launch_id, launcher_dir)
        elif create_launcher_dir:
            launcher_dir = create_datestamp_dir(launcher_dir, log, prefix='launcher_')

        log.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, reserved_fw, reserved_launch_id)

            log.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as script_file:
                script_file.write(qadapter.get_script_str(launcher_dir))

            log.info('submitting queue script')
            queue_job_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
            if not queue_job_id:
                raise RuntimeError('queue script could not be submitted, check queue '
                                   'script/queue adapter/queue server status!')
            if reserve:
                launchpad.set_reservation_id(reserved_launch_id, queue_job_id)
        return queue_job_id

    except:
        # Catch-all: every failure is logged and reported as False instead of propagating.
        log_exception(log, 'Error writing/submitting queue script!')
        if not reserve or reserved_launch_id is None:
            return False

        # A FW was reserved before the failure: release it again.
        try:
            log.info('Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                reserved_fw.fw_id, reserved_launch_id))
            launchpad.cancel_reservation(reserved_launch_id)
            launchpad.forget_offline(reserved_launch_id)
        except:
            log_exception(log, 'Error unreserving FW with fw_id {}'.format(reserved_fw.fw_id))
        return False
Exemple #20
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Submit a single job to the queue.

    In reservation mode a FW is reserved first and the queue script is tailored to it. If
    anything fails before the queue accepts the job, that reservation is cancelled again so
    the FW does not stay reserved without a queue job behind it.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when this is falsy
    :param qadapter: (QueueAdapterBase) copied before use, the caller's object is not modified
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the reservation id returned by qadapter.submit_to_queue() on success; False if
        launchpad.run_exists(fworker) is false, if no FW could be reserved, or if preparing,
        writing or submitting the queue script failed (the failure is logged, not raised)
    :raises ValueError: if launcher_dir does not exist, if an offline rocket_launch is used
        without reserve, or if reserve is used with a rocket_launch that is not singleshot
    """

    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict(
    ))  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if launchpad.run_exists(fworker):
        # Stays falsy until the queue has accepted the job; the error handler uses this to
        # tell "nothing was submitted" apart from "submitted, but a later step failed".
        reservation_id = None
        try:
            if reserve:
                l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir,
                                                     fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    # an already existing directory is fine, anything else is an error
                    try:
                        os.makedirs(launcher_dir)
                    except OSError as exception:
                        if exception.errno != errno.EEXIST:
                            raise

                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir,
                                                        l_logger,
                                                        prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir,
                                                    l_logger,
                                                    prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    # the handler below takes care of releasing the reservation
                    raise RuntimeError(
                        'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                    )
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        except:
            # Deliberately broad, as before: every failure is logged and reported as False.
            log_exception(l_logger, 'Error writing/submitting queue script!')

            # Release the reservation whenever a FW was reserved but no job reached the
            # queue, whichever step failed. A job the queue already accepted was built with
            # this fw_id, so its reservation is left in place (unchanged from before).
            if reserve and launch_id is not None and not reservation_id:
                try:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.
                        format(fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                except:
                    # cleanup is best effort; the function still has to return False
                    log_exception(
                        l_logger,
                        'Error unreserving FW with fw_id {}'.format(fw.fw_id))

            return False

    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False