Exemple #1
1
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed while Rockets run and is always reset to the
    base directory (m_dir, or the directory that was current on entry) before returning,
    including when an exception propagates out of the loop.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def within_timeout():
        # a falsy timeout (None or 0) disables the time limit entirely
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and within_timeout():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and within_timeout():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # always return to the base directory, even when a Rocket raises
        os.chdir(curdir)
Exemple #2
0
 def __init__(self, launchpad, opt_label):
     """
     Bind this object to a launchpad and to the attribute of its db named by opt_label.

     Args:
         launchpad: object exposing a ``db`` attribute
         opt_label (str): name of the attribute looked up on ``launchpad.db``
             (presumably a collection name; confirm against callers)
     """
     self.logger = get_fw_logger("rocketsled")
     self.config = None
     self.launchpad, self.opt_label = launchpad, opt_label
     # resolve the target attribute once, so later accesses go through self.c
     target = getattr(launchpad.db, opt_label)
     self.c = target
     self.is_configured = False
Exemple #3
0
 def __init__(self, launchpad, opt_label):
     """
     Store the launchpad and label, and look up ``launchpad.db.<opt_label>``.

     Args:
         launchpad: object exposing a ``db`` attribute
         opt_label (str): attribute name fetched from ``launchpad.db`` and kept as ``self.c``
             (presumably a collection name; confirm against callers)
     """
     self.logger = get_fw_logger("rocketsled")
     self.config = None
     self.launchpad = launchpad
     self.opt_label = opt_label
     database = launchpad.db
     self.c = getattr(database, opt_label)
     # starts unconfigured; nothing in this constructor sets up self.config
     self.is_configured = False
Exemple #4
0
    def __init__(self, host='localhost', port=27017, name='fireworks', username=None, password=None,
                 logdir=None, strm_lvl=None):
        """
        Store the connection settings, create the 'launchpad' logger and open the database.

        :param host: host passed to MongoClient
        :param port: port passed to MongoClient
        :param name: name of the database selected on the client
        :param username: when truthy, authenticate against the database with this user
        :param password: password used together with username
        :param logdir: log directory handed to the logger
        :param strm_lvl: logger stream level; 'INFO' is used when this is falsy
        """
        self.host, self.port, self.name = host, port, name
        self.username, self.password = username, password

        # set up logger
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.m_logger = get_fw_logger('launchpad', l_dir=logdir, stream_level=self.strm_lvl)

        # open the client and select the database
        client = MongoClient(host, port, j=True)
        self.connection = client
        db = client[name]
        self.database = db
        if username:
            db.authenticate(username, password)

        # handles to the collections used by this object
        self.fireworks = db.fireworks
        self.launches = db.launches
        self.fw_id_assigner = db.fw_id_assigner
        self.links = db.links
Exemple #5
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When nlaunches is 0, rapidfire is started again for as long as
    FWData().Running_IDs still holds a non-None value, i.e. some other sub job
    is still running.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData().Running_IDs; its non-None
            values are treated as ids of launches that are still running
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)

    def fire():
        # single definition of the rapidfire call used for the first run and every resubmit
        rapidfire(launchpad,
                  fworker=fworker,
                  m_dir=None,
                  nlaunches=nlaunches,
                  max_loops=-1,
                  sleep_time=sleep,
                  strm_lvl=loglvl,
                  timeout=timeout)

    fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        live_ids = set(FWData().Running_IDs.values()) - {None}
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(
            l_logger,
            'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        fire()
    log_multi(l_logger, 'Sub job finished')
Exemple #6
0
    def __init__(self, host='localhost', port=27017, name='fireworks',
                 username=None, password=None, logdir=None, strm_lvl=None,
                 user_indices=None, wf_user_indices=None):
        """
        Store the connection settings, create the 'launchpad' logger and open the client.

        :param host: host passed to MongoClient
        :param port: port passed to MongoClient
        :param name: stored as self.name (presumably the database name; not used further here)
        :param username: stored as self.username
        :param password: stored as self.password
        :param logdir: log directory handed to the logger
        :param strm_lvl: logger stream level; 'INFO' is used when this is falsy
        :param user_indices: stored as self.user_indices; an empty list is used when falsy
        :param wf_user_indices: stored as self.wf_user_indices; an empty list is used when falsy
        """
        self.host, self.port, self.name = host, port, name
        self.username, self.password = username, password

        # set up logger
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.m_logger = get_fw_logger('launchpad', l_dir=logdir,
                                      stream_level=self.strm_lvl)

        # a fresh list is created per instance when no indices are supplied
        self.user_indices = user_indices or []
        self.wf_user_indices = wf_user_indices or []

        # get connection
        self.client = MongoClient(host, port, j=True)
Exemple #7
0
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO',
                  pdb_on_exception=False):
    """
    Build one Rocket and run it in the current directory.

    Args:
        launchpad (LaunchPad): may be falsy, in which case no log directory is used
        fworker (FWorker): resolved through get_fworker before use
        fw_id (int): if set, a particular Firework to run
        strm_lvl (str): level at which to output logs to stdout
        pdb_on_exception (bool): forwarded to Rocket.run; if True, python will
            start the debugger on a firework exception

    Returns:
        bool: the value returned by Rocket.run
    """
    fworker = get_fworker(fworker)

    if launchpad:
        log_dir = launchpad.get_logdir()
    else:
        log_dir = None
    logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    ran = Rocket(launchpad, fworker, fw_id).run(pdb_on_exception=pdb_on_exception)
    log_multi(logger, 'Rocket finished')
    return ran
Exemple #8
0
def rlaunch():
    """
    Command-line entry point: parse the arguments, build the LaunchPad and FWorker,
    then run either a single Rocket ('singleshot') or the rapidfire loop ('rapidfire').
    """
    description = (
        'This program launches one or more Rockets. A Rocket grabs a job from the central database and '
        'runs it. The "single-shot" option launches a single Rocket, '
        'whereas the "rapidfire" option loops until all FireWorks are completed.'
    )

    parser = ArgumentParser(description=description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot', help='launch a single Rocket')
    rapid_parser = subparsers.add_parser(
        'rapidfire', help='launch multiple Rockets (loop until all FireWorks complete)')

    # options that only exist on the 'singleshot' sub-command
    single_parser.add_argument('-f', '--fw_id', type=int, default=None, help='specific fw_id to run')
    single_parser.add_argument('--offline', action='store_true',
                               help='run in offline mode (FW.json required)')

    # options that only exist on the 'rapidfire' sub-command
    rapid_parser.add_argument('--nlaunches', default=0,
                              help='num_launches (int or "infinite"; default 0 is all jobs in DB)')
    rapid_parser.add_argument('--sleep', type=int, default=None,
                              help='sleep time between loops (secs)')

    # options shared by both sub-commands
    parser.add_argument('-l', '--launchpad_file', default=LAUNCHPAD_LOC, help='path to launchpad file')
    parser.add_argument('-w', '--fworker_file', default=FWORKER_LOC, help='path to fworker file')
    parser.add_argument('-c', '--config_dir', default=CONFIG_FILE_DIR,
                        help='path to a directory containing the config file (used if -l, -w unspecified)')

    parser.add_argument('--loglvl', default='INFO', help='level to print log messages')
    parser.add_argument('-s', '--silencer', action='store_true', help='shortcut to mute log messages')

    args = parser.parse_args()

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # fall back to the files inside config_dir when no explicit path was given
    if not args.launchpad_file:
        candidate = os.path.join(args.config_dir, 'my_launchpad.yaml')
        if os.path.exists(candidate):
            args.launchpad_file = candidate

    if not args.fworker_file:
        candidate = os.path.join(args.config_dir, 'my_fworker.yaml')
        if os.path.exists(candidate):
            args.fworker_file = candidate

    if args.silencer:
        args.loglvl = 'CRITICAL'

    # offline singleshot runs without a LaunchPad; 'offline' only exists on that sub-command
    if args.command == 'singleshot' and args.offline:
        launchpad = None
    elif args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    fworker = FWorker.from_file(args.fworker_file) if args.fworker_file else FWorker()

    # prime addr lookups
    lookup_logger = get_fw_logger("rlaunch", stream_level="INFO")
    lookup_logger.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker, None, args.nlaunches, -1, args.sleep, args.loglvl)
    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
Exemple #9
0
def rlaunch():
    """
    Parse the rlaunch command line and dispatch to rapidfire or launch_rocket.

    The LaunchPad and FWorker are loaded from the given files when available;
    otherwise default-constructed objects are used (no LaunchPad at all for an
    offline singleshot).
    """
    m_description = (
        'This program launches one or more Rockets. A Rocket grabs a job from the central database and '
        'runs it. The "single-shot" option launches a single Rocket, '
        'whereas the "rapidfire" option loops until all FireWorks are completed.'
    )

    top = ArgumentParser(description=m_description)
    commands = top.add_subparsers(help='command', dest='command')
    singleshot = commands.add_parser('singleshot', help='launch a single Rocket')
    rapid = commands.add_parser('rapidfire',
                                help='launch multiple Rockets (loop until all FireWorks complete)')

    # singleshot-only arguments
    singleshot.add_argument('-f', '--fw_id', help='specific fw_id to run', type=int, default=None)
    singleshot.add_argument('--offline', help='run in offline mode (FW.json required)',
                            action='store_true')

    # rapidfire-only arguments
    rapid.add_argument('--nlaunches', default=0,
                       help='num_launches (int or "infinite"; default 0 is all jobs in DB)')
    rapid.add_argument('--sleep', help='sleep time between loops (secs)', type=int, default=None)

    # arguments accepted regardless of the sub-command
    top.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=LAUNCHPAD_LOC)
    top.add_argument('-w', '--fworker_file', help='path to fworker file', default=FWORKER_LOC)
    top.add_argument('-c', '--config_dir', default=CONFIG_FILE_DIR,
                     help='path to a directory containing the config file (used if -l, -w unspecified)')

    top.add_argument('--loglvl', help='level to print log messages', default='INFO')
    top.add_argument('-s', '--silencer', help='shortcut to mute log messages', action='store_true')

    args = top.parse_args()

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # when a file path is missing, use the conventional file in config_dir if it exists
    if not args.launchpad_file:
        default_lp = os.path.join(args.config_dir, 'my_launchpad.yaml')
        if os.path.exists(default_lp):
            args.launchpad_file = default_lp

    if not args.fworker_file:
        default_fw = os.path.join(args.config_dir, 'my_fworker.yaml')
        if os.path.exists(default_fw):
            args.fworker_file = default_fw

    # the silencer flag overrides whatever log level was requested
    if args.silencer:
        args.loglvl = 'CRITICAL'

    if args.command == 'singleshot' and args.offline:
        launchpad = None
    elif args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    addr_logger = get_fw_logger("rlaunch", stream_level="INFO")
    addr_logger.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker, None, args.nlaunches, -1, args.sleep, args.loglvl)
    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
Exemple #10
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When nlaunches is 0, rapidfire is started again for as long as
    FWData().Running_IDs still holds a non-None value, i.e. some other sub job
    is still running.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData().Running_IDs; its non-None
            values are treated as ids of launches that are still running
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)

    def fire():
        # single definition of the rapidfire call used for the first run and every resubmit
        rapidfire(
            launchpad,
            fworker=fworker,
            m_dir=None,
            nlaunches=nlaunches,
            max_loops=-1,
            sleep_time=sleep,
            strm_lvl=loglvl,
            timeout=timeout,
        )

    fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        live_ids = set(FWData().Running_IDs.values()) - {None}
        if not live_ids:
            break
        # Some other sub jobs are still running
        log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, "Resubmit sub job")
        fire()
    log_multi(l_logger, "Sub job finished")
Exemple #11
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Starts the sub job processes and a pinging thread, waits for all processes to
    finish, then stops the pinging thread and shuts the data server down. The
    cleanup also runs when waiting for the processes is interrupted.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution); the list
            passed in is never modified
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None, so guard before the membership test
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            # work on a copy so the caller's list is left untouched
            total_node_list = list(total_node_list)
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect, firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    try:
        for p in processes:
            p.join()
    finally:
        # stop the ping thread and the data server even if the wait is interrupted,
        # otherwise the still-running ping thread keeps the process alive
        ping_stop.set()
        ping_thread.join()
        ds.shutdown()
Exemple #12
0
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              nlaunches=0,
              max_loops=-1,
              sleep_time=None,
              strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed while Rockets run and is reset to the base
    directory (m_dir, or the directory that was current on entry) before returning,
    including when an exception propagates out of the loop.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            while launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir,
                                                    l_logger,
                                                    prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                # add a small amount of buffer breathing time for DB to refresh, etc.
                time.sleep(0.15)
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # do not leave the process inside the last launcher directory
        os.chdir(curdir)
Exemple #13
0
    def __init__(self,
                 host='localhost',
                 port=27017,
                 database='fireworks',
                 username=None,
                 password=None,
                 filepad_coll_name="filepad",
                 gridfs_coll_name="filepad_gfs",
                 logdir=None,
                 strm_lvl=None,
                 text_mode=False):
        """
        Connect to the database, select the filepad and gridfs collections, set up
        the 'filepad' logger and build the indexes.

        Args:
            host (str): hostname
            port (int): port number (converted with int())
            database (str): database name
            username (str): when truthy, authenticate with this user
            password (str): password used together with username
            filepad_coll_name (str): filepad collection name
            gridfs_coll_name (str): gridfs collection name
            logdir (str): path to the log directory
            strm_lvl (str): the logger stream level; 'INFO' is used when falsy
            text_mode (bool): whether to use text_mode for file read/write (instead of binary). Might be useful if
                working only with text files between Windows and Unix systems

        Raises:
            Exception: "connection failed" or "authentication failed" when the
                corresponding step raises.
        """
        self.host = host
        self.port = int(port)
        self.database = database
        self.username, self.password = username, password
        self.gridfs_coll_name = gridfs_coll_name
        self.text_mode = text_mode

        try:
            client = MongoClient(self.host, self.port)
            self.connection = client
            self.db = client[database]
        except Exception:
            raise Exception("connection failed")

        # authentication is only attempted when a username was supplied
        if self.username:
            try:
                self.db.authenticate(self.username, self.password)
            except Exception:
                raise Exception("authentication failed")

        # set collections: filepad and gridfs
        self.filepad = self.db[filepad_coll_name]
        self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

        # logging
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.logger = get_fw_logger('filepad',
                                    l_dir=logdir,
                                    stream_level=self.strm_lvl)

        # build indexes
        self.build_indexes()
Exemple #14
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When nlaunches is 0, rapidfire is started again for as long as any entry of
    FWData().FiringState is truthy, i.e. some other sub job is still firing.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): mapping stored on FWData().Running_IDs
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): mapping stored on FWData().FiringState; this process sets
            its own pid entry to True while it is inside rapidfire and to False afterwards
        macro_sleep_time (int): secs to sleep between sub job resubmit; when falsy it is
            computed once as sleep time * number of FiringState entries and then reused
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    pid = os.getpid()

    def fire():
        # Mark this process as firing only for the duration of rapidfire. The flag is
        # cleared in `finally` so that a failing rapidfire does not leave it set, which
        # would keep the other sub jobs waiting on this process.
        fw_data.FiringState[pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[pid] = False

    fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [p for p, is_firing in fw_data.FiringState.items() if is_firing]
        if not firing_pids:
            break
        # Some other sub jobs are still running
        macro_sleep_time = macro_sleep_time if macro_sleep_time \
            else sleep_time * len(fw_data.FiringState)
        log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        fire()
    log_multi(l_logger, 'Sub job finished')
Exemple #15
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1,
              sleep_time=None, strm_lvl='INFO', timeout=None):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed while Rockets run and is always reset to the
    base directory (m_dir, or the directory that was current on entry) before returning,
    including when an exception propagates out of the loop.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object)
    :param m_dir: (str) the directory in which to loop Rocket running
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop until max_loops
    :param max_loops: (int) maximum number of loops (default -1 is infinite)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param strm_lvl: (str) level at which to output logs to stdout
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_remains():
        # a falsy timeout (None or 0) disables the time limit entirely
        return not timeout or (datetime.now() - start_time).total_seconds() < timeout

    try:
        while num_loops != max_loops and time_remains():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and time_remains():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                if num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # always return to the base directory, even when a Rocket raises
        os.chdir(curdir)
def launch_rocket(launchpad, fworker=None, logdir=None, strm_lvl=None, fw_id=None):
    """
    Launch one Rocket in the current working directory.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object; a default FWorker() is created when omitted
    :param logdir: directory handed to the logger as l_dir
    :param strm_lvl: stream level handed to the logger
    :param fw_id: forwarded to Rocket as its third argument
    """
    if not fworker:
        fworker = FWorker()
    launcher_log = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)

    launcher_log.info('Launching Rocket')
    # the result of run() is not propagated; this function returns None
    Rocket(launchpad, fworker, fw_id).run()
    launcher_log.info('Rocket finished')
Exemple #17
0
    def __init__(self,
                 host='localhost',
                 port=27017,
                 database='fireworks',
                 username=None,
                 password=None,
                 filepad_coll_name="filepad",
                 gridfs_coll_name="filepad_gfs",
                 logdir=None,
                 strm_lvl=None):
        """
        Connect to MongoDB and set up the filepad collection, the GridFS store and the logger.

        Args:
            host (str): hostname
            port (int): port number (converted with int())
            database (str): database name
            username (str): when set, used to authenticate against the database
            password (str)
            filepad_coll_name (str): filepad collection name
            gridfs_coll_name (str): gridfs collection name
            logdir (str): path to the log directory
            strm_lvl (str): the logger stream level ('INFO' when not given)

        Raises:
            Exception: "connection failed" when creating the client or database handle fails,
                "authentication failed" when authenticating fails.
        """
        self.host = host
        self.port = int(port)
        self.database = database
        self.username = username
        self.password = password
        self.gridfs_coll_name = gridfs_coll_name
        try:
            self.connection = MongoClient(self.host, self.port)
            self.db = self.connection[database]
        except Exception:
            # catch Exception rather than everything, so KeyboardInterrupt / SystemExit
            # are not relabelled as a connection failure
            raise Exception("connection failed")
        try:
            if self.username:
                self.db.authenticate(self.username, self.password)
        except Exception:
            raise Exception("authentication failed")

        # set collections: filepad and gridfs
        self.filepad = self.db[filepad_coll_name]
        self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

        # logging
        self.logdir = logdir
        self.strm_lvl = strm_lvl if strm_lvl else 'INFO'
        self.logger = get_fw_logger('filepad',
                                    l_dir=self.logdir,
                                    stream_level=self.strm_lvl)

        # build indexes
        self.build_indexes()
Exemple #18
0
    def __init__(
        self,
        host="localhost",
        port=27017,
        database="fireworks",
        username=None,
        password=None,
        filepad_coll_name="filepad",
        gridfs_coll_name="filepad_gfs",
        logdir=None,
        strm_lvl=None,
    ):
        """
        Open the MongoDB connection and prepare the filepad collection, GridFS store and logger.

        Args:
            host (str): hostname
            port (int): port number (converted with int())
            database (str): database name
            username (str): when set, used to authenticate against the database
            password (str)
            filepad_coll_name (str): filepad collection name
            gridfs_coll_name (str): gridfs collection name
            logdir (str): path to the log directory
            strm_lvl (str): the logger stream level ("INFO" when not given)

        Raises:
            Exception: "connection failed" when creating the client or database handle fails,
                "authentication failed" when authenticating fails.
        """
        self.host = host
        self.port = int(port)
        self.database = database
        self.username = username
        self.password = password
        self.gridfs_coll_name = gridfs_coll_name
        try:
            self.connection = MongoClient(self.host, self.port)
            self.db = self.connection[database]
        except Exception:
            # not a bare except: KeyboardInterrupt / SystemExit must pass through unchanged
            raise Exception("connection failed")
        try:
            if self.username:
                self.db.authenticate(self.username, self.password)
        except Exception:
            raise Exception("authentication failed")

        # set collections: filepad and gridfs
        self.filepad = self.db[filepad_coll_name]
        self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

        # logging
        self.logdir = logdir
        self.strm_lvl = strm_lvl if strm_lvl else "INFO"
        self.logger = get_fw_logger("filepad", l_dir=self.logdir, stream_level=self.strm_lvl)

        # build indexes
        self.build_indexes()
Exemple #19
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Sets up a shared data server, starts the rapidfire sub-processes, keeps a ping thread
    running while they work, and waits for all of them to finish. The ping thread and the
    data server are shut down even if starting or joining the processes raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution). When
            exclude_current_node is set and the current host is listed, it is removed from
            this list in place.
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; check it first so the membership test cannot
        # raise TypeError
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    try:
        port = ds.address[1]

        manager = Manager()
        running_ids_dict = manager.dict()

        # launch rapidfire processes
        processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                                  sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
        FWData().Running_IDs = running_ids_dict

        # start pinging service
        ping_stop = threading.Event()
        ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
        ping_thread.start()

        try:
            # wait for completion
            for p in processes:
                p.join()
        finally:
            # always stop the ping thread, otherwise it keeps running after a failed join
            ping_stop.set()
            ping_thread.join()
    finally:
        ds.shutdown()
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None, strm_lvl='INFO'):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed back to m_dir (or the starting directory) on exit,
    including when a launch raises.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker object) a default FWorker() is used when omitted
    :param m_dir: (str) the directory in which to loop Rocket running (defaults to the current directory)
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param max_loops: (int) maximum number of loops (the default -1 never matches, i.e. no limit)
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations (RAPIDFIRE_SLEEP_SECS when not given)
    :param strm_lvl: (str) level at which to output logs to stdout
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = fworker if fworker else FWorker()

    num_launched = 0
    num_loops = 0

    try:
        while num_loops != max_loops:
            skip_check = False  # this is used to speed operation
            while skip_check or launchpad.run_exists(fworker):
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                # only a positive nlaunches is a launch quota; without the > 0 guard,
                # nlaunches == 0 would stop here as soon as a rocket failed to run
                # while num_launched was still 0
                if nlaunches > 0 and num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if num_launched == nlaunches or nlaunches == 0:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # never leave the process inside a launcher_ directory
        os.chdir(curdir)
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              logdir=None,
              strm_lvl=None,
              nlaunches=0,
              sleep_time=60,
              max_loops=-1):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories for each Rocket.
    Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed back to m_dir (or the starting directory) on exit,
    including when a launch raises.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object (a default FWorker() is used when omitted)
    :param m_dir: the directory in which to loop Rocket running (defaults to the current directory)
    :param logdir: log directory passed to the logger and to launch_rocket
    :param strm_lvl: stream level passed to the logger and to launch_rocket
    :param nlaunches: 0 means 'until completion', -1 or 'infinite' means 'infinity'
    :param sleep_time: secs to sleep between loop iterations
    :param max_loops: maximum number of sleep loops (the default -1 never matches, i.e. no limit)
    """
    curdir = m_dir if m_dir else os.getcwd()
    fworker = fworker if fworker else FWorker()
    # initialize logger
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=logdir,
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)

    # TODO: wrap in try-except. Use log_exception for exceptions EXCEPT running out of jobs.
    num_launched = 0
    num_loops = 0
    try:
        while num_loops != max_loops:
            while launchpad.run_exists():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir,
                                                    l_logger,
                                                    prefix='launcher_')
                os.chdir(launcher_dir)
                launch_rocket(launchpad, fworker, logdir, strm_lvl)
                # every attempt is counted; launch_rocket's result is not inspected here
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # add a small amount of buffer breathing time for DB to refresh, etc.
                time.sleep(0.1)
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info('Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            l_logger.info('Checking for FWs to run...')
    finally:
        # always return to the starting directory instead of staying in the last launcher_ dir
        os.chdir(curdir)
Exemple #22
0
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Launch one Rocket in the current directory and report what its run() returned.

    :param launchpad: (LaunchPad) may be falsy, in which case no log directory is used (offline mode)
    :param fworker: (FWorker) a default FWorker() is created when omitted
    :param fw_id: (int) if set, a particular FireWork to run
    :param strm_lvl: (str) level at which to output logs to stdout
    :return: the value returned by Rocket.run()
    """
    if not fworker:
        fworker = FWorker()

    # offline mode has no launchpad to ask for a log directory
    log_dir = launchpad.get_logdir() if launchpad else None
    launcher_log = get_fw_logger('rocket.launcher',
                                 l_dir=log_dir,
                                 stream_level=strm_lvl)

    log_multi(launcher_log, 'Launching Rocket')
    ran = Rocket(launchpad, fworker, fw_id).run()
    log_multi(launcher_log, 'Rocket finished')
    return ran
Exemple #23
0
    def submit_to_queue(self, queue_params, script_file):
        """
        Submit script_file to SLURM with sbatch and return the job id.

        for further documentation, see parent object

        :param queue_params: object whose logging_dir attribute is passed to the logger
        :param script_file: (str) path of the script handed to sbatch
        :return: (int) job id parsed from the sbatch output, or None when submission or
            parsing failed (the failure is logged)
        :raises ValueError: if script_file does not exist
        """

        if not os.path.exists(script_file):
            raise ValueError(
                'Cannot find script file located at: {}'.format(script_file))

        # initialize logger
        slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

        # submit the job
        try:
            cmd = ['sbatch', script_file]
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone can block forever
            # once the child fills a pipe buffer
            stdout, stderr = p.communicate()

            # grab the returncode. SLURM returns 0 if the job was successful
            if p.returncode == 0:
                try:
                    # the job id is taken from the 4th whitespace-separated token
                    # (sbatch normally prints 'Submitted batch job <id>')
                    job_id = int(stdout.split()[3])
                    slurm_logger.info(
                        'Job submission was successful and job_id is {}'.
                        format(job_id))
                    return job_id
                except Exception:
                    # probably error parsing job code
                    log_exception(slurm_logger,
                                  'Could not parse job id following slurm...')

            else:
                # some sbatch error, e.g. maybe wrong queue specified, don't have permission to submit, etc...
                msgs = [
                    'Error in job submission with SLURM file {f} and cmd {c}'.
                    format(f=script_file, c=cmd)
                ]
                msgs.append('The error response reads: {}'.format(stderr))
                log_fancy(slurm_logger, 'error', msgs)

        except Exception:
            # random error, e.g. no sbatch on machine!
            log_exception(slurm_logger, 'Running slurm caused an error...')

        # every failure path ends here
        return None
def launch_rocket(launchpad, fworker=None, fw_id=None, strm_lvl='INFO'):
    """
    Run a single rocket in the current directory and hand back the result of its run().

    :param launchpad: (LaunchPad) when falsy, the logger gets no log directory
    :param fworker: (FWorker) replaced by a default FWorker() when omitted
    :param fw_id: (int) if set, a particular Firework to run
    :param strm_lvl: (str) level at which to output logs to stdout
    :return: whatever Rocket.run() returns
    """
    fworker = fworker or FWorker()

    if launchpad:
        log_dir = launchpad.get_logdir()
    else:
        log_dir = None
    logger = get_fw_logger('rocket.launcher', l_dir=log_dir, stream_level=strm_lvl)

    log_multi(logger, 'Launching Rocket')
    outcome = Rocket(launchpad, fworker, fw_id).run()
    log_multi(logger, 'Rocket finished')
    return outcome
def launch_rocket(launchpad,
                  fworker=None,
                  logdir=None,
                  strm_lvl=None,
                  fw_id=None):
    """
    Run a single rocket in the current directory. Nothing is returned.

    :param launchpad: a LaunchPad object
    :param fworker: a FWorker object; FWorker() is used when none is given
    :param logdir: log directory given to the logger
    :param strm_lvl: stream level given to the logger
    :param fw_id: passed through to Rocket
    """
    worker = fworker or FWorker()
    logger = get_fw_logger('rocket.launcher', l_dir=logdir, stream_level=strm_lvl)

    logger.info('Launching Rocket')
    rocket = Rocket(launchpad, worker, fw_id)
    rocket.run()
    logger.info('Rocket finished')
Exemple #26
0
    def get_njobs_in_queue(self, rocket_params, username=None):
        """
        Count the lines of 'qstat -a -u <username>' output that contain the username.

        for further documentation, see parent object

        :param rocket_params: object whose logging_dir attribute is passed to the logger
        :param username: (str) user to query; defaults to the current user (getpass.getuser())
        :return: (int) number of matching lines, or None if qstat exited with a non-zero code
        """

        # TODO: (low-priority) parse the qstat -x output as an alternate way to get this working
        # tmp_file_name = 'tmp_qstat.xml'
        # cmd = ['qstat', '-x']

        # initialize logger
        pbs_logger = get_fw_logger('rocket.pbs', rocket_params.logging_dir)

        # initialize username
        if username is None:
            username = getpass.getuser()

        # run qstat
        cmd = ['qstat', '-a', '-u', username]
        # stderr must be piped too: the error path below reports it, and without a pipe
        # p.stderr is None. Text mode keeps the username comparison str-vs-str.
        p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, universal_newlines=True)
        # communicate() reads both pipes to the end, so the child cannot block on a full pipe
        stdout, stderr = p.communicate()

        # parse the result
        if p.returncode == 0:
            # lines should have this form
            # '1339044.sdb          username  queuename    2012-02-29-16-43  20460   --   --    --  00:20 C 00:09'
            # count lines that include the username in it

            # TODO: only count running or queued jobs. or rather, *don't* count jobs that are 'C'.
            # plain substring test: the username is matched literally, not as a regex
            njobs = sum(1 for line in stdout.splitlines() if username in line)
            pbs_logger.info(
                'The number of jobs currently in the queue is: {}'.format(
                    njobs))
            return njobs

        # there's a problem talking to qstat server?
        msgs = [
            'Error trying to get the number of jobs in the queue using qstat service'
        ]
        msgs.append('The error response reads: {}'.format(stderr))
        log_fancy(pbs_logger, 'error', msgs)
        return None
Exemple #27
0
    def get_njobs_in_queue(self, queue_params, username=None):
        """
        Count the lines of squeue output for the user that contain the username.

        for further documentation, see parent object

        :param queue_params: object whose logging_dir attribute is passed to the logger
        :param username: (str) user to query; defaults to the current user (getpass.getuser())
        :return: (int) number of matching lines, or None if squeue exited with a non-zero code
        """

        # initialize logger
        slurm_logger = get_fw_logger('rocket.slurm', queue_params.logging_dir)

        # initialize username
        if username is None:
            username = getpass.getuser()

        # run squeue
        # NOTE(review): '-o "%u"' is passed as one argv element, quotes included; the lines
        # still contain the username, but confirm this is the intended format string
        cmd = ['squeue', '-o "%u"', '-u', username]
        # stderr must be piped too: the error path below reports it, and without a pipe
        # p.stderr is None. Text mode keeps the username comparison str-vs-str.
        p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, universal_newlines=True)
        # communicate() reads both pipes to the end, so the child cannot block on a full pipe
        stdout, stderr = p.communicate()

        # parse the result
        if p.returncode == 0:
            # lines should have this form
            # username
            # count lines that include the username in it

            # plain substring test: the username is matched literally, not as a regex
            njobs = sum(1 for line in stdout.splitlines() if username in line)
            slurm_logger.info(
                'The number of jobs currently in the queue is: {}'.format(
                    njobs))
            return njobs

        # there's a problem talking to the squeue service?
        msgs = [
            'Error trying to get the number of jobs in the queue using squeue service'
        ]
        msgs.append('The error response reads: {}'.format(stderr))
        log_fancy(slurm_logger, 'error', msgs)
        return None
Exemple #28
0
    def __init__(self, host='localhost', port=27017, name='fireworks',
                 username=None, password=None, logdir=None, strm_lvl=None,
                 user_indices=None, wf_user_indices=None):
        """
        Store the connection settings, create the logger and open the MongoDB collections.

        :param host: host passed to MongoClient
        :param port: port passed to MongoClient
        :param name: name of the database opened on the connection
        :param username: when truthy, used to authenticate against the database
        :param password: password used together with username
        :param logdir: directory handed to the logger as l_dir
        :param strm_lvl: logger stream level; 'INFO' is used when not given
        :param user_indices: kept on the instance; a falsy value becomes a new empty list
        :param wf_user_indices: kept on the instance; a falsy value becomes a new empty list
        """
        self.host, self.port, self.name = host, port, name
        self.username, self.password = username, password

        # logger comes first, before any connection is attempted
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.m_logger = get_fw_logger('launchpad', l_dir=self.logdir, stream_level=self.strm_lvl)

        self.user_indices = user_indices or []
        self.wf_user_indices = wf_user_indices or []

        # connection and database handle
        client = MongoClient(host, port, j=True)
        self.connection = client
        self.db = client[name]
        if username:
            self.db.authenticate(username, password)

        # collection handles
        database = self.db
        self.fireworks = database.fireworks
        self.launches = database.launches
        self.offline_runs = database.offline_runs
        self.fw_id_assigner = database.fw_id_assigner
        self.workflows = database.workflows
Exemple #29
0
    def __init__(self, host='localhost', port=27017, name='fireworks',
                 username=None, password=None, logdir=None, strm_lvl=None,
                 user_indices=None, wf_user_indices=None):
        """
        Remember the connection settings, build the logger and connect to MongoDB.

        :param host: host given to MongoClient
        :param port: port given to MongoClient
        :param name: database name looked up on the connection
        :param username: authentication is attempted only when this is truthy
        :param password: password passed along with username
        :param logdir: log directory for the logger (l_dir)
        :param strm_lvl: stream level for the logger, 'INFO' when not given
        :param user_indices: stored on the instance, [] when falsy
        :param wf_user_indices: stored on the instance, [] when falsy
        """
        # plain settings
        self.host = host
        self.port = port
        self.name = name
        self.username = username
        self.password = password

        # logging
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.m_logger = get_fw_logger(
            'launchpad', l_dir=self.logdir, stream_level=self.strm_lvl)

        # index lists default to fresh empty lists
        self.user_indices = user_indices or []
        self.wf_user_indices = wf_user_indices or []

        # connect and select the database
        self.connection = MongoClient(host, port, j=True)
        self.db = self.connection[name]
        if username:
            self.db.authenticate(username, password)

        # shortcuts to the collections
        db = self.db
        self.fireworks = db.fireworks
        self.launches = db.launches
        self.offline_runs = db.offline_runs
        self.fw_id_assigner = db.fw_id_assigner
        self.workflows = db.workflows
Exemple #30
0
    def __init__(self,
                 host='localhost',
                 port=27017,
                 name='fireworks',
                 username=None,
                 password=None,
                 logdir=None,
                 strm_lvl=None):
        """
        Store the connection settings, create the logger and open the MongoDB collections.

        :param host: host passed to MongoClient
        :param port: port passed to MongoClient
        :param name: name of the database opened on the connection
        :param username: when truthy, used to authenticate against the database
        :param password: password used together with username
        :param logdir: directory handed to the logger as l_dir
        :param strm_lvl: logger stream level; 'INFO' is used when not given
        """
        self.host, self.port, self.name = host, port, name
        self.username, self.password = username, password

        # logger
        self.logdir = logdir
        self.strm_lvl = strm_lvl or 'INFO'
        self.m_logger = get_fw_logger('launchpad', l_dir=self.logdir, stream_level=self.strm_lvl)

        # connection and database handle
        self.connection = MongoClient(host, port, j=True)
        self.database = self.connection[name]
        if username:
            self.database.authenticate(username, password)

        # collection handles
        db = self.database
        self.fireworks = db.fireworks
        self.launches = db.launches
        self.fw_id_assigner = db.fw_id_assigner
        self.links = db.links
Exemple #31
0
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False, pdb_on_exception=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    The working directory is changed back to m_dir (or the starting directory) on exit,
    including when a launch raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
        pdb_on_exception (bool): passed through to launch_rocket
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    def time_ok():
        # has the rapidfire run timed out?
        return (timeout is None or
                (datetime.now() - start_time).total_seconds() < timeout)

    try:
        while num_loops != max_loops and time_ok():
            skip_check = False  # this is used to speed operation
            while (skip_check or launchpad.run_exists(fworker)) and time_ok():
                os.chdir(curdir)
                launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
                os.chdir(launcher_dir)
                if local_redirect:
                    with redirect_local():
                        rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                                   pdb_on_exception=pdb_on_exception)
                else:
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl,
                                               pdb_on_exception=pdb_on_exception)

                if rocket_ran:
                    num_launched += 1
                elif not os.listdir(launcher_dir):
                    # remove the empty shell of a directory
                    os.chdir(curdir)
                    os.rmdir(launcher_dir)
                # only a positive nlaunches acts as a launch quota
                if nlaunches > 0 and num_launched == nlaunches:
                    break
                if launchpad.run_exists(fworker):
                    skip_check = True  # don't wait, pull the next FW right away
                else:
                    # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                    time.sleep(0.15)
                    skip_check = False
            if nlaunches == 0:
                # 'until completion': stop once the launchpad reports no future runs
                if not launchpad.future_run_exists(fworker):
                    break
            elif num_launched == nlaunches:
                break
            log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            num_loops += 1
            log_multi(l_logger, 'Checking for FWs to run...')
    finally:
        # never leave the process inside a launcher_ directory
        os.chdir(curdir)
Exemple #32
0
    def __init__(
        self,
        host="localhost",
        port=27017,
        database="fireworks",
        username=None,
        password=None,
        authsource=None,
        uri_mode=False,
        mongoclient_kwargs=None,
        filepad_coll_name="filepad",
        gridfs_coll_name="filepad_gfs",
        logdir=None,
        strm_lvl=None,
        text_mode=False,
    ):
        """
        Connect to MongoDB and set up the filepad collection, the GridFS store and the logger.

        Args:
            host (str): hostname, or the full MongoDB URI when uri_mode is True
            port (int): port number (converted with int())
            database (str): database name
            username (str)
            password (str)
            authsource (str): authSource parameter for MongoDB authentication; defaults to the
                database name if not set
            uri_mode (bool): if set True, all Mongo connection parameters occur through a MongoDB
                URI string (set as the host); the database name is then taken from the URI.
            mongoclient_kwargs (dict): extra keyword arguments for MongoClient (not used in
                uri_mode); an empty dict when not given
            filepad_coll_name (str): filepad collection name
            gridfs_coll_name (str): gridfs collection name
            logdir (str): path to the log directory
            strm_lvl (str): the logger stream level ("INFO" when not given)
            text_mode (bool): whether to use text_mode for file read/write (instead of binary).
                Might be useful if working only with text files between Windows and Unix systems
        """
        self.host = host
        self.port = int(port)
        self.database = database
        self.username = username
        self.password = password
        self.authsource = authsource or self.database
        self.mongoclient_kwargs = mongoclient_kwargs or {}
        self.uri_mode = uri_mode

        self.gridfs_coll_name = gridfs_coll_name
        self.text_mode = text_mode

        # get connection
        if not uri_mode:
            self.connection = MongoClient(
                self.host,
                self.port,
                socketTimeoutMS=MONGO_SOCKET_TIMEOUT_MS,
                username=self.username,
                password=self.password,
                authSource=self.authsource,
                **self.mongoclient_kwargs
            )
            self.db = self.connection[self.database]
        else:
            self.connection = MongoClient(host)
            # the database name is the last path segment of the URI, minus any query string
            uri_tail = host.rsplit("/", 1)[-1]
            dbname = uri_tail.partition("?")[0]
            self.db = self.connection[dbname]

        # set collections: filepad and gridfs
        self.filepad = self.db[filepad_coll_name]
        self.gridfs = gridfs.GridFS(self.db, gridfs_coll_name)

        # logging
        self.logdir = logdir
        self.strm_lvl = strm_lvl or "INFO"
        self.logger = get_fw_logger("filepad", l_dir=self.logdir, stream_level=self.strm_lvl)

        # build indexes
        self.build_indexes()
def arlaunch():
    """
    Command-line entry point for launching Rockets.

    Builds an argument parser with three sub-commands, resolves the LaunchPad
    and FWorker configuration, and dispatches to the matching launcher:

    * ``rapidfire`` calls ``rapidfire``
    * ``multi`` calls ``launch_multiprocess``
    * any other command value (i.e. ``singleshot``) calls ``launch_rocket``

    The LaunchPad file is resolved in this order: the ``-l`` option, then
    ``my_launchpad.yaml`` inside ``--config_dir`` if that file exists, then
    ``LAUNCHPAD_LOC``. No LaunchPad is created for ``singleshot --offline``.
    """
    m_description = 'This program launches one or more Rockets. A Rocket retrieves a job from the ' \
                    'central database and runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot',
                                          help='launch a single Rocket')
    rapid_parser = subparsers.add_parser(
        'rapidfire',
        help='launch multiple Rockets (loop until all FireWorks complete)')
    multi_parser = subparsers.add_parser(
        'multi', help='launches multiple Rockets simultaneously')

    # --- singleshot options ---
    single_parser.add_argument('-f',
                               '--fw_id',
                               help='specific fw_id to run',
                               default=None,
                               type=int)
    single_parser.add_argument('--offline',
                               help='run in offline mode (FW.json required)',
                               action='store_true')
    single_parser.add_argument('--pdb',
                               help='shortcut to invoke debugger on error',
                               action='store_true')

    # --- rapidfire options ---
    # nlaunches has no ``type`` because it accepts either an int or "infinite"
    rapid_parser.add_argument('--nlaunches',
                              help='num_launches (int or "infinite"; '
                              'default 0 is all jobs in DB)',
                              default=0)
    rapid_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None,
        type=int)
    rapid_parser.add_argument(
        '--max_loops',
        help='after this many sleep loops, quit even in '
        'infinite nlaunches mode (default -1 is infinite loops)',
        default=-1,
        type=int)
    rapid_parser.add_argument('--sleep',
                              help='sleep time between loops (secs)',
                              default=None,
                              type=int)
    rapid_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # --- multi options ---
    multi_parser.add_argument('num_jobs',
                              help='the number of jobs to run in parallel',
                              type=int)
    multi_parser.add_argument('--nlaunches',
                              help='number of FireWorks to run in series per '
                              'parallel job (int or "infinite"; default 0 is '
                              'all jobs in DB)',
                              default=0)
    # the separating space was missing here, rendering "mode(secs)"
    multi_parser.add_argument(
        '--sleep',
        help='sleep time between loops in infinite launch mode '
        '(secs)',
        default=None,
        type=int)
    multi_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None,
        type=int)
    multi_parser.add_argument(
        '--nodefile',
        help='nodefile name or environment variable name '
        'containing the node file name (for populating'
        ' FWData only)',
        default=None,
        type=str)
    multi_parser.add_argument(
        '--ppn',
        help='processors per node (for populating FWData only)',
        default=1,
        type=int)
    # the separating space was missing here, rendering "nodeas compute node"
    multi_parser.add_argument('--exclude_current_node',
                              help="Don't use the script launching node "
                              "as compute node",
                              action="store_true")
    multi_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # --- options shared by all sub-commands ---
    parser.add_argument('-l',
                        '--launchpad_file',
                        help='path to launchpad file')
    parser.add_argument('-w',
                        '--fworker_file',
                        required=True,
                        help='path to fworker file')
    parser.add_argument('-c',
                        '--config_dir',
                        help='path to a directory containing the config file '
                        '(used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)

    parser.add_argument('--loglvl',
                        help='level to print log messages',
                        default='INFO')
    parser.add_argument('-s',
                        '--silencer',
                        help='shortcut to mute log messages',
                        action='store_true')

    try:
        import argcomplete
        argcomplete.autocomplete(parser)
        # This supports bash autocompletion. To enable this, pip install
        # argcomplete, activate global completion, or add
        #      eval "$(register-python-argcomplete rlaunch)"
        # into your .bash_profile or .bashrc
    except ImportError:
        # argcomplete is optional; without it there is simply no completion
        pass

    args = parser.parse_args()

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # fall back to the config directory, then to the global default location
    if not args.launchpad_file and os.path.exists(
            os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir,
                                           'my_launchpad.yaml')
    elif not args.launchpad_file:
        args.launchpad_file = LAUNCHPAD_LOC

    # -s overrides whatever --loglvl was given
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    # ``args.offline`` only exists for singleshot, hence the command check first
    if args.command == 'singleshot' and args.offline:
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(
            args.launchpad_file) if args.launchpad_file else LaunchPad(
                strm_lvl=args.loglvl)

    fworker = AiiDAFWorker.from_file(args.fworker_file)

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad,
                  fworker=fworker,
                  m_dir=None,
                  nlaunches=args.nlaunches,
                  max_loops=args.max_loops,
                  sleep_time=args.sleep,
                  strm_lvl=args.loglvl,
                  timeout=args.timeout,
                  local_redirect=args.local_redirect)
    elif args.command == 'multi':
        total_node_list = None
        if args.nodefile:
            # --nodefile may name an environment variable holding the path
            if args.nodefile in os.environ:
                args.nodefile = os.environ[args.nodefile]
            with open(args.nodefile, 'r') as fhandle:
                total_node_list = [line.strip() for line in fhandle]
        launch_multiprocess(launchpad,
                            fworker,
                            args.loglvl,
                            args.nlaunches,
                            args.num_jobs,
                            args.sleep,
                            total_node_list,
                            args.ppn,
                            timeout=args.timeout,
                            exclude_current_node=args.exclude_current_node,
                            local_redirect=args.local_redirect)
    else:
        launch_rocket(launchpad,
                      fworker,
                      args.fw_id,
                      args.loglvl,
                      pdb_on_exception=args.pdb)
Exemple #34
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=10,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO'):
    """
    Submit many jobs to the queue.

    Jobs are submitted one at a time via launch_rocket_to_queue into a
    ``block_*`` directory below launch_dir. Submission continues while the
    queue holds fewer than njobs_queue jobs and the LaunchPad reports
    something to run. Errors raised during the loop are logged and not
    re-raised; KeyboardInterrupt and SystemExit propagate to the caller.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker)
    :param qadapter: (QueueAdapterBase)
    :param launch_dir: directory where we want to write the blocks
    :param nlaunches: total number of launches desired; "infinite" for loop, 0 for one round
    :param njobs_queue: stops submitting jobs when njobs_queue jobs are in the queue
    :param njobs_block: automatically write a new block when njobs_block jobs are in a single block
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :raises ValueError: if launch_dir does not exist
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    # -1 can never equal num_launched, so "infinite" never hits the stop test
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    # make sure launch_dir exists:
    if not os.path.exists(launch_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launch_dir))

    num_launched = 0
    try:
        l_logger.info('getting queue adapter')

        # reuse the newest existing block unless configured otherwise
        prev_blocks = sorted(glob.glob(os.path.join(launch_dir, 'block_*')),
                             reverse=True)
        if prev_blocks and not ALWAYS_CREATE_NEW_BLOCK:
            block_dir = os.path.abspath(
                os.path.join(launch_dir, prev_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))
        else:
            block_dir = create_datestamp_dir(launch_dir, l_logger)

        while True:
            # get number of jobs in queue
            jobs_in_queue = _get_number_of_jobs_in_queue(
                qadapter, njobs_queue, l_logger)
            job_counter = 0  # this is for QSTAT_FREQUENCY option

            while jobs_in_queue < njobs_queue and launchpad.run_exists(
                    fworker):
                l_logger.info('Launching a rocket!')

                # switch to new block dir if it got too big
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info(
                        'Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # launch a single job
                if not launch_rocket_to_queue(launchpad, fworker, qadapter,
                                              block_dir, reserve, strm_lvl,
                                              True):
                    raise RuntimeError("Launch unsuccessful!")
                num_launched += 1
                if num_launched == nlaunches:
                    break
                # wait for the queue system to update
                l_logger.info('Sleeping for {} seconds...zzz...'.format(
                    QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                # count locally between queue queries; the queue itself is
                # only re-queried every QSTAT_FREQUENCY submissions
                jobs_in_queue += 1
                job_counter += 1
                if job_counter % QSTAT_FREQUENCY == 0:
                    job_counter = 0
                    jobs_in_queue = _get_number_of_jobs_in_queue(
                        qadapter, njobs_queue, l_logger)

            # nlaunches == 0 means a single round
            if num_launched == nlaunches or nlaunches == 0:
                break
            l_logger.info(
                'Finished a round of launches, sleeping for {} secs'.format(
                    sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and sys.exit()
        # are not logged as launcher errors and swallowed
        log_exception(l_logger, 'Error with queue launcher rapid fire!')
Exemple #35
0
from fireworks.utilities.fw_serializers import DATETIME_HANDLER
from fireworks.utilities.fw_utilities import get_fw_logger
from fireworks.core.launchpad import LaunchPad
from fireworks.fw_config import WEBSERVER_PERFWARNINGS
import fireworks.flask_site.helpers as fwapp_util
from fireworks.flask_site.util import jsonify

# Flask application object for the FireWorks web GUI module.
app = Flask(__name__)
# NOTE(review): set as a plain attribute on the app object; confirm that
# something actually reads ``use_reloader`` from here.
app.use_reloader = True
# Flask secret key: taken from FWAPP_SECRET_KEY when set, otherwise 24 random
# bytes generated at import time (so it differs on every process start).
app.secret_key = os.environ.get("FWAPP_SECRET_KEY", os.urandom(24))

# NOTE(review): no use of ``hello`` is visible here - TODO confirm it is needed.
hello = __name__
# Start out as empty dicts attached to the app; presumably base query filters
# for fireworks / workflows - TODO confirm against the views that read them.
app.BASE_Q = {}
app.BASE_Q_WF = {}

logger = get_fw_logger('app')

# presumably the page size for paginated listings - TODO confirm
PER_PAGE = 20
# Keys of Firework.STATE_RANKS ordered by their associated rank value
# (ascending), assuming STATE_RANKS is a mapping of state name -> rank.
STATES = sorted(Firework.STATE_RANKS, key=Firework.STATE_RANKS.get)


def check_auth(username, password):
    """
    Check whether a username / password combination is valid.

    The supplied credentials are compared against the ``WEBGUI_USERNAME`` and
    ``WEBGUI_PASSWORD`` entries of ``app.config``. If either entry is missing
    (``None``), authentication is treated as disabled and every combination
    is accepted.

    :param username: user name supplied by the client
    :param password: password supplied by the client
    :return: (bool) True if access should be granted, False otherwise
    """
    expected_user = app.config.get("WEBGUI_USERNAME")
    expected_password = app.config.get("WEBGUI_PASSWORD")

    if (expected_user is None) or (expected_password is None):
        return True

    # Without this comparison the function falls through and returns None
    # (falsy), which rejects even valid credentials.
    return username == expected_user and password == expected_password
Exemple #36
0
# Flask application object for the FireWorks web GUI module.
app = Flask(__name__)


# Allow application to run under a service prefix url
# (only applied when FW_APPLICATION_ROOT is set to a non-empty value)
if os.environ.get("FW_APPLICATION_ROOT"):
    app.config["APPLICATION_ROOT"] = os.environ.get("FW_APPLICATION_ROOT")


# NOTE(review): set as a plain attribute on the app object; confirm that
# something actually reads ``use_reloader`` from here.
app.use_reloader = True
# Flask secret key: taken from FWAPP_SECRET_KEY when set, otherwise 24 random
# bytes generated at import time (so it differs on every process start).
app.secret_key = os.environ.get("FWAPP_SECRET_KEY", os.urandom(24))

# NOTE(review): no use of ``hello`` is visible here - TODO confirm it is needed.
hello = __name__
# Start out as empty dicts attached to the app; presumably base query filters
# for fireworks / workflows - TODO confirm against the views that read them.
app.BASE_Q = {}
app.BASE_Q_WF = {}

logger = get_fw_logger("app")

# presumably the page size for paginated listings - TODO confirm
PER_PAGE = 20
# Keys of Firework.STATE_RANKS ordered by their associated rank value
# (ascending), assuming STATE_RANKS is a mapping of state name -> rank.
STATES = sorted(Firework.STATE_RANKS, key=Firework.STATE_RANKS.get)


def check_auth(username, password):
    """
    Check whether a username / password combination is valid.

    The supplied credentials are compared against the ``WEBGUI_USERNAME`` and
    ``WEBGUI_PASSWORD`` entries of ``app.config``. If either entry is missing
    (``None``), authentication is treated as disabled and every combination
    is accepted.

    :param username: user name supplied by the client
    :param password: password supplied by the client
    :return: (bool) True if access should be granted, False otherwise
    """
    expected_user = app.config.get("WEBGUI_USERNAME")
    expected_password = app.config.get("WEBGUI_PASSWORD")

    if (expected_user is None) or (expected_password is None):
        return True

    # Without this comparison the function falls through and returns None
    # (falsy), which rejects even valid credentials.
    return username == expected_user and password == expected_password
Exemple #37
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False):
    """
    Write a queue script for one job and submit it to the queue.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when falsy
    :param qadapter: (QueueAdapterBase) copied before use, the caller's object is not modified
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :param create_launcher_dir: (bool) Whether to create a subfolder launcher+timestamp, if needed
    :return: the value returned by qadapter.submit_to_queue on success; False if
        there is nothing to run or if writing/submitting the script failed
    :raises ValueError: if launcher_dir does not exist or the combination of
        reserve and the adapter's 'rocket_launch' command is not allowed
    """
    fworker = fworker or FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    # private copy: reservation mode edits the adapter below
    qadapter = load_object(qadapter.to_dict())

    # both stay None outside reservation mode
    fw = None
    launch_id = None

    # --- argument validation ---
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError(
            "Must use reservation mode (-r option) of qlaunch when using offline option of rlaunch!!"
        )

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    # --- nothing to do ---
    if not launchpad.run_exists(fworker):
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False

    try:
        if reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad.reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!')
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # queue job name comes from the FW name, cut to the allowed length
            qadapter.update(
                {'job_name': get_slug(fw.name)[:QUEUE_JOBNAME_MAXLEN]})

            # per-Firework overrides of the queue parameters
            if '_queueadapter' in fw.spec:
                l_logger.debug('updating queue params using Firework spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # pin the submitted job to the reserved Firework
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '_launch_dir' in fw.spec:
                # the Firework asks for its own directory; relative paths are
                # resolved against the launcher directory
                spec_dir = os.path.expandvars(fw.spec['_launch_dir'])
                if os.path.isabs(spec_dir):
                    launcher_dir = spec_dir
                else:
                    launcher_dir = os.path.join(launcher_dir, spec_dir)

                try:
                    os.makedirs(launcher_dir)
                except OSError as err:
                    # an already existing path is fine, anything else is not
                    if err.errno != errno.EEXIST:
                        raise

                launchpad.change_launch_dir(launch_id, launcher_dir)
            elif create_launcher_dir:
                launcher_dir = create_datestamp_dir(launcher_dir,
                                                    l_logger,
                                                    prefix='launcher_')
                launchpad.change_launch_dir(launch_id, launcher_dir)
        elif create_launcher_dir:
            launcher_dir = create_datestamp_dir(launcher_dir,
                                                l_logger,
                                                prefix='launcher_')

        l_logger.info('moving to launch_dir {}'.format(launcher_dir))

        with cd(launcher_dir):
            if '--offline' in qadapter['rocket_launch']:
                setup_offline_job(launchpad, fw, launch_id)

            l_logger.debug('writing queue script')
            with open(SUBMIT_SCRIPT_NAME, 'w') as script_file:
                script_file.write(qadapter.get_script_str(launcher_dir))

            l_logger.info('submitting queue script')
            reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)

            if reservation_id:
                if reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            else:
                # submission failed: give the Firework back before bailing out
                if reserve:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                            fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                raise RuntimeError(
                    'queue script could not be submitted, check queue script/queue adapter/queue server status!'
                )
        return reservation_id

    except:
        # every failure above (including the RuntimeError) ends up here and is
        # reported to the caller as a plain False
        log_exception(l_logger, 'Error writing/submitting queue script!')
        return False
Exemple #38
0
 def get_qlogger(self, name):
     """
     Return a logger called ``name`` for this object.

     When the object has no "logdir" entry the logger is created with
     stream_level "CRITICAL"; otherwise the "logdir" value is passed to
     get_fw_logger as its second argument.
     """
     if "logdir" not in self:
         return get_fw_logger(name, stream_level="CRITICAL")
     return get_fw_logger(name, self["logdir"])
Exemple #39
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO',
                           create_launcher_dir=False,
                           fill_mode=False,
                           fw_id=None):
    """
    Submit a single job to the queue.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker): a default FWorker() is used when falsy
        qadapter (QueueAdapterBase): copied before use; the caller's object is
            not modified
        launcher_dir (str): The directory where to submit the job
        reserve (bool): Whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        create_launcher_dir (bool): Whether to create a subfolder launcher+timestamp, if needed
        fill_mode (bool): whether to submit jobs even when there is nothing to run
            (only in non-reservation mode)
        fw_id (int): specific fw_id to reserve (reservation mode only)

    Returns:
        The value returned by qadapter.submit_to_queue on success; False when
        reservation finds no Firework or when writing/submitting fails; None
        when there is nothing to run and fill_mode is off.

    Raises:
        ValueError: if launcher_dir does not exist, or for a disallowed
            combination of reserve, fill_mode, fw_id and the adapter's
            'rocket_launch' command.
    """
    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)

    l_logger.debug('getting queue adapter')
    qadapter = load_object(qadapter.to_dict(
    ))  # make a defensive copy, mainly for reservation mode

    fw, launch_id = None, None  # only needed in reservation mode

    # --- argument validation (raised to the caller, not logged) ---
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if '--offline' in qadapter['rocket_launch'] and not reserve:
        raise ValueError("Must use reservation mode (-r option) of qlaunch "
                         "when using offline option of rlaunch!!")

    if reserve and 'singleshot' not in qadapter.get('rocket_launch', ''):
        raise ValueError(
            'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
        )

    if fill_mode and reserve:
        raise ValueError(
            "Fill_mode cannot be used in conjunction with reserve mode!")

    if fw_id and not reserve:
        raise ValueError(
            "qlaunch for specific fireworks may only be used in reservation mode."
        )

    # fill_mode skips the "is there anything to run" query entirely
    if fill_mode or launchpad.run_exists(fworker):
        # the except handler below uses launch_id to decide whether a
        # reservation has to be rolled back
        launch_id = None
        try:
            if reserve:
                # NOTE(review): the debug message is only emitted when a
                # specific fw_id was requested - confirm this is intended.
                if fw_id:
                    l_logger.debug('finding a FW to reserve...')
                fw, launch_id = launchpad.reserve_fw(fworker,
                                                     launcher_dir,
                                                     fw_id=fw_id)
                if not fw:
                    l_logger.info(
                        'No jobs exist in the LaunchPad for submission to queue!'
                    )
                    return False
                l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

                # update qadapter job_name based on FW name
                job_name = get_slug(fw.name)[0:QUEUE_JOBNAME_MAXLEN]
                qadapter.update({'job_name': job_name})

                # per-Firework overrides of the queue parameters
                if '_queueadapter' in fw.spec:
                    l_logger.debug(
                        'updating queue params using Firework spec..')
                    qadapter.update(fw.spec['_queueadapter'])

                # reservation mode includes --fw_id in rocket launch
                qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

                # update launcher_dir if _launch_dir is selected in reserved fw
                if '_launch_dir' in fw.spec:
                    fw_launch_dir = os.path.expandvars(fw.spec['_launch_dir'])

                    # relative paths are resolved against launcher_dir
                    if not os.path.isabs(fw_launch_dir):
                        fw_launch_dir = os.path.join(launcher_dir,
                                                     fw_launch_dir)

                    launcher_dir = fw_launch_dir

                    makedirs_p(launcher_dir)

                    # record the new directory on the reserved launch
                    launchpad.change_launch_dir(launch_id, launcher_dir)
                elif create_launcher_dir:
                    # create launcher_dir
                    launcher_dir = create_datestamp_dir(launcher_dir,
                                                        l_logger,
                                                        prefix='launcher_')
                    launchpad.change_launch_dir(launch_id, launcher_dir)

            elif create_launcher_dir:
                # create launcher_dir
                launcher_dir = create_datestamp_dir(launcher_dir,
                                                    l_logger,
                                                    prefix='launcher_')

            # move to the launch directory
            l_logger.info('moving to launch_dir {}'.format(launcher_dir))

            with cd(launcher_dir):

                if '--offline' in qadapter['rocket_launch']:
                    setup_offline_job(launchpad, fw, launch_id)

                l_logger.debug('writing queue script')
                # the file is opened before the script text is generated
                with open(SUBMIT_SCRIPT_NAME, 'w') as f:
                    queue_script = qadapter.get_script_str(launcher_dir)
                    f.write(queue_script)

                l_logger.info('submitting queue script')
                reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
                if not reservation_id:
                    # caught by the handler below, which un-reserves the FW
                    raise RuntimeError(
                        'queue script could not be submitted, check queue '
                        'script/queue adapter/queue server status!')
                elif reserve:
                    launchpad.set_reservation_id(launch_id, reservation_id)
            return reservation_id

        # Bare except: every failure in the try block (including the
        # RuntimeError above and interrupts) is logged and reported as False,
        # after rolling back a reservation if one was made.
        except:
            log_exception(l_logger, 'Error writing/submitting queue script!')
            if reserve and launch_id is not None:
                try:
                    l_logger.info(
                        'Un-reserving FW with fw_id, launch_id: {}, {}'.format(
                            fw.fw_id, launch_id))
                    launchpad.cancel_reservation(launch_id)
                    launchpad.forget_offline(launch_id)
                # a failed rollback is logged only; the function still returns False
                except:
                    log_exception(
                        l_logger,
                        'Error unreserving FW with fw_id {}'.format(fw.fw_id))

            return False

    else:
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return None  # note: this is a hack (rather than False) to indicate a soft failure to rapidfire()
Exemple #40
0
def launch_rocket_to_queue(launchpad,
                           fworker,
                           qadapter,
                           launcher_dir='.',
                           reserve=False,
                           strm_lvl='INFO'):
    """
    Submit a single job to the queue.

    The process changes its working directory to launcher_dir, writes the
    queue script there and submits it through the queue adapter. In
    reservation mode a FireWork is reserved first and a copy of the queue
    adapter is customized with the FireWork's name, the '_queueadapter'
    entry of its spec (if any) and a '--fw_id' option.

    :param launchpad: (LaunchPad)
    :param fworker: (FWorker) a default FWorker() is used when this is falsy
    :param qadapter: (QueueAdapterBase)
    :param launcher_dir: (str) The directory where to submit the job
    :param reserve: (bool) Whether to queue in reservation mode
    :param strm_lvl: (str) level at which to stream log messages
    :return: the reservation id returned by the queue adapter on success;
        False when there is nothing to run or when an error occurred while
        writing/submitting the script (the error is logged, not raised)
    :raises ValueError: if launcher_dir does not exist
    """

    fworker = fworker if fworker else FWorker()
    launcher_dir = os.path.abspath(launcher_dir)
    l_logger = get_fw_logger('queue.launcher',
                             l_dir=launchpad.logdir,
                             stream_level=strm_lvl)
    # get the queue adapter
    l_logger.debug('getting queue adapter')
    # make a defensive copy, mainly for reservation mode
    qadapter = load_object(qadapter.to_dict())

    # make sure launch_dir exists:
    if not os.path.exists(launcher_dir):
        raise ValueError(
            'Desired launch directory {} does not exist!'.format(launcher_dir))

    if not launchpad.run_exists(fworker):
        l_logger.info(
            'No jobs exist in the LaunchPad for submission to queue!')
        return False

    # Bound *before* the try block: the finally clause reads it, so it must
    # exist even if the very first statement inside the try raises.
    oldlaunch_dir = None
    try:
        # move to the launch directory
        l_logger.info('moving to launch_dir {}'.format(launcher_dir))
        os.chdir(launcher_dir)

        if '--offline' in qadapter['rocket_launch'] and not reserve:
            raise ValueError(
                "Must use reservation mode (-r option) of qlaunch when using offline mode (--offline option) of rlaunch!!"
            )
        elif reserve:
            l_logger.debug('finding a FW to reserve...')
            fw, launch_id = launchpad._reserve_fw(fworker, launcher_dir)
            if not fw:
                l_logger.info(
                    'No jobs exist in the LaunchPad for submission to queue!'
                )
                return False
            l_logger.info('reserved FW with fw_id: {}'.format(fw.fw_id))

            # set the job name to the FW name (at most 20 characters;
            # slicing a shorter string is a no-op)
            job_name = get_slug(fw.name)[0:20]
            qadapter.update({'job_name': job_name})

            if '_queueadapter' in fw.spec:
                l_logger.debug(
                    'updating queue params using FireWork spec..')
                qadapter.update(fw.spec['_queueadapter'])

            # update the exe to include the FW_id
            if 'singleshot' not in qadapter.get('rocket_launch', ''):
                raise ValueError(
                    'Reservation mode of queue launcher only works for singleshot Rocket Launcher!'
                )
            qadapter['rocket_launch'] += ' --fw_id {}'.format(fw.fw_id)

            if '--offline' in qadapter['rocket_launch']:
                # handle _launch_dir parameter early...
                if '_launch_dir' in fw.spec:
                    os.chdir(fw.spec['_launch_dir'])
                    oldlaunch_dir = launcher_dir
                    launcher_dir = os.path.abspath(os.getcwd())
                    launchpad._change_launch_dir(launch_id, launcher_dir)

                # write FW.json
                fw.to_file("FW.json")
                # write Launchid
                with open('FW_offline.json', 'w') as f:
                    f.write('{"launch_id":%s}' % launch_id)

                launchpad.add_offline_run(launch_id, fw.fw_id, fw.name)

        # write and submit the queue script using the queue adapter
        l_logger.debug('writing queue script')
        with open(SUBMIT_SCRIPT_NAME, 'w') as f:
            queue_script = qadapter.get_script_str(launcher_dir)
            f.write(queue_script)
        l_logger.info('submitting queue script')
        reservation_id = qadapter.submit_to_queue(SUBMIT_SCRIPT_NAME)
        if not reservation_id:
            raise RuntimeError(
                'queue script could not be submitted, check queue adapter and queue server status!'
            )
        elif reserve:
            launchpad.set_reservation_id(launch_id, reservation_id)
        return reservation_id

    except Exception:
        # Ordinary failures are logged and reported as False;
        # KeyboardInterrupt/SystemExit are deliberately left to propagate.
        log_exception(l_logger, 'Error writing/submitting queue script!')
        return False

    finally:
        if oldlaunch_dir:
            # this only matters in --offline mode with _launch_dir!
            os.chdir(oldlaunch_dir)
Exemple #41
0
 def get_qlogger(self, name):
     """
     Return a FireWorks logger called `name`.

     When this mapping has no 'logdir' key, the logger is requested with
     stream_level='CRITICAL'; otherwise self['logdir'] is passed as the
     second positional argument of get_fw_logger (presumably the log
     directory -- confirm against get_fw_logger's signature).
     """
     if 'logdir' not in self:
         return get_fw_logger(name, stream_level='CRITICAL')
     return get_fw_logger(name, self['logdir'])
Exemple #42
0
    def run(self, pdb_on_exception=False):
        """
        Run the rocket (check out a job from the database and execute it)

        When no LaunchPad is attached (offline mode) the Firework is read from
        FW.json in the current directory and start/completion information is
        written to FW_offline.json instead of the database.

        Args:
            pdb_on_exception (bool): whether to invoke the debugger on
                a caught exception.  Default False.

        Returns:
            bool: False if no Firework could be checked out, True otherwise
                (including the cases where the Firework ended up FIZZLED).
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        logdir = lp.get_logdir() if lp else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                                 stream_level=ROCKET_STREAM_LOGLEVEL)

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['started_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        final_state = None
        ping_stop = None
        btask_stops = []

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                # A relative _launch_dir must be made absolute *before* the
                # chdir below; otherwise every later use of launch_dir
                # (samefile, change_launch_dir, copy_tree, checkpoints) would
                # be resolved against the new working directory.
                if not os.path.isabs(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                # best-effort cleanup of the now unused original directory
                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    try:
                        os.rmdir(prev_dir)
                    except Exception:
                        pass

            recovery = m_fw.spec.get('_recovery', None)
            if recovery:
                # resume from a checkpoint: restore the accumulated results and
                # the index of the task to start from
                recovery_dir = recovery.get('_prev_dir')
                recovery_mode = recovery.get('_mode')
                starting_task = recovery.get('_task_n')
                all_stored_data.update(recovery.get('_all_stored_data'))
                all_update_spec.update(recovery.get('_all_update_spec'))
                all_mod_spec.extend(recovery.get('_all_mod_spec'))
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                              recovery_dir))
                if recovery_mode == 'cp' and launch_dir != recovery_dir:
                    if lp:
                        l_logger.log(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                        launch_dir))
                    distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

            else:
                starting_task = 0
                files_in = m_fw.spec.get("_files_in", {})
                prev_files = m_fw.spec.get("_files_prev", {})
                for f in set(files_in.keys()).intersection(prev_files.keys()):
                    # We use zopen for the file objects for transparent handling
                    # of zipped files. shutil.copyfileobj does the actual copy
                    # in chunks that avoid memory issues.
                    with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                        shutil.copyfileobj(fin, fout)

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'. \
                    format(m_fw.fw_id, os.getcwd())
                l_logger.log(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the Firetasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                # record where we are so that a later run can recover from here
                checkpoint = {'_task_n': t_counter,
                              '_all_stored_data': all_stored_data,
                              '_all_update_spec': all_update_spec,
                              '_all_mod_spec': all_mod_spec}
                Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

                if lp:
                    l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.fw_id = m_fw.fw_id
                    if FWData().MULTIPROCESSING:
                        # hack because AutoProxy manager can't access attributes
                        t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                    else:
                        t.launchpad = self.launchpad

                if my_spec.get("_add_fworker"):
                    t.fworker = self.fworker

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    # the task itself failed: record the failure and FIZZLE
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    if pdb_on_exception:
                        pdb.post_mortem()
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as ser_err:
                        if lp:
                            l_logger.log(logging.WARNING,
                                         "Exception couldn't be serialized: %s " % ser_err)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except Exception:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                     '_task': m_task,
                                                     '_exception': {'_stacktrace': tb,
                                                                    '_details': exception_details}},
                                        exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        final_state = 'FIZZLED'
                        lp.complete_launch(launch_id, m_action, final_state)
                    else:
                        fpath = zpath("FW_offline.json")
                        with zopen(fpath) as f_in:
                            d = json.loads(f_in.read())
                            d['fwaction'] = m_action.to_dict()
                            d['state'] = 'FIZZLED'
                            d['completed_on'] = datetime.utcnow().isoformat()
                            with zopen(fpath, "wt") as f_out:
                                f_out.write(json.dumps(d, ensure_ascii=False))

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return
                # it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this
                # particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            # the final action carries the results accumulated over all tasks
            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                final_state = 'COMPLETED'
                lp.complete_launch(launch_id, m_action, final_state)
            else:

                fpath = zpath("FW_offline.json")
                with zopen(fpath) as f_in:
                    d = json.loads(f_in.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'COMPLETED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    with zopen(fpath, "wt") as f_out:
                        f_out.write(json.dumps(d, ensure_ascii=False))

            return True

        except LockedWorkflowError as e:
            l_logger.log(logging.DEBUG, traceback.format_exc())
            l_logger.log(logging.WARNING,
                         "Firework {} reached final state {} but couldn't complete the update of "
                         "the database. Reason: {}\nRefresh the WF to recover the result "
                         "(lpad admin refresh -i {}).".format(
                             self.fw_id, final_state, e, self.fw_id))
            return True

        except Exception:
            # problems while processing the results. high probability of malformed data.
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                                            '_details': None}},
                                exit=True)

            # my_spec may not exist yet if the failure happened early; in that
            # case the undecorated action is used
            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except Exception:
                traceback.print_exc()

            if lp:
                try:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                except LockedWorkflowError as e:
                    l_logger.log(logging.DEBUG, traceback.format_exc())
                    # three placeholders: fw_id, reason, fw_id
                    l_logger.log(logging.WARNING,
                                 "Firework {} fizzled but couldn't complete the update of the database."
                                 " Reason: {}\nRefresh the WF to recover the result "
                                 "(lpad admin refresh -i {}).".format(
                                     self.fw_id, e, self.fw_id))
                    return True
            else:
                fpath = zpath("FW_offline.json")
                with zopen(fpath) as f_in:
                    d = json.loads(f_in.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    with zopen(fpath, "wt") as f_out:
                        f_out.write(json.dumps(d, ensure_ascii=False))

            return True
Exemple #43
0
def rapidfire(launchpad,
              fworker,
              qadapter,
              launch_dir='.',
              nlaunches=0,
              njobs_queue=0,
              njobs_block=500,
              sleep_time=None,
              reserve=False,
              strm_lvl='INFO',
              timeout=None,
              fill_mode=False):
    """
    Keep submitting jobs to the queue, grouping them into block directories.

    Any error raised while launching is logged through log_exception rather
    than propagated; only a missing launch_dir raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        qadapter (QueueAdapterBase)
        launch_dir (str): directory in which the block directories are written
        nlaunches (int): total number of launches wanted; "infinite" to keep looping,
            0 for a single round
        njobs_queue (int): no more jobs are submitted once this many are in the queue,
            0 for no limit
        njobs_block (int): a new block is started when the current one holds this many jobs
        sleep_time (int): secs to sleep between rounds (RAPIDFIRE_SLEEP_SECS when falsy)
        reserve (bool): whether to queue in reservation mode
        strm_lvl (str): level at which to stream log messages
        timeout (int): number of seconds after which to stop the rapidfire process
        fill_mode (bool): whether to submit jobs even when there is nothing to run (only in
            non-reservation mode)
    """

    sleep_time = sleep_time or RAPIDFIRE_SLEEP_SECS
    launch_dir = os.path.abspath(launch_dir)
    if nlaunches == 'infinite':
        nlaunches = -1
    else:
        nlaunches = int(nlaunches)
    l_logger = get_fw_logger('queue.launcher', l_dir=launchpad.logdir, stream_level=strm_lvl)

    # the launch directory has to be there already
    if not os.path.exists(launch_dir):
        raise ValueError('Desired launch directory {} does not exist!'.format(launch_dir))

    launched = 0
    began = datetime.now()

    def out_of_time():
        # truthy only when a timeout was given and it has elapsed
        return timeout and (datetime.now() - began).total_seconds() >= timeout

    try:
        l_logger.info('getting queue adapter')

        # reuse the most recent block unless told to always start a fresh one
        existing_blocks = glob.glob(os.path.join(launch_dir, 'block_*'))
        existing_blocks.sort(reverse=True)
        if not existing_blocks or ALWAYS_CREATE_NEW_BLOCK:
            block_dir = create_datestamp_dir(launch_dir, l_logger)
        else:
            block_dir = os.path.abspath(os.path.join(launch_dir, existing_blocks[0]))
            l_logger.info('Found previous block, using {}'.format(block_dir))

        while True:
            # one round of launches, starting from a fresh queue count
            queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)
            since_poll = 0  # launches since the last queue poll (QSTAT_FREQUENCY option)

            while True:
                # stop the round when the queue is full, nothing is left to
                # run (and we are not filling), or the timeout has passed
                if njobs_queue and queued >= njobs_queue:
                    break
                if not launchpad.run_exists(fworker) and not (fill_mode and not reserve):
                    break
                if out_of_time():
                    break

                l_logger.info('Launching a rocket!')

                # start a new block once the current one is full
                if _njobs_in_dir(block_dir) >= njobs_block:
                    l_logger.info('Block got bigger than {} jobs.'.format(njobs_block))
                    block_dir = create_datestamp_dir(launch_dir, l_logger)

                # submit one job
                outcome = launch_rocket_to_queue(launchpad, fworker, qadapter, block_dir,
                                                 reserve, strm_lvl, True, fill_mode)
                if outcome is None:
                    l_logger.info('No READY jobs detected...')
                    break
                if not outcome:
                    raise RuntimeError("Launch unsuccessful!")

                launched += 1
                if launched == nlaunches:
                    break

                # give the queue system time to register the new job
                l_logger.info('Sleeping for {} seconds...zzz...'.format(QUEUE_UPDATE_INTERVAL))
                time.sleep(QUEUE_UPDATE_INTERVAL)
                queued += 1
                since_poll += 1
                if since_poll % QSTAT_FREQUENCY == 0:
                    since_poll = 0
                    queued = _get_number_of_jobs_in_queue(qadapter, njobs_queue, l_logger)

            if launched == nlaunches or nlaunches == 0 or out_of_time():
                break

            l_logger.info('Finished a round of launches, sleeping for {} secs'.format(sleep_time))
            time.sleep(sleep_time)
            l_logger.info('Checking for Rockets to run...')

    except:
        log_exception(l_logger, 'Error with queue launcher rapid fire!')