Example #1
0
def rlaunch():
    """
    Command-line entry point that launches one or more Rockets.

    Parses the command line, builds the LaunchPad (skipped for an offline
    singleshot) and the FWorker, primes the hostname/IP lookups and then
    dispatches to ``rapidfire`` or ``launch_rocket`` depending on the chosen
    sub-command. Exits with a usage error when no sub-command is given.
    """

    m_description = 'This program launches one or more Rockets. A Rocket grabs a job from the central database and ' \
                    'runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot', help='launch a single Rocket')
    rapid_parser = subparsers.add_parser('rapidfire',
                                         help='launch multiple Rockets (loop until all FireWorks complete)')

    single_parser.add_argument('-f', '--fw_id', help='specific fw_id to run', default=None, type=int)
    single_parser.add_argument('--offline', help='run in offline mode (FW.json required)', action='store_true')

    rapid_parser.add_argument('--nlaunches', help='num_launches (int or "infinite"; default 0 is all jobs in DB)', default=0)
    rapid_parser.add_argument('--sleep', help='sleep time between loops (secs)', default=None, type=int)

    parser.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=LAUNCHPAD_LOC)
    parser.add_argument('-w', '--fworker_file', help='path to fworker file', default=FWORKER_LOC)
    parser.add_argument('-c', '--config_dir', help='path to a directory containing the config file (used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)

    parser.add_argument('--loglvl', help='level to print log messages', default='INFO')
    parser.add_argument('-s', '--silencer', help='shortcut to mute log messages', action='store_true')

    args = parser.parse_args()

    # On Python 3 argparse does not require a sub-command. Without this guard
    # a bare invocation would build the LaunchPad and then crash in the
    # singleshot branch because args.fw_id was never defined.
    if not args.command:
        parser.error('a command is required: singleshot or rapidfire')

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # fall back to the files in config_dir only when no explicit/default path is set
    if not args.launchpad_file and os.path.exists(os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir, 'my_launchpad.yaml')

    if not args.fworker_file and os.path.exists(os.path.join(args.config_dir, 'my_fworker.yaml')):
        args.fworker_file = os.path.join(args.config_dir, 'my_fworker.yaml')

    # -s overrides whatever --loglvl was given
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    if args.command == 'singleshot' and args.offline:
        # offline singleshot runs without a LaunchPad
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(args.launchpad_file) if args.launchpad_file else LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker, None, args.nlaunches, -1, args.sleep, args.loglvl)

    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
Example #2
0
def rlaunch():
    """
    Command-line entry point that launches one or more Rockets.

    Parses the command line, builds the LaunchPad (skipped for an offline
    singleshot) and the FWorker, primes the hostname/IP lookups and then
    dispatches to ``rapidfire`` or ``launch_rocket`` depending on the chosen
    sub-command. Exits with a usage error when no sub-command is given.
    """

    m_description = 'This program launches one or more Rockets. A Rocket grabs a job from the central database and ' \
                    'runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot', help='launch a single Rocket')
    rapid_parser = subparsers.add_parser('rapidfire',
                                         help='launch multiple Rockets (loop until all FireWorks complete)')

    single_parser.add_argument('-f', '--fw_id', help='specific fw_id to run', default=None, type=int)
    single_parser.add_argument('--offline', help='run in offline mode (FW.json required)', action='store_true')

    rapid_parser.add_argument('--nlaunches', help='num_launches (int or "infinite"; default 0 is all jobs in DB)', default=0)
    rapid_parser.add_argument('--sleep', help='sleep time between loops (secs)', default=None, type=int)

    parser.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=LAUNCHPAD_LOC)
    parser.add_argument('-w', '--fworker_file', help='path to fworker file', default=FWORKER_LOC)
    parser.add_argument('-c', '--config_dir', help='path to a directory containing the config file (used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)

    parser.add_argument('--loglvl', help='level to print log messages', default='INFO')
    parser.add_argument('-s', '--silencer', help='shortcut to mute log messages', action='store_true')

    args = parser.parse_args()

    # On Python 3 argparse does not require a sub-command. Without this guard
    # a bare invocation would build the LaunchPad and then crash in the
    # singleshot branch because args.fw_id was never defined.
    if not args.command:
        parser.error('a command is required: singleshot or rapidfire')

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # fall back to the files in config_dir only when no explicit/default path is set
    if not args.launchpad_file and os.path.exists(os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir, 'my_launchpad.yaml')

    if not args.fworker_file and os.path.exists(os.path.join(args.config_dir, 'my_fworker.yaml')):
        args.fworker_file = os.path.join(args.config_dir, 'my_fworker.yaml')

    # -s overrides whatever --loglvl was given
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    if args.command == 'singleshot' and args.offline:
        # offline singleshot runs without a LaunchPad
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(args.launchpad_file) if args.launchpad_file else LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker, None, args.nlaunches, -1, args.sleep, args.loglvl)

    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
Example #3
0
    def __init__(self, state, launch_dir, fworker=None, host=None, ip=None,
                 trackers=None, action=None, state_history=None,
                 launch_id=None, fw_id=None):
        """
        Initialise a Launch, filling in defaults for every falsy optional
        argument.

        :param state: (str) the state of the Launch (e.g. RUNNING, COMPLETED);
        must be contained in Firework.STATE_RANKS
        :param launch_dir: (str) the directory where the Launch takes place
        :param fworker: (FWorker) The FireWorker running the Launch (a new
        FWorker() is created if falsy)
        :param host: (str) the hostname where the launch took place (taken
        from get_my_host() if falsy)
        :param ip: (str) the IP address where the launch took place (taken
        from get_my_ip() if falsy)
        :param trackers: ([Tracker]) File Trackers for this Launch (empty list
        if falsy)
        :param action: (FWAction) the output of the Launch (None if falsy)
        :param state_history: ([dict]) a history of all states of the Launch
        and when they occurred (empty list if falsy)
        :param launch_id: (int) launch_id set by the LaunchPad
        :param fw_id: (int) id of the Firework this Launch is running
        :raises ValueError: if state is not in Firework.STATE_RANKS
        """

        known_states = Firework.STATE_RANKS
        if state not in known_states:
            raise ValueError("Invalid launch state: {}".format(state))

        self.launch_dir = launch_dir
        self.fworker = fworker if fworker else FWorker()
        self.host = host if host else get_my_host()
        self.ip = ip if ip else get_my_ip()
        self.trackers = trackers or []
        self.action = action or None
        # NOTE(review): state_history is deliberately assigned before state;
        # if `state` is a property whose setter reads the history, the order
        # matters - confirm before reordering.
        self.state_history = state_history or []
        self.state = state
        self.launch_id = launch_id
        self.fw_id = fw_id
Example #4
0
 def __init__(self, state, launch_dir, fworker=None, host=None, ip=None, trackers=None,
              action=None, state_history=None, launch_id=None, fw_id=None):
     """
     Build a Launch; falsy optional arguments are replaced by defaults.

     Args:
         state (str): the state of the Launch (e.g. RUNNING, COMPLETED); must be
             contained in Firework.STATE_RANKS
         launch_dir (str): the directory where the Launch takes place
         fworker (FWorker): The FireWorker running the Launch (FWorker() if falsy)
         host (str): the hostname where the launch took place (get_my_host() if falsy)
         ip (str): the IP address where the launch took place (get_my_ip() if falsy)
         trackers ([Tracker]): File Trackers for this Launch (empty list if falsy)
         action (FWAction): the output of the Launch (None if falsy)
         state_history ([dict]): a history of all states of the Launch and when they
             occurred (empty list if falsy)
         launch_id (int): launch_id set by the LaunchPad
         fw_id (int): id of the Firework this Launch is running

     Raises:
         ValueError: if state is not in Firework.STATE_RANKS
     """
     allowed = Firework.STATE_RANKS
     if state not in allowed:
         raise ValueError("Invalid launch state: {}".format(state))

     self.launch_dir = launch_dir
     self.fworker = fworker if fworker else FWorker()
     self.host = host if host else get_my_host()
     self.ip = ip if ip else get_my_ip()
     self.trackers = trackers or []
     self.action = action or None
     # NOTE(review): state_history is set before state on purpose; if `state` is a
     # property whose setter touches the history the order matters - confirm.
     self.state_history = state_history or []
     self.state = state
     self.launch_id = launch_id
     self.fw_id = fw_id
Example #5
0
    def __init__(self, state, launch_dir, fworker=None, host=None, ip=None,
                 action=None, state_history=None, launch_id=None, fw_id=None):
        """
        Initialise a Launch record.

        :param state: (str) the state of the Launch (e.g. RUNNING, COMPLETED); must be in
            FireWork.STATE_RANKS
        :param launch_dir: (str) the directory where the Launch takes place
        :param fworker: (FWorker) The FireWorker running the Launch (stored as given)
        :param host: (str) the hostname where the launch took place (get_my_host() if falsy)
        :param ip: (str) the IP address where the launch took place (get_my_ip() if falsy)
        :param action: (FWAction) the output of the Launch (None if falsy)
        :param state_history: (list) a history of all states of the Launch and when they
            occurred (empty list if falsy)
        :param launch_id: (int) launch_id set by the LaunchPad
        :param fw_id: (int) id of the FireWork this Launch is running
        :raises ValueError: if state is not in FireWork.STATE_RANKS
        """

        valid_states = FireWork.STATE_RANKS
        if state not in valid_states:
            raise ValueError("Invalid launch state: {}".format(state))

        self.fworker = fworker
        self.fw_id = fw_id
        self.host = host or get_my_host()
        self.ip = ip or get_my_ip()
        self.launch_dir = launch_dir
        self.action = action or None
        # NOTE(review): state_history is assigned before state on purpose; if `state`
        # is a property whose setter reads the history the order matters - confirm.
        self.state_history = state_history or []
        self.state = state
        self.launch_id = launch_id
Example #6
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Sets up a shared DataServer, starts the rapidfire sub-processes and a
    pinging thread, then blocks until every sub-process has finished. The
    pinging thread and the DataServer are shut down even if starting or
    joining the sub-processes raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution); modified
            in place when exclude_current_node removes the current host
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; check it first so the membership
        # test cannot raise TypeError when no node file was supplied.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            # NOTE(review): list.remove drops only the first occurrence; if the
            # node list repeats the host (e.g. once per processor) the other
            # entries stay - confirm this is intended.
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    try:
        manager = Manager()
        running_ids_dict = manager.dict()
        firing_state_dict = manager.dict()

        # launch rapidfire processes
        processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                                  sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict,
                                  local_redirect=local_redirect, firing_state_dict=firing_state_dict)
        FWData().Running_IDs = running_ids_dict
        FWData().FiringState = firing_state_dict

        # start pinging service
        ping_stop = threading.Event()
        ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
        ping_thread.start()

        try:
            # wait for completion
            for p in processes:
                p.join()
        finally:
            # always stop the pinging thread, otherwise it keeps running after an error
            ping_stop.set()
            ping_thread.join()
    finally:
        ds.shutdown()
Example #7
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Sets up a shared DataServer, starts the rapidfire sub-processes and a
    pinging thread, then blocks until every sub-process has finished. The
    pinging thread and the DataServer are shut down even if starting or
    joining the sub-processes raises.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution); modified
            in place when exclude_current_node removes the current host
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; check it first so the membership
        # test cannot raise TypeError when no node file was supplied.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            # NOTE(review): list.remove drops only the first occurrence; if the
            # node list repeats the host (e.g. once per processor) the other
            # entries stay - confirm this is intended.
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    try:
        manager = Manager()
        running_ids_dict = manager.dict()

        # launch rapidfire processes
        processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                                  sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
        FWData().Running_IDs = running_ids_dict

        # start pinging service
        ping_stop = threading.Event()
        ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
        ping_thread.start()

        try:
            # wait for completion
            for p in processes:
                p.join()
        finally:
            # always stop the pinging thread, otherwise it keeps running after an error
            ping_stop.set()
            ping_thread.join()
    finally:
        ds.shutdown()
Example #8
0
 def __init__(self, state, launch_dir, fworker=None, host=None, ip=None, trackers=None,
              action=None, state_history=None, launch_id=None, fw_id=None):
     """
     Create a Launch, substituting defaults for falsy optional arguments.

     Args:
         state (str): the state of the Launch (e.g. RUNNING, COMPLETED); must be
             contained in Firework.STATE_RANKS
         launch_dir (str): the directory where the Launch takes place
         fworker (FWorker): The FireWorker running the Launch (FWorker() if falsy)
         host (str): the hostname where the launch took place (get_my_host() if falsy)
         ip (str): the IP address where the launch took place (get_my_ip() if falsy)
         trackers ([Tracker]): File Trackers for this Launch (empty list if falsy)
         action (FWAction): the output of the Launch (None if falsy)
         state_history ([dict]): a history of all states of the Launch and when they
             occurred (empty list if falsy)
         launch_id (int): launch_id set by the LaunchPad
         fw_id (int): id of the Firework this Launch is running

     Raises:
         ValueError: if state is not in Firework.STATE_RANKS
     """
     if state not in Firework.STATE_RANKS:
         message = "Invalid launch state: {}".format(state)
         raise ValueError(message)

     self.launch_dir = launch_dir
     self.fworker = fworker if fworker else FWorker()
     self.host = host if host else get_my_host()
     self.ip = ip if ip else get_my_ip()
     self.trackers = trackers or []
     self.action = action or None
     # NOTE(review): state_history is set before state on purpose; if `state` is a
     # property whose setter touches the history the order matters - confirm.
     self.state_history = state_history or []
     self.state = state
     self.launch_id = launch_id
     self.fw_id = fw_id
def arlaunch():
    """
    Command-line entry point for launching Rockets with an AiiDAFWorker.

    Parses the command line, builds the LaunchPad (skipped for an offline
    singleshot) and an AiiDAFWorker from the required fworker file, primes the
    hostname/IP lookups and dispatches to ``rapidfire``, ``launch_multiprocess``
    or ``launch_rocket`` depending on the chosen sub-command. Exits with a
    usage error when no sub-command is given.
    """
    m_description = 'This program launches one or more Rockets. A Rocket retrieves a job from the ' \
                    'central database and runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot',
                                          help='launch a single Rocket')
    rapid_parser = subparsers.add_parser(
        'rapidfire',
        help='launch multiple Rockets (loop until all FireWorks complete)')
    multi_parser = subparsers.add_parser(
        'multi', help='launches multiple Rockets simultaneously')

    single_parser.add_argument('-f',
                               '--fw_id',
                               help='specific fw_id to run',
                               default=None,
                               type=int)
    single_parser.add_argument('--offline',
                               help='run in offline mode (FW.json required)',
                               action='store_true')
    single_parser.add_argument('--pdb',
                               help='shortcut to invoke debugger on error',
                               action='store_true')

    rapid_parser.add_argument('--nlaunches',
                              help='num_launches (int or "infinite"; '
                              'default 0 is all jobs in DB)',
                              default=0)
    rapid_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None,
        type=int)
    rapid_parser.add_argument(
        '--max_loops',
        help='after this many sleep loops, quit even in '
        'infinite nlaunches mode (default -1 is infinite loops)',
        default=-1,
        type=int)
    rapid_parser.add_argument('--sleep',
                              help='sleep time between loops (secs)',
                              default=None,
                              type=int)
    rapid_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    multi_parser.add_argument('num_jobs',
                              help='the number of jobs to run in parallel',
                              type=int)
    multi_parser.add_argument('--nlaunches',
                              help='number of FireWorks to run in series per '
                              'parallel job (int or "infinite"; default 0 is '
                              'all jobs in DB)',
                              default=0)
    # trailing space added: the concatenated help text used to read "mode(secs)"
    multi_parser.add_argument(
        '--sleep',
        help='sleep time between loops in infinite launch mode '
        '(secs)',
        default=None,
        type=int)
    multi_parser.add_argument(
        '--timeout',
        help='timeout (secs) after which to quit (default None)',
        default=None,
        type=int)
    multi_parser.add_argument(
        '--nodefile',
        help='nodefile name or environment variable name '
        'containing the node file name (for populating'
        ' FWData only)',
        default=None,
        type=str)
    multi_parser.add_argument(
        '--ppn',
        help='processors per node (for populating FWData only)',
        default=1,
        type=int)
    # trailing space added: the concatenated help text used to read "nodeas"
    multi_parser.add_argument('--exclude_current_node',
                              help="Don't use the script launching node "
                              "as compute node",
                              action="store_true")
    multi_parser.add_argument(
        '--local_redirect',
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    parser.add_argument('-l',
                        '--launchpad_file',
                        help='path to launchpad file')
    parser.add_argument('-w',
                        '--fworker_file',
                        required=True,
                        help='path to fworker file')
    parser.add_argument('-c',
                        '--config_dir',
                        help='path to a directory containing the config file '
                        '(used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)

    parser.add_argument('--loglvl',
                        help='level to print log messages',
                        default='INFO')
    parser.add_argument('-s',
                        '--silencer',
                        help='shortcut to mute log messages',
                        action='store_true')

    try:
        import argcomplete
        argcomplete.autocomplete(parser)
        # This supports bash autocompletion. To enable this, pip install
        # argcomplete, activate global completion, or add
        #      eval "$(register-python-argcomplete rlaunch)"
        # into your .bash_profile or .bashrc
    except ImportError:
        pass

    args = parser.parse_args()

    # On Python 3 argparse does not require a sub-command. Without this guard
    # a bare invocation would build the LaunchPad and then crash in the
    # singleshot branch because args.fw_id / args.pdb were never defined.
    if not args.command:
        parser.error('a command is required: singleshot, rapidfire or multi')

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # launchpad file precedence: -l, then config_dir/my_launchpad.yaml, then LAUNCHPAD_LOC
    if not args.launchpad_file and os.path.exists(
            os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir,
                                           'my_launchpad.yaml')
    elif not args.launchpad_file:
        args.launchpad_file = LAUNCHPAD_LOC

    # -s overrides whatever --loglvl was given
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    if args.command == 'singleshot' and args.offline:
        # offline singleshot runs without a LaunchPad
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(
            args.launchpad_file) if args.launchpad_file else LaunchPad(
                strm_lvl=args.loglvl)

    fworker = AiiDAFWorker.from_file(args.fworker_file)

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad,
                  fworker=fworker,
                  m_dir=None,
                  nlaunches=args.nlaunches,
                  max_loops=args.max_loops,
                  sleep_time=args.sleep,
                  strm_lvl=args.loglvl,
                  timeout=args.timeout,
                  local_redirect=args.local_redirect)
    elif args.command == 'multi':
        total_node_list = None
        if args.nodefile:
            # --nodefile may name an environment variable holding the real path
            if args.nodefile in os.environ:
                args.nodefile = os.environ[args.nodefile]
            with open(args.nodefile, 'r') as fhandle:
                total_node_list = [line.strip() for line in fhandle]
        launch_multiprocess(launchpad,
                            fworker,
                            args.loglvl,
                            args.nlaunches,
                            args.num_jobs,
                            args.sleep,
                            total_node_list,
                            args.ppn,
                            timeout=args.timeout,
                            exclude_current_node=args.exclude_current_node,
                            local_redirect=args.local_redirect)
    else:
        launch_rocket(launchpad,
                      fworker,
                      args.fw_id,
                      args.loglvl,
                      pdb_on_exception=args.pdb)
Example #10
0
def rlaunch():
    """
    Command-line entry point that launches one or more Rockets.

    Parses the command line, builds the LaunchPad (skipped for an offline
    singleshot) and the FWorker, primes the hostname/IP lookups and dispatches
    to ``rapidfire``, ``launch_multiprocess`` or ``launch_rocket`` depending on
    the chosen sub-command. Exits with a usage error when no sub-command is
    given.
    """

    m_description = 'This program launches one or more Rockets. A Rocket grabs a job from the ' \
                    'central database and runs it. The "single-shot" option launches a single Rocket, ' \
                    'whereas the "rapidfire" option loops until all FireWorks are completed.'

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help='command', dest='command')
    single_parser = subparsers.add_parser('singleshot', help='launch a single Rocket')
    rapid_parser = subparsers.add_parser('rapidfire',
                                         help='launch multiple Rockets (loop until all FireWorks complete)')
    multi_parser = subparsers.add_parser('multi',
                                         help='launches multiple Rockets simultaneously')

    single_parser.add_argument('-f', '--fw_id', help='specific fw_id to run', default=None, type=int)
    single_parser.add_argument('--offline', help='run in offline mode (FW.json required)', action='store_true')

    rapid_parser.add_argument('--nlaunches', help='num_launches (int or "infinite"; '
                                                  'default 0 is all jobs in DB)', default=0)
    rapid_parser.add_argument('--timeout', help='timeout (secs) after which to quit (default None)',
                              default=None, type=int)
    rapid_parser.add_argument('--max_loops', help='after this many sleep loops, quit even in '
                                                  'infinite nlaunches mode (default -1 is infinite loops)',
                              default=-1, type=int)
    rapid_parser.add_argument('--sleep', help='sleep time between loops (secs)', default=None,
                              type=int)

    multi_parser.add_argument('num_jobs', help='the number of jobs to run in parallel', type=int)
    multi_parser.add_argument('--nlaunches', help='number of FireWorks to run in series per '
                                                  'parallel job (int or "infinite"; default 0 is '
                                                  'all jobs in DB)',
                              default=0)
    # trailing space added: the concatenated help text used to read "mode(secs)"
    multi_parser.add_argument('--sleep', help='sleep time between loops in infinite launch mode '
                                              '(secs)',
                              default=None, type=int)
    multi_parser.add_argument('--timeout', help='timeout (secs) after which to quit (default None)',
                              default=None, type=int)
    multi_parser.add_argument('--nodefile', help='nodefile name or environment variable name '
                                                 'containing the node file name (for populating'
                                                 ' FWData only)',
                              default=None, type=str)
    multi_parser.add_argument('--ppn', help='processors per node (for populating FWData only)',
                              default=1, type=int)
    # trailing space added: the concatenated help text used to read "nodeas"
    multi_parser.add_argument('--exclude_current_node', help="Don't use the script launching node "
                                                             "as compute node",
                              action="store_true")

    parser.add_argument('-l', '--launchpad_file', help='path to launchpad file', default=LAUNCHPAD_LOC)
    parser.add_argument('-w', '--fworker_file', help='path to fworker file', default=FWORKER_LOC)
    parser.add_argument('-c', '--config_dir', help='path to a directory containing the config file '
                                                   '(used if -l, -w unspecified)',
                        default=CONFIG_FILE_DIR)

    parser.add_argument('--loglvl', help='level to print log messages', default='INFO')
    parser.add_argument('-s', '--silencer', help='shortcut to mute log messages', action='store_true')

    args = parser.parse_args()

    # On Python 3 argparse does not require a sub-command. Without this guard
    # a bare invocation would build the LaunchPad and then crash in the
    # singleshot branch because args.fw_id was never defined.
    if not args.command:
        parser.error('a command is required: singleshot, rapidfire or multi')

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # fall back to the files in config_dir only when no explicit/default path is set
    if not args.launchpad_file and os.path.exists(os.path.join(args.config_dir, 'my_launchpad.yaml')):
        args.launchpad_file = os.path.join(args.config_dir, 'my_launchpad.yaml')

    if not args.fworker_file and os.path.exists(os.path.join(args.config_dir, 'my_fworker.yaml')):
        args.fworker_file = os.path.join(args.config_dir, 'my_fworker.yaml')

    # -s overrides whatever --loglvl was given
    args.loglvl = 'CRITICAL' if args.silencer else args.loglvl

    if args.command == 'singleshot' and args.offline:
        # offline singleshot runs without a LaunchPad
        launchpad = None
    else:
        launchpad = LaunchPad.from_file(args.launchpad_file) if args.launchpad_file else LaunchPad(
            strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == 'rapidfire':
        rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=args.nlaunches,
                  max_loops=args.max_loops, sleep_time=args.sleep, strm_lvl=args.loglvl,
                  timeout=args.timeout)
    elif args.command == 'multi':
        total_node_list = None
        if args.nodefile:
            # --nodefile may name an environment variable holding the real path
            if args.nodefile in os.environ:
                args.nodefile = os.environ[args.nodefile]
            with open(args.nodefile, 'r') as f:
                total_node_list = [line.strip() for line in f]

        launch_multiprocess(launchpad, fworker, args.loglvl, args.nlaunches, args.num_jobs,
                            args.sleep, total_node_list, args.ppn, timeout=args.timeout,
                            exclude_current_node=args.exclude_current_node)
    else:
        launch_rocket(launchpad, fworker, args.fw_id, args.loglvl)
Example #11
0
def rlaunch():
    """Command-line entry point that launches one or more Rockets.

    Builds an argument parser with the ``singleshot``, ``rapidfire`` and
    ``multi`` sub-commands, parses the process command line, resolves which
    launchpad and fworker files to use, and dispatches to ``launch_rocket``,
    ``rapidfire`` or ``launch_multiprocess`` respectively.

    File resolution order for both the launchpad and the fworker file:
    the explicit ``-l`` / ``-w`` option, then ``my_launchpad.yaml`` /
    ``my_fworker.yaml`` inside ``--config_dir`` if that file exists, then
    the module-level ``LAUNCHPAD_LOC`` / ``FWORKER_LOC`` value.

    Takes no parameters and returns nothing. If no sub-command is given,
    the parser prints a usage message and exits (``SystemExit``) instead of
    failing later on attributes that only the sub-parsers define.
    """
    m_description = (
        "This program launches one or more Rockets. A Rocket retrieves a job from the "
        'central database and runs it. The "single-shot" option launches a single Rocket, '
        'whereas the "rapidfire" option loops until all FireWorks are completed.'
    )

    parser = ArgumentParser(description=m_description)
    subparsers = parser.add_subparsers(help="command", dest="command")
    single_parser = subparsers.add_parser(
        "singleshot", help="launch a single Rocket")
    rapid_parser = subparsers.add_parser(
        "rapidfire",
        help="launch multiple Rockets (loop until all FireWorks complete)")
    multi_parser = subparsers.add_parser(
        "multi", help="launches multiple Rockets simultaneously")

    # --- singleshot options ---
    single_parser.add_argument(
        "-f", "--fw_id", help="specific fw_id to run", default=None, type=int)
    single_parser.add_argument(
        "--offline",
        help="run in offline mode (FW.json required)",
        action="store_true")
    single_parser.add_argument(
        "--pdb",
        help="shortcut to invoke debugger on error",
        action="store_true")

    # --- rapidfire options ---
    # --nlaunches deliberately has no ``type``: it accepts an int or the
    # literal string "infinite" and is passed through unchanged.
    rapid_parser.add_argument(
        "--nlaunches",
        help='num_launches (int or "infinite"; default 0 is all jobs in DB)',
        default=0)
    rapid_parser.add_argument(
        "--timeout",
        help="timeout (secs) after which to quit (default None)",
        default=None,
        type=int)
    rapid_parser.add_argument(
        "--max_loops",
        help="after this many sleep loops, quit even in infinite nlaunches mode (default -1 is infinite loops)",
        default=-1,
        type=int)
    rapid_parser.add_argument(
        "--sleep",
        help="sleep time between loops (secs)",
        default=None,
        type=int)
    rapid_parser.add_argument(
        "--local_redirect",
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # --- multi options ---
    multi_parser.add_argument(
        "num_jobs", help="the number of jobs to run in parallel", type=int)
    multi_parser.add_argument(
        "--nlaunches",
        help="number of FireWorks to run in series per "
        'parallel job (int or "infinite"; default 0 is '
        "all jobs in DB)",
        default=0)
    multi_parser.add_argument(
        "--sleep",
        help="sleep time between loops in infinite launch mode (secs)",
        default=None,
        type=int)
    multi_parser.add_argument(
        "--timeout",
        help="timeout (secs) after which to quit (default None)",
        default=None,
        type=int)
    multi_parser.add_argument(
        "--nodefile",
        help="nodefile name or environment variable name "
        "containing the node file name (for populating"
        " FWData only)",
        default=None,
        type=str)
    multi_parser.add_argument(
        "--ppn",
        help="processors per node (for populating FWData only)",
        default=1,
        type=int)
    multi_parser.add_argument(
        "--exclude_current_node",
        help="Don't use the script launching node as compute node",
        action="store_true")
    multi_parser.add_argument(
        "--local_redirect",
        help="Redirect stdout and stderr to the launch directory",
        action="store_true")

    # --- options shared by every sub-command ---
    parser.add_argument(
        "-l", "--launchpad_file", help="path to launchpad file")
    parser.add_argument("-w", "--fworker_file", help="path to fworker file")
    parser.add_argument(
        "-c",
        "--config_dir",
        help="path to a directory containing the config file (used if -l, -w unspecified)",
        default=CONFIG_FILE_DIR)
    parser.add_argument(
        "--loglvl", help="level to print log messages", default="INFO")
    parser.add_argument(
        "-s",
        "--silencer",
        help="shortcut to mute log messages",
        action="store_true")

    try:
        import argcomplete

        argcomplete.autocomplete(parser)
        # This supports bash autocompletion. To enable this, pip install
        # argcomplete, activate global completion, or add
        #      eval "$(register-python-argcomplete rlaunch)"
        # into your .bash_profile or .bashrc
    except ImportError:
        # argcomplete is optional; without it the CLI simply has no
        # tab completion.
        pass

    args = parser.parse_args()

    # Sub-commands are optional by default in Python 3 argparse. Without
    # this check a bare invocation would fall into the singleshot branch
    # below and raise AttributeError on args.fw_id / args.pdb, which only
    # the singleshot sub-parser defines.
    if args.command is None:
        parser.error(
            "a command is required (choose from 'singleshot', 'rapidfire', 'multi')")

    signal.signal(signal.SIGINT, handle_interrupt)  # graceful exit on ^C

    # Fall back to the config directory, then to the module-level default,
    # when no launchpad file was given explicitly.
    if not args.launchpad_file:
        config_launchpad = os.path.join(args.config_dir, "my_launchpad.yaml")
        if os.path.exists(config_launchpad):
            args.launchpad_file = config_launchpad
        else:
            args.launchpad_file = LAUNCHPAD_LOC

    # Same fallback chain for the fworker file.
    if not args.fworker_file:
        config_fworker = os.path.join(args.config_dir, "my_fworker.yaml")
        if os.path.exists(config_fworker):
            args.fworker_file = config_fworker
        else:
            args.fworker_file = FWORKER_LOC

    # -s overrides whatever --loglvl was requested.
    if args.silencer:
        args.loglvl = "CRITICAL"

    if args.command == "singleshot" and args.offline:
        # Offline singleshot runs without a launchpad object.
        launchpad = None
    elif args.launchpad_file:
        launchpad = LaunchPad.from_file(args.launchpad_file)
    else:
        launchpad = LaunchPad(strm_lvl=args.loglvl)

    if args.fworker_file:
        fworker = FWorker.from_file(args.fworker_file)
    else:
        fworker = FWorker()

    # prime addr lookups
    _log = get_fw_logger("rlaunch", stream_level="INFO")
    _log.info("Hostname/IP lookup (this will take a few seconds)")
    get_my_host()
    get_my_ip()

    if args.command == "rapidfire":
        rapidfire(
            launchpad,
            fworker=fworker,
            m_dir=None,
            nlaunches=args.nlaunches,
            max_loops=args.max_loops,
            sleep_time=args.sleep,
            strm_lvl=args.loglvl,
            timeout=args.timeout,
            local_redirect=args.local_redirect,
        )
    elif args.command == "multi":
        total_node_list = None
        if args.nodefile:
            # --nodefile may name an environment variable that holds the
            # real path; if so, dereference it.
            if args.nodefile in os.environ:
                args.nodefile = os.environ[args.nodefile]
            with open(args.nodefile) as f:
                total_node_list = [line.strip() for line in f]
        launch_multiprocess(
            launchpad,
            fworker,
            args.loglvl,
            args.nlaunches,
            args.num_jobs,
            args.sleep,
            total_node_list,
            args.ppn,
            timeout=args.timeout,
            exclude_current_node=args.exclude_current_node,
            local_redirect=args.local_redirect,
        )
    else:
        # Only "singleshot" can reach this branch: argparse rejects unknown
        # commands and a missing command was rejected above.
        launch_rocket(
            launchpad,
            fworker,
            args.fw_id,
            args.loglvl,
            pdb_on_exception=args.pdb,
        )