コード例 #1
0
ファイル: multi_launcher.py プロジェクト: xhqu1981/fireworks
def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    Args:
        port (int): Listening port number of the DataServer
        stop_event (Thread.Event): stop event
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    fd = FWData()

    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in fd.Running_IDs.items():
            if lid:
                try:
                    os.kill(pid, 0)  # throws OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:
                    fd.Running_IDs[pid] = None
                    fd.FiringState[pid] = False
                    pass  # means this process is dead!

        stop_event.wait(PING_TIME_SECS)
コード例 #2
0
ファイル: qchem_task.py プロジェクト: xhqu1981/rubicon
 def get_qchem_cmd(cls, qcinp, mol):
     physical_nproc_map = {"edison": 24,
                           "cori": 32,
                           "matgen": 16,
                           "LaptopQu.local": 2}
     numa_num_map = {"edison": 2,
                     "cori": 2,
                     "matgen": 2,
                     "LaptopQu.local": 1}
     hostname = cls.get_hostname()
     if hostname in physical_nproc_map:
         physical_nproc = physical_nproc_map[hostname]
     elif hostname == 'vesta':
         return ALCF_Utils.get_alcf_qchem_cmd(qcinp)
     else:
         return [["qchem"]] * 3
     natoms = len(mol)
     fw_data = FWData()
     if fw_data.MULTIPROCESSING and fw_data.SUB_NPROCS is not None:
         physical_nproc = int(fw_data.SUB_NPROCS)
     nproc = min(physical_nproc, natoms)
     numa_num = numa_num_map[hostname]
     nproc = max((nproc // numa_num) * numa_num, 1)
     half_nproc = min(physical_nproc // 2, natoms)
     half_nproc = max((half_nproc // numa_num) * numa_num, 1)
     qc_exe = shlex.split("qchem -np {}".format(nproc))
     half_cpus_cmd = shlex.split("qchem -np {}".format(half_nproc))
     openmp_cmd = shlex.split("qchem -nt {}".format(nproc))
     return qc_exe, half_cpus_cmd, openmp_cmd
コード例 #3
0
ファイル: multi_launcher.py プロジェクト: xhqu1981/fireworks
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        macro_sleep_time (int): secs to sleep between sub job resubmit
        local_redirect (bool): redirect standard input and output to local file
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
              max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5) # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items() if is_firing]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(macro_sleep_time))
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')
コード例 #4
0
def stop_backgrounds(ping_stop, btask_stops):
    fd = FWData()
    if fd.MULTIPROCESSING:
        fd.Running_IDs[os.getpid()] = None
    elif ping_stop:
        ping_stop.set()

    for b in btask_stops:
        b.set()
コード例 #5
0
ファイル: rocket.py プロジェクト: jakirkham/fireworks
def stop_backgrounds(ping_stop, btask_stops):
    fd = FWData()
    if fd.MULTIPROCESSING:
        m = fd.DATASERVER
        m.Running_IDs()[os.getpid()] = None
    else:
        ping_stop.set()

    for b in btask_stops:
        b.set()
コード例 #6
0
ファイル: fw_utilities.py プロジェクト: flxb/fireworks
def log_multi(m_logger, msg, log_lvl='info'):
    """
    :param m_logger: (logger) The logger object
    :param msg: (str) a String to log
    :param log_lvl: (str) The level to log at
    """
    _log_fnc = getattr(m_logger, log_lvl.lower())
    if FWData().MULTIPROCESSING:
        _log_fnc("{} : ({})".format(msg,
                                    multiprocessing.current_process().name))
    else:
        _log_fnc(msg)
コード例 #7
0
ファイル: multi_launcher.py プロジェクト: flxb/fireworks
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manage
    :param password: (str) security password to access the server
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    rapidfire(launchpad, fworker, None, nlaunches, -1, sleep, loglvl)
コード例 #8
0
ファイル: fw_utilities.py プロジェクト: tschaume/fireworks
def log_multi(m_logger, msg, log_lvl="info"):
    """
    Args:
        m_logger (logger): The logger object
        msg (str): a String to log
        log_lvl (str): The level to log at
    """
    _log_fnc = getattr(m_logger, log_lvl.lower())
    if FWData().MULTIPROCESSING:
        _log_fnc(f"{msg} : ({multiprocessing.current_process().name})")
    else:
        _log_fnc(msg)
コード例 #9
0
def start_ping_launch(launchpad: LaunchPad, launch_id: int) -> Union[Event, None]:
    fd = FWData()
    if fd.MULTIPROCESSING:
        if not launch_id:
            raise ValueError("Multiprocessing cannot be run in offline mode!")
        fd.Running_IDs[os.getpid()] = launch_id
        return None
    else:
        ping_stop = Event()
        ping_thread = Thread(target=ping_launch, args=(launchpad, launch_id, ping_stop, current_thread()))
        ping_thread.start()
        return ping_stop
コード例 #10
0
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmit sub job'.format(
                    sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(sleep_time))
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout)
        else:
            break
    log_multi(l_logger, 'Sub job finished')
コード例 #11
0
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout, running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
コード例 #12
0
ファイル: multi_launcher.py プロジェクト: tschaume/fireworks
def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    Args:
        port (int): Listening port number of the DataServer
        stop_event (Thread.Event): stop event
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    fd = FWData()

    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in fd.Running_IDs.items():
            if lid:
                try:
                    os.kill(pid, 0)  # throws OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:  # means this process is dead!
                    fd.Running_IDs[pid] = None

        stop_event.wait(PING_TIME_SECS)
コード例 #13
0
ファイル: rocket.py プロジェクト: jakirkham/fireworks
def start_ping_launch(launchpad, launch_id):
    fd = FWData()
    if fd.MULTIPROCESSING:
        if not launch_id:
            raise ValueError("Multiprocessing cannot be run in offline mode!")
        m = fd.DATASERVER
        m.Running_IDs()[os.getpid()] = launch_id
        return None
    else:
        ping_stop = threading.Event()
        ping_thread = threading.Thread(target=ping_launch,
                                       args=(launchpad, launch_id, ping_stop, threading.currentThread()))
        ping_thread.start()
        return ping_stop
コード例 #14
0
 def _get_vasp_cmd_in_job_packing(fw_data, fw_env, mpi_cmd):
     tasks_per_node_flag = {
         "srun": "--ntasks-per-node",
         "mpirun": "--npernode",
         "aprun": "-N"
     }
     nodelist_flag = {
         "srun": "--nodelist",
         "mpirun": "--host",
         "aprun": "-L"
     }
     ranks_num_flag = {"srun": "--ntasks", "mpirun": "-n", "aprun": "-n"}
     nodes_spec = {
         "srun": "--nodes {}".format(len(fw_data.NODE_LIST)),
         "mpirun": "",
         "aprun": ""
     }
     verbose_flag = {"srun": "-v", "mpirun": "", "aprun": ""}
     mpirun = mpi_cmd.split()[0]
     fw_data = FWData()
     #  Don't honor the SLURM_NTASKS in case of job packing, Because SLURM_NTASKS is referring
     #  to total number of processes of the parent job
     sub_nproc = fw_data.SUB_NPROCS
     vasp_cmds = [
         fw_env.get("vasp_cmd", "vasp"),
         fw_env.get("gvasp_cmd", "gvasp")
     ]
     vasp_exes = [
         shlex.split(
             '{mpi_cmd} {verbose_flag} {nodes_spec} {ranks_flag} {nproc} {tpn_flag} {tpn} '
             '{nl_flag} {nl} {vasp_cmd}'.format(
                 mpi_cmd=mpi_cmd,
                 verbose_flag=verbose_flag,
                 nodes_spec=nodes_spec[mpirun],
                 ranks_flag=ranks_num_flag[mpirun],
                 nproc=sub_nproc,
                 tpn_flag=tasks_per_node_flag[mpirun],
                 tpn=int(fw_data.SUB_NPROCS) / len(fw_data.NODE_LIST),
                 nl_flag=nodelist_flag[mpirun],
                 nl=','.join(fw_data.NODE_LIST),
                 vasp_cmd=vasp_cmd)) for vasp_cmd in vasp_cmds
     ]
     v_exe, gv_exe = vasp_exes
     return v_exe, gv_exe
コード例 #15
0
ファイル: rocket_launcher.py プロジェクト: xhqu1981/fireworks
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1, sleep_time=None,
              strm_lvl='INFO', timeout=None, local_redirect=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(), stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    while num_loops != max_loops and (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            else:
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        fd = FWData()
        if fd.MULTIPROCESSING:
            # sleeping time is not firing
            fd.FiringState = False
        time.sleep(sleep_time)
        if fd.MULTIPROCESSING:
            fd.FiringState = True
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
    os.chdir(curdir)
コード例 #16
0
 def set_nodelist_env(cls, ):
     fw_data = FWData()
     if fw_data.MULTIPROCESSING and fw_data.NODE_LIST is not None:
         nodelist = ",".join(fw_data.NODE_LIST)
         os.environ["QCNODE"] = nodelist
コード例 #17
0
ファイル: rocket.py プロジェクト: jakirkham/fireworks
    def run(self):
        """
        Run the rocket (check out a job from the database and execute it)
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['started_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                if not os.path.abspath(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    try:
                        os.rmdir(prev_dir)
                    except:
                        pass

            if m_fw.spec.get('_recover_launch', None):
                launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
                starting_task = launch_to_recover.action.stored_data.get('_exception', {}).get('_failed_task_n', 0)
                recover_launch_dir = launch_to_recover.launch_dir
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task, recover_launch_dir))
                if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                    if lp:
                        lp.log_message(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recover_launch_dir, launch_dir))
                    distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)

            else:
                starting_task = 0

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'.\
                    format(m_fw.fw_id, os.getcwd())
                lp.log_message(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            btask_stops = []
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the FireTasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                if lp:
                    lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.launchpad = self.launchpad
                    t.fw_id = m_fw.fw_id

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as e:
                        if lp:
                            lp.log_message(logging.WARNING, "Exception couldn't be serialized: %s " % e)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': m_task,
                                                     '_exception': {'_stacktrace': tb, '_details': exception_details,
                                                                    '_failed_task_n': t_counter}}, exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        lp.complete_launch(launch_id, m_action, 'FIZZLED')
                    else:
                        with open('FW_offline.json', 'r+') as f:
                            d = json.loads(f.read())
                            d['fwaction'] = m_action.to_dict()
                            d['state'] = 'FIZZLED'
                            f.seek(0)
                            f.write(json.dumps(d))
                            f.truncate()

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                lp.complete_launch(launch_id, m_action, 'COMPLETED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'COMPLETED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True

        except:
            # problems while processing the results. high probability of malformed data.
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                             '_details': None}}, exit=True)

            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except:
                traceback.print_exc()

            if lp:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True
コード例 #18
0
    def run(self):
        """
        Run the rocket (check out a job from the database and execute it)
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir,
                                             self.fw_id)
        else:  # offline mode
            m_fw = FireWork.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['started_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(
                self.fworker.query))
            return False

        if '_launch_dir' in m_fw.spec:
            prev_dir = launch_dir
            os.chdir(m_fw.spec['_launch_dir'])
            launch_dir = os.path.abspath(os.getcwd())

            if lp:
                lp._change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except:
                    pass

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        try:
            my_spec = dict(
                m_fw.spec)  # make a copy of spec, don't override original

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            btask_stops = []
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the FireTasks!
            for my_task in m_fw.tasks:
                m_action = my_task.run_task(my_spec)

                # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)

                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data[
                    'multiprocess_name'] = multiprocessing.current_process(
                    ).name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            if lp:
                lp.complete_launch(launch_id, m_action, 'COMPLETED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'COMPLETED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True

        except:
            stop_backgrounds(ping_stop, btask_stops)
            traceback.print_exc()
            try:
                m_action = FWAction(stored_data={
                    '_message': 'runtime error during task',
                    '_task': my_task.to_dict(),
                    '_exception': traceback.format_exc()
                },
                                    exit=True)
            except:
                m_action = FWAction(stored_data={
                    '_message': 'runtime error during task',
                    '_task': None,
                    '_exception': traceback.format_exc()
                },
                                    exit=True)
            if lp:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True
コード例 #19
0
    def run_task(self, fw_spec):

        # write a file containing the formula and task_type for somewhat
        # easier file system browsing
        self._write_formula_file(fw_spec)

        fw_env = fw_spec.get("_fw_env", {})

        if "mpi_cmd" in fw_env:
            mpi_cmd = fw_spec["_fw_env"]["mpi_cmd"]
        elif which("mpirun"):
            mpi_cmd = "mpirun"
        elif which("aprun"):
            mpi_cmd = "aprun"
        else:
            raise ValueError("No MPI command found!")

        # TODO: last two env vars, i.e. SGE and LoadLeveler, are untested
        env_vars = ['PBS_NP', 'SLURM_NTASKS', 'NSLOTS', 'LOADL_TOTAL_TASKS']
        nproc = None
        for env_var in env_vars:
            nproc = os.environ.get(env_var, None)
            if nproc is not None:
                break
        if nproc is None:
            raise ValueError(
                "None of the env vars {} found to set nproc!".format(env_vars))

        fw_data = FWData()
        if (not fw_data.MULTIPROCESSING) or (fw_data.NODE_LIST is None):
            if "srun" in mpi_cmd:
                mpi_cmd += " -v"
            v_exe = shlex.split('{} -n {} {}'.format(
                mpi_cmd, nproc, fw_env.get("vasp_cmd", "vasp")))
            gv_exe = shlex.split('{} -n {} {}'.format(
                mpi_cmd, nproc, fw_env.get("gvasp_cmd", "gvasp")))
        else:
            v_exe, gv_exe = self._get_vasp_cmd_in_job_packing(
                fw_data, fw_env, mpi_cmd)

        print('host:', os.environ['HOSTNAME'])

        stderr_file = "std_err.txt"
        for job in self.jobs:
            job.vasp_cmd = v_exe
            job.gamma_vasp_cmd = gv_exe
            job.stderr_file = stderr_file
        if v_exe[0] == "srun":
            scancel_terminator = ScancelJobStepTerminator(stderr_file)
            terminate_func = scancel_terminator.cancel_job_step
        else:
            terminate_func = None

        incar_errors = check_incar(fw_spec['task_type'])
        if incar_errors:
            raise ValueError(
                "Critical error: INCAR does not pass checks: {}".format(
                    incar_errors))

        logging.basicConfig(level=logging.DEBUG)

        c = Custodian(self.handlers,
                      self.jobs,
                      max_errors=self.max_errors,
                      gzipped_output=False,
                      validators=[VasprunXMLValidator()],
                      terminate_func=terminate_func)  # manual gzip
        custodian_out = c.run()

        if self.gzip_output:
            for f in os.listdir(os.getcwd()):
                if not f.lower().endswith("gz") and not f.endswith(
                        ".OU") and not f.endswith(".ER"):
                    with open(f, 'rb') as f_in, \
                            GzipFile('{}.gz'.format(f), 'wb') as f_out:
                        f_out.writelines(f_in)
                    os.remove(f)

        all_errors = set()
        for run in custodian_out:
            for correction in run['corrections']:
                all_errors.update(correction['errors'])

        stored_data = {'error_list': list(all_errors)}
        update_spec = {
            'prev_vasp_dir': os.getcwd(),
            'prev_task_type': fw_spec['task_type'],
            'mpsnl': fw_spec['mpsnl'],
            'snlgroup_id': fw_spec['snlgroup_id'],
            'run_tags': fw_spec['run_tags'],
            'parameters': fw_spec.get('parameters')
        }

        return FWAction(stored_data=stored_data, update_spec=update_spec)
コード例 #20
0
    def run(self, pdb_on_exception=False):
        """
        Run the rocket (check out a job from the database and execute it)

        Args:
            pdb_on_exception (bool): whether to invoke the debugger on
                a caught exception.  Default False.
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        logdir = lp.get_logdir() if lp else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                                 stream_level=ROCKET_STREAM_LOGLEVEL)

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['started_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        final_state = None
        ping_stop = None
        btask_stops = []

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                if not os.path.abspath(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    try:
                        os.rmdir(prev_dir)
                    except Exception:
                        pass

            recovery = m_fw.spec.get('_recovery', None)
            if recovery:
                recovery_dir = recovery.get('_prev_dir')
                recovery_mode = recovery.get('_mode')
                starting_task = recovery.get('_task_n')
                all_stored_data.update(recovery.get('_all_stored_data'))
                all_update_spec.update(recovery.get('_all_update_spec'))
                all_mod_spec.extend(recovery.get('_all_mod_spec'))
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                              recovery_dir))
                if recovery_mode == 'cp' and launch_dir != recovery_dir:
                    if lp:
                        l_logger.log(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                        launch_dir))
                    distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

            else:
                starting_task = 0
                files_in = m_fw.spec.get("_files_in", {})
                prev_files = m_fw.spec.get("_files_prev", {})
                for f in set(files_in.keys()).intersection(prev_files.keys()):
                    # We use zopen for the file objects for transparent handling
                    # of zipped files. shutil.copyfileobj does the actual copy
                    # in chunks that avoid memory issues.
                    with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                        shutil.copyfileobj(fin, fout)

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'. \
                    format(m_fw.fw_id, os.getcwd())
                l_logger.log(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the Firetasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                checkpoint = {'_task_n': t_counter,
                              '_all_stored_data': all_stored_data,
                              '_all_update_spec': all_update_spec,
                              '_all_mod_spec': all_mod_spec}
                Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

                if lp:
                    l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.fw_id = m_fw.fw_id
                    if FWData().MULTIPROCESSING:
                        # hack because AutoProxy manager can't access attributes
                        t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                    else:
                        t.launchpad = self.launchpad

                if my_spec.get("_add_fworker"):
                    t.fworker = self.fworker

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    # If the exception is serializable, save its details
                    if pdb_on_exception:
                        pdb.post_mortem()
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as e:
                        if lp:
                            l_logger.log(logging.WARNING,
                                         "Exception couldn't be serialized: %s " % e)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except Exception:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                     '_task': m_task,
                                                     '_exception': {'_stacktrace': tb,
                                                                    '_details': exception_details}},
                                        exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        final_state = 'FIZZLED'
                        lp.complete_launch(launch_id, m_action, final_state)
                    else:
                        fpath = zpath("FW_offline.json")
                        with zopen(fpath) as f_in:
                            d = json.loads(f_in.read())
                            d['fwaction'] = m_action.to_dict()
                            d['state'] = 'FIZZLED'
                            d['completed_on'] = datetime.utcnow().isoformat()
                            with zopen(fpath, "wt") as f_out:
                                f_out.write(json.dumps(d, ensure_ascii=False))

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return
                # it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this
                # particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                final_state = 'COMPLETED'
                lp.complete_launch(launch_id, m_action, final_state)
            else:

                fpath = zpath("FW_offline.json")
                with zopen(fpath) as f_in:
                    d = json.loads(f_in.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'COMPLETED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    with zopen(fpath, "wt") as f_out:
                        f_out.write(json.dumps(d, ensure_ascii=False))

            return True

        except LockedWorkflowError as e:
            l_logger.log(logging.DEBUG, traceback.format_exc())
            l_logger.log(logging.WARNING,
                         "Firework {} reached final state {} but couldn't complete the update of "
                         "the database. Reason: {}\nRefresh the WF to recover the result "
                         "(lpad admin refresh -i {}).".format(
                             self.fw_id, final_state, e, self.fw_id))
            return True

        except Exception:
            # problems while processing the results. high probability of malformed data.
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                                            '_details': None}},
                                exit=True)

            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except Exception:
                traceback.print_exc()

            if lp:
                try:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                except LockedWorkflowError as e:
                    l_logger.log(logging.DEBUG, traceback.format_exc())
                    l_logger.log(logging.WARNING,
                                 "Firework {} fizzled but couldn't complete the update of the database."
                                 " Reason: {}\nRefresh the WF to recover the result "
                                 "(lpad admin refresh -i {}).".format(
                                     self.fw_id, final_state, e, self.fw_id))
                    return True
            else:
                fpath = zpath("FW_offline.json")
                with zopen(fpath) as f_in:
                    d = json.loads(f_in.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    with zopen(fpath, "wt") as f_out:
                        f_out.write(json.dumps(d, ensure_ascii=False))

            return True
コード例 #21
0
def rapidfire(launchpad,
              fworker=None,
              m_dir=None,
              nlaunches=0,
              max_loops=-1,
              sleep_time=None,
              strm_lvl='INFO',
              timeout=None,
              local_redirect=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates subdirectories
    for each Rocket. Usually stops when we run out of FireWorks from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """

    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    while num_loops != max_loops and (
            not timeout or
        (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir,
                                                l_logger,
                                                prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad,
                                               fworker,
                                               strm_lvl=strm_lvl)
            else:
                rocket_ran = launch_rocket(launchpad,
                                           fworker,
                                           strm_lvl=strm_lvl)

            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        fd = FWData()
        if fd.MULTIPROCESSING:
            # sleeping time is not firing
            fd.FiringState = False
        time.sleep(sleep_time)
        if fd.MULTIPROCESSING:
            fd.FiringState = True
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...'.format(sleep_time))
    os.chdir(curdir)
コード例 #22
0
ファイル: multi_launcher.py プロジェクト: xhqu1981/fireworks
def rapidfire_process(fworker,
                      nlaunches,
                      sleep,
                      loglvl,
                      port,
                      node_list,
                      sub_nproc,
                      timeout,
                      running_ids_dict,
                      local_redirect,
                      firing_state_dict,
                      macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manage
        password (str): security password to access the server
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        macro_sleep_time (int): secs to sleep between sub job resubmit
        local_redirect (bool): redirect standard input and output to local file
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher',
                             l_dir=l_dir,
                             stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad,
              fworker=fworker,
              m_dir=None,
              nlaunches=nlaunches,
              max_loops=-1,
              sleep_time=sleep,
              strm_lvl=loglvl,
              timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [
            pid for pid, is_firing in fw_data.FiringState.items() if is_firing
        ]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(
                l_logger,
                'Sleeping for {} secs before resubmit sub job'.format(
                    macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmit sub job'.format(macro_sleep_time))
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout,
                      local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')