def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    Args:
        port (int): Listening port number of the DataServer
        stop_event (threading.Event): stop event
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    fd = FWData()
    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in fd.Running_IDs.items():
            if lid:
                try:
                    os.kill(pid, 0)  # raises OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:  # means this process is dead!
                    fd.Running_IDs[pid] = None
                    fd.FiringState[pid] = False
        stop_event.wait(PING_TIME_SECS)
def get_qchem_cmd(cls, qcinp, mol):
    physical_nproc_map = {"edison": 24, "cori": 32, "matgen": 16,
                          "LaptopQu.local": 2}
    numa_num_map = {"edison": 2, "cori": 2, "matgen": 2, "LaptopQu.local": 1}
    hostname = cls.get_hostname()
    if hostname in physical_nproc_map:
        physical_nproc = physical_nproc_map[hostname]
    elif hostname == 'vesta':
        return ALCF_Utils.get_alcf_qchem_cmd(qcinp)
    else:
        return [["qchem"]] * 3
    natoms = len(mol)
    fw_data = FWData()
    if fw_data.MULTIPROCESSING and fw_data.SUB_NPROCS is not None:
        physical_nproc = int(fw_data.SUB_NPROCS)
    nproc = min(physical_nproc, natoms)
    numa_num = numa_num_map[hostname]
    nproc = max((nproc // numa_num) * numa_num, 1)  # round down to a NUMA multiple
    half_nproc = min(physical_nproc // 2, natoms)
    half_nproc = max((half_nproc // numa_num) * numa_num, 1)
    qc_exe = shlex.split("qchem -np {}".format(nproc))
    half_cpus_cmd = shlex.split("qchem -np {}".format(half_nproc))
    openmp_cmd = shlex.split("qchem -nt {}".format(nproc))
    return qc_exe, half_cpus_cmd, openmp_cmd
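# A minimal, standalone sketch of the NUMA-aware core-count rounding used in
# get_qchem_cmd above, with hypothetical numbers (20 atoms on a 24-core node
# with 2 NUMA domains). Only the arithmetic is illustrated here.
def round_nproc(physical_nproc, natoms, numa_num):
    """Cap nproc at the atom count, then round down to a NUMA-domain multiple."""
    nproc = min(physical_nproc, natoms)
    return max((nproc // numa_num) * numa_num, 1)

# round_nproc(24, 20, 2) -> 20, so the full command is "qchem -np 20";
# the half-CPU fallback uses round_nproc(24 // 2, 20, 2) -> 12, i.e. "qchem -np 12".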
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict, local_redirect,
                      firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping sub-job PID to running launch id
        local_redirect (bool): redirect standard input and output to local file
        firing_state_dict (dict): shared dict mapping sub-job PID to firing state
        macro_sleep_time (int): secs to sleep between sub job resubmits
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    fw_data.FiringState[os.getpid()] = True
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
              max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
              local_redirect=local_redirect)
    fw_data.FiringState[os.getpid()] = False
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        firing_pids = [pid for pid, is_firing in fw_data.FiringState.items()
                       if is_firing]
        if len(firing_pids) > 0:
            # Some other sub jobs are still running
            macro_sleep_time = macro_sleep_time if macro_sleep_time \
                else sleep_time * len(fw_data.FiringState)
            log_multi(l_logger, 'Sleeping for {} secs before resubmitting '
                                'sub job'.format(macro_sleep_time))
            time.sleep(macro_sleep_time)
            log_multi(l_logger, 'Resubmitting sub job')
            fw_data.FiringState[os.getpid()] = True
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl,
                      timeout=timeout, local_redirect=local_redirect)
            fw_data.FiringState[os.getpid()] = False
        else:
            break
    log_multi(l_logger, 'Sub job finished')
def stop_backgrounds(ping_stop, btask_stops):
    fd = FWData()
    if fd.MULTIPROCESSING:
        fd.Running_IDs[os.getpid()] = None
    elif ping_stop:
        ping_stop.set()
    for b in btask_stops:
        b.set()
def stop_backgrounds(ping_stop, btask_stops):
    fd = FWData()
    if fd.MULTIPROCESSING:
        m = fd.DATASERVER
        m.Running_IDs()[os.getpid()] = None
    else:
        ping_stop.set()
    for b in btask_stops:
        b.set()
def log_multi(m_logger, msg, log_lvl='info'):
    """
    :param m_logger: (logger) The logger object
    :param msg: (str) a String to log
    :param log_lvl: (str) The level to log at
    """
    _log_fnc = getattr(m_logger, log_lvl.lower())
    if FWData().MULTIPROCESSING:
        _log_fnc("{} : ({})".format(msg, multiprocessing.current_process().name))
    else:
        _log_fnc(msg)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the shared object manager
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    rapidfire(launchpad, fworker, None, nlaunches, -1, sleep, loglvl)
def log_multi(m_logger, msg, log_lvl="info"):
    """
    Args:
        m_logger (logger): The logger object
        msg (str): a String to log
        log_lvl (str): The level to log at
    """
    _log_fnc = getattr(m_logger, log_lvl.lower())
    if FWData().MULTIPROCESSING:
        _log_fnc(f"{msg} : ({multiprocessing.current_process().name})")
    else:
        _log_fnc(msg)
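# Hedged usage sketch for log_multi. The get_fw_logger import path follows
# upstream FireWorks and may differ in this fork; the messages are hypothetical.
# In job-packing mode the sub-process name is appended automatically.
from fireworks.utilities.fw_utilities import get_fw_logger

logger = get_fw_logger('rocket.launcher', stream_level='INFO')
log_multi(logger, 'Sub job started')                 # plain message in single-process mode
log_multi(logger, 'Recoverable hiccup', 'warning')   # dispatched to logger.warning(...)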
def start_ping_launch(launchpad: LaunchPad, launch_id: int) -> Union[Event, None]:
    fd = FWData()
    if fd.MULTIPROCESSING:
        if not launch_id:
            raise ValueError("Multiprocessing cannot be run in offline mode!")
        fd.Running_IDs[os.getpid()] = launch_id
        return None
    else:
        ping_stop = Event()
        ping_thread = Thread(target=ping_launch,
                             args=(launchpad, launch_id, ping_stop, current_thread()))
        ping_thread.start()
        return ping_stop
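# A minimal sketch of the single-process heartbeat pattern used above: a thread
# pings until the returned Event is set. The print stands in for
# launchpad.ping_launch(launch_id); only the Event/Thread wiring is illustrated.
import time
from threading import Event, Thread

def heartbeat(stop: Event, interval: float = 1.0) -> None:
    while not stop.is_set():
        print("ping")        # stand-in for launchpad.ping_launch(launch_id)
        stop.wait(interval)  # wakes early if stop is set

stop = Event()
Thread(target=heartbeat, args=(stop,), daemon=True).start()
time.sleep(3)
stop.set()                   # what stop_backgrounds() ultimately does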
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list,
                      sub_nproc, timeout, running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the shared object manager
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict (dict): shared dict mapping sub-job PID to running launch id
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
              max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        launch_ids = FWData().Running_IDs.values()
        live_ids = list(set(launch_ids) - {None})
        if len(live_ids) > 0:
            # Some other sub jobs are still running
            log_multi(l_logger, 'Sleeping for {} secs before resubmitting '
                                'sub job'.format(sleep_time))
            time.sleep(sleep_time)
            log_multi(l_logger, 'Resubmitting sub job')
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl,
                      timeout=timeout)
        else:
            break
    log_multi(l_logger, 'Sub job finished')
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs,
                        sleep_time, total_node_list=None, ppn=1, timeout=None,
                        exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE; split among the sub jobs
        ppn (int): processors per node
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node (bool): don't use the script-launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        if host in total_node_list:
            log_multi(l_logger, "Removing the current node \"{}\" from the compute "
                                "node list".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list; keeping "
                                "the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]
    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port,
                              node_lists, sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
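# Hedged usage sketch for launch_multiprocess. Import paths follow the upstream
# FireWorks layout and may differ in this fork; the node names are hypothetical.
from fireworks.core.fworker import FWorker
from fireworks.core.launchpad import LaunchPad
from fireworks.features.multi_launcher import launch_multiprocess

launchpad = LaunchPad.auto_load()   # reads my_launchpad.yaml if present
launch_multiprocess(launchpad, FWorker(), 'INFO', nlaunches=0, num_jobs=4,
                    sleep_time=None, total_node_list=['node1', 'node2'], ppn=24)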
def ping_multilaunch(port, stop_event):
    """
    A single manager to ping all launches during multiprocess launches

    Args:
        port (int): Listening port number of the DataServer
        stop_event (threading.Event): stop event
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    fd = FWData()
    lp = ds.LaunchPad()
    while not stop_event.is_set():
        for pid, lid in fd.Running_IDs.items():
            if lid:
                try:
                    os.kill(pid, 0)  # raises OSError if the process is dead
                    lp.ping_launch(lid)
                except OSError:  # means this process is dead!
                    fd.Running_IDs[pid] = None
        stop_event.wait(PING_TIME_SECS)
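# A small standalone sketch of the liveness probe used above: signal 0 performs
# the existence/permission checks without delivering a signal, so a dead PID
# raises OSError (POSIX behavior; not meaningful on Windows).
import os

def pid_alive(pid: int) -> bool:
    try:
        os.kill(pid, 0)
        return True
    except OSError:
        return False

print(pid_alive(os.getpid()))  # True for the current process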
def start_ping_launch(launchpad, launch_id):
    fd = FWData()
    if fd.MULTIPROCESSING:
        if not launch_id:
            raise ValueError("Multiprocessing cannot be run in offline mode!")
        m = fd.DATASERVER
        m.Running_IDs()[os.getpid()] = launch_id
        return None
    else:
        ping_stop = threading.Event()
        ping_thread = threading.Thread(target=ping_launch,
                                       args=(launchpad, launch_id, ping_stop,
                                             threading.current_thread()))
        ping_thread.start()
        return ping_stop
def _get_vasp_cmd_in_job_packing(fw_data, fw_env, mpi_cmd):
    tasks_per_node_flag = {"srun": "--ntasks-per-node",
                           "mpirun": "--npernode",
                           "aprun": "-N"}
    nodelist_flag = {"srun": "--nodelist", "mpirun": "--host", "aprun": "-L"}
    ranks_num_flag = {"srun": "--ntasks", "mpirun": "-n", "aprun": "-n"}
    nodes_spec = {"srun": "--nodes {}".format(len(fw_data.NODE_LIST)),
                  "mpirun": "",
                  "aprun": ""}
    verbose_flag = {"srun": "-v", "mpirun": "", "aprun": ""}
    mpirun = mpi_cmd.split()[0]
    # Don't honor SLURM_NTASKS in case of job packing, because SLURM_NTASKS
    # refers to the total number of processes of the parent job
    sub_nproc = fw_data.SUB_NPROCS
    vasp_cmds = [fw_env.get("vasp_cmd", "vasp"), fw_env.get("gvasp_cmd", "gvasp")]
    vasp_exes = [
        shlex.split(
            '{mpi_cmd} {verbose_flag} {nodes_spec} {ranks_flag} {nproc} {tpn_flag} {tpn} '
            '{nl_flag} {nl} {vasp_cmd}'.format(
                mpi_cmd=mpi_cmd,
                verbose_flag=verbose_flag[mpirun],
                nodes_spec=nodes_spec[mpirun],
                ranks_flag=ranks_num_flag[mpirun],
                nproc=sub_nproc,
                tpn_flag=tasks_per_node_flag[mpirun],
                tpn=int(fw_data.SUB_NPROCS) // len(fw_data.NODE_LIST),
                nl_flag=nodelist_flag[mpirun],
                nl=','.join(fw_data.NODE_LIST),
                vasp_cmd=vasp_cmd))
        for vasp_cmd in vasp_cmds
    ]
    v_exe, gv_exe = vasp_exes
    return v_exe, gv_exe
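# Worked example with hypothetical values: with mpi_cmd="srun", SUB_NPROCS=24
# and NODE_LIST=["nid00012", "nid00013"], the template above expands to the
# argument vector below (mpirun/aprun substitute their own flags).
import shlex

cmd = shlex.split('srun -v --nodes 2 --ntasks 24 --ntasks-per-node 12 '
                  '--nodelist nid00012,nid00013 vasp')
print(cmd[:4])  # ['srun', '-v', '--nodes', '2']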
def rapidfire(launchpad, fworker=None, m_dir=None, nlaunches=0, max_loops=-1,
              sleep_time=None, strm_lvl='INFO', timeout=None, local_redirect=False):
    """
    Keeps running Rockets in m_dir until we reach an error. Automatically creates
    subdirectories for each Rocket. Usually stops when we run out of FireWorks
    from the LaunchPad.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker object)
        m_dir (str): the directory in which to loop Rocket running
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop until max_loops
        max_loops (int): maximum number of loops (default -1 is infinite)
        sleep_time (int): secs to sleep between rapidfire loop iterations
        strm_lvl (str): level at which to output logs to stdout
        timeout (int): # of seconds after which to stop the rapidfire process
        local_redirect (bool): redirect standard input and output to local file
    """
    sleep_time = sleep_time if sleep_time else RAPIDFIRE_SLEEP_SECS
    curdir = m_dir if m_dir else os.getcwd()
    l_logger = get_fw_logger('rocket.launcher', l_dir=launchpad.get_logdir(),
                             stream_level=strm_lvl)
    nlaunches = -1 if nlaunches == 'infinite' else int(nlaunches)
    fworker = get_fworker(fworker)

    num_launched = 0
    start_time = datetime.now()
    num_loops = 0

    while num_loops != max_loops and \
            (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
        skip_check = False  # this is used to speed operation
        while (skip_check or launchpad.run_exists(fworker)) and \
                (not timeout or (datetime.now() - start_time).total_seconds() < timeout):
            os.chdir(curdir)
            launcher_dir = create_datestamp_dir(curdir, l_logger, prefix='launcher_')
            os.chdir(launcher_dir)
            if local_redirect:
                with redirect_local():
                    rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            else:
                rocket_ran = launch_rocket(launchpad, fworker, strm_lvl=strm_lvl)
            if rocket_ran:
                num_launched += 1
            elif not os.listdir(launcher_dir):
                # remove the empty shell of a directory
                os.chdir(curdir)
                os.rmdir(launcher_dir)
            if num_launched == nlaunches:
                break
            if launchpad.run_exists(fworker):
                skip_check = True  # don't wait, pull the next FW right away
            else:
                # add a small amount of buffer breathing time for DB to refresh
                # in case we have a dynamic WF
                time.sleep(0.15)
                skip_check = False
        if num_launched == nlaunches or nlaunches == 0:
            break
        log_multi(l_logger, 'Sleeping for {} secs'.format(sleep_time))
        fd = FWData()
        if fd.MULTIPROCESSING:
            # this sub job is not firing while it sleeps
            fd.FiringState[os.getpid()] = False
        time.sleep(sleep_time)
        if fd.MULTIPROCESSING:
            fd.FiringState[os.getpid()] = True
        num_loops += 1
        log_multi(l_logger, 'Checking for FWs to run...')
    os.chdir(curdir)
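# Hedged usage sketch for rapidfire in single-process mode. The import path
# follows upstream FireWorks and may differ here; a my_launchpad.yaml config
# is assumed for auto_load().
from fireworks.core.launchpad import LaunchPad

lp = LaunchPad.auto_load()
rapidfire(lp, nlaunches=0)  # drain all READY FireWorks once, then return
rapidfire(lp, nlaunches='infinite', max_loops=3, sleep_time=10)  # poll 3 times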
def set_nodelist_env(cls):
    fw_data = FWData()
    if fw_data.MULTIPROCESSING and fw_data.NODE_LIST is not None:
        nodelist = ",".join(fw_data.NODE_LIST)
        os.environ["QCNODE"] = nodelist
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}   # combined stored data for *all* the Tasks
    all_update_spec = {}   # combined update_spec for *all* the Tasks
    all_mod_spec = []      # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        if m_fw.spec.get('_recover_launch', None):
            launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
            starting_task = launch_to_recover.action.stored_data.get(
                '_exception', {}).get('_failed_task_n', 0)
            recover_launch_dir = launch_to_recover.launch_dir
            if lp:
                lp.log_message(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(
                        starting_task, recover_launch_dir))
            if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and \
                    launch_dir != recover_launch_dir:
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(
                            recover_launch_dir, launch_dir))
                distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)
        else:
            starting_task = 0

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            lp.log_message(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.launchpad = self.launchpad
                t.fw_id = m_fw.fw_id

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        lp.log_message(logging.WARNING,
                                       "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details,
                                                                '_failed_task_n': t_counter}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                else:
                    with open('FW_offline.json', 'r+') as f:
                        d = json.loads(f.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        f.seek(0)
                        f.write(json.dumps(d))
                        f.truncate()

                return True

            # read in a FWAction from a file, in case the task is not Python
            # and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update
            # from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # run last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)
        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:
            traceback.print_exc()

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}   # combined stored data for *all* the Tasks
    all_update_spec = {}   # combined update_spec for *all* the Tasks
    all_mod_spec = []      # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = FireWork.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    if '_launch_dir' in m_fw.spec:
        prev_dir = launch_dir
        os.chdir(m_fw.spec['_launch_dir'])
        launch_dir = os.path.abspath(os.getcwd())

        if lp:
            lp._change_launch_dir(launch_id, launch_dir)

        if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
            try:
                os.rmdir(prev_dir)
            except Exception:
                pass

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for my_task in m_fw.tasks:
            m_action = my_task.run_task(my_spec)

            # read in a FWAction from a file, in case the task is not Python
            # and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update
            # from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # run last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except Exception:
        stop_backgrounds(ping_stop, btask_stops)
        traceback.print_exc()
        try:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': my_task.to_dict(),
                                             '_exception': traceback.format_exc()},
                                exit=True)
        except Exception:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': None,
                                             '_exception': traceback.format_exc()},
                                exit=True)

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run_task(self, fw_spec):
    # write a file containing the formula and task_type for somewhat
    # easier file system browsing
    self._write_formula_file(fw_spec)

    fw_env = fw_spec.get("_fw_env", {})
    if "mpi_cmd" in fw_env:
        mpi_cmd = fw_spec["_fw_env"]["mpi_cmd"]
    elif which("mpirun"):
        mpi_cmd = "mpirun"
    elif which("aprun"):
        mpi_cmd = "aprun"
    else:
        raise ValueError("No MPI command found!")

    # TODO: last two env vars, i.e. SGE and LoadLeveler, are untested
    env_vars = ['PBS_NP', 'SLURM_NTASKS', 'NSLOTS', 'LOADL_TOTAL_TASKS']
    nproc = None
    for env_var in env_vars:
        nproc = os.environ.get(env_var, None)
        if nproc is not None:
            break
    if nproc is None:
        raise ValueError("None of the env vars {} found to set nproc!".format(env_vars))

    fw_data = FWData()
    if (not fw_data.MULTIPROCESSING) or (fw_data.NODE_LIST is None):
        if "srun" in mpi_cmd:
            mpi_cmd += " -v"
        v_exe = shlex.split('{} -n {} {}'.format(mpi_cmd, nproc,
                                                 fw_env.get("vasp_cmd", "vasp")))
        gv_exe = shlex.split('{} -n {} {}'.format(mpi_cmd, nproc,
                                                  fw_env.get("gvasp_cmd", "gvasp")))
    else:
        v_exe, gv_exe = self._get_vasp_cmd_in_job_packing(fw_data, fw_env, mpi_cmd)

    print('host:', os.environ['HOSTNAME'])

    stderr_file = "std_err.txt"
    for job in self.jobs:
        job.vasp_cmd = v_exe
        job.gamma_vasp_cmd = gv_exe
        job.stderr_file = stderr_file

    if v_exe[0] == "srun":
        scancel_terminator = ScancelJobStepTerminator(stderr_file)
        terminate_func = scancel_terminator.cancel_job_step
    else:
        terminate_func = None

    incar_errors = check_incar(fw_spec['task_type'])
    if incar_errors:
        raise ValueError("Critical error: INCAR does not pass checks: {}".format(incar_errors))

    logging.basicConfig(level=logging.DEBUG)
    c = Custodian(self.handlers, self.jobs, max_errors=self.max_errors,
                  gzipped_output=False,  # manual gzip below
                  validators=[VasprunXMLValidator()],
                  terminate_func=terminate_func)
    custodian_out = c.run()

    if self.gzip_output:
        for f in os.listdir(os.getcwd()):
            if not f.lower().endswith("gz") and not f.endswith(".OU") \
                    and not f.endswith(".ER"):
                with open(f, 'rb') as f_in, \
                        GzipFile('{}.gz'.format(f), 'wb') as f_out:
                    f_out.writelines(f_in)
                os.remove(f)

    all_errors = set()
    for run in custodian_out:
        for correction in run['corrections']:
            all_errors.update(correction['errors'])

    stored_data = {'error_list': list(all_errors)}
    update_spec = {'prev_vasp_dir': os.getcwd(),
                   'prev_task_type': fw_spec['task_type'],
                   'mpsnl': fw_spec['mpsnl'],
                   'snlgroup_id': fw_spec['snlgroup_id'],
                   'run_tags': fw_spec['run_tags'],
                   'parameters': fw_spec.get('parameters')}
    return FWAction(stored_data=stored_data, update_spec=update_spec)
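# A standalone sketch of the scheduler-specific nproc lookup used above; which
# environment variable is actually present depends on the resource manager
# (PBS, SLURM, SGE, LoadLeveler).
import os

def detect_nproc(env_vars=('PBS_NP', 'SLURM_NTASKS', 'NSLOTS', 'LOADL_TOTAL_TASKS')):
    for var in env_vars:
        value = os.environ.get(var)
        if value is not None:
            return int(value)
    raise ValueError("None of {} are set; cannot size the MPI run".format(env_vars))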
def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it)

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on a caught
            exception. Default False.
    """
    all_stored_data = {}   # combined stored data for *all* the Tasks
    all_update_spec = {}   # combined update_spec for *all* the Tasks
    all_mod_spec = []      # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
        d['started_on'] = datetime.utcnow().isoformat()
        with zopen(fpath, "wt") as f_out:
            f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(
                        starting_task, recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(
                            recovery_dir, launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)
        else:
            starting_task = 0

        files_in = m_fw.spec.get("_files_in", {})
        prev_files = m_fw.spec.get("_files_prev", {})
        for f in set(files_in.keys()).intersection(prev_files.keys()):
            # We use zopen for the file objects for transparent handling
            # of zipped files. shutil.copyfileobj does the actual copy
            # in chunks that avoid memory issues.
            with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                if pdb_on_exception:
                    pdb.post_mortem()

                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    with zopen(fpath, "wt") as f_out:
                        f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python
            # and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update
            # from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

        # run last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
            d['fwaction'] = m_action.to_dict()
            d['state'] = 'COMPLETED'
            d['completed_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except Exception:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the "
                             "database. Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(self.fw_id, e, self.fw_id))
            return True
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
            d['fwaction'] = m_action.to_dict()
            d['state'] = 'FIZZLED'
            d['completed_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))
            return True
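# A minimal sketch of the offline-mode bookkeeping used in run() above:
# FW_offline.json is read, patched, and rewritten via the zopen/zpath helpers,
# assumed to come from monty (as in upstream FireWorks), so plain and gzipped
# files are handled transparently.
import json
from datetime import datetime
from monty.io import zopen
from monty.os.path import zpath

fpath = zpath("FW_offline.json")  # resolves FW_offline.json(.gz) if present
with zopen(fpath) as f_in:
    d = json.loads(f_in.read())
d['state'] = 'COMPLETED'
d['completed_on'] = datetime.utcnow().isoformat()
with zopen(fpath, "wt") as f_out:
    f_out.write(json.dumps(d, ensure_ascii=False))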