def apply_action(self, action, fw_id): """ Apply a FWAction on a Firework in the Workflow. Args: action (FWAction): action to apply fw_id (int): id of Firework on which to apply the action Returns: [int]: list of Firework ids that were updated or new """ updated_ids = [] # update the spec of the children FireWorks if action.update_spec: for cfid in self.links[fw_id]: self.id_fw[cfid].spec.update(action.update_spec) updated_ids.append(cfid) # update the spec of the children FireWorks using DictMod language if action.mod_spec: for cfid in self.links[fw_id]: for mod in action.mod_spec: apply_mod(mod, self.id_fw[cfid].spec) updated_ids.append(cfid) # defuse children if action.defuse_children: for cfid in self.links[fw_id]: self.id_fw[cfid].state = 'DEFUSED' self.fw_states[cfid] = 'DEFUSED' updated_ids.append(cfid) # defuse workflow if action.defuse_workflow: for fw_id in self.links.nodes: if self.id_fw[fw_id].state not in ['FIZZLED', 'COMPLETED']: self.id_fw[fw_id].state = 'DEFUSED' self.fw_states[fw_id] = 'DEFUSED' updated_ids.append(fw_id) # add detour FireWorks. This should be done *before* additions if action.detours: for wf in action.detours: new_updates = self.append_wf(wf, [fw_id], detour=True, pull_spec_mods=False) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically detouring workflows!") updated_ids.extend(new_updates) # add additional FireWorks if action.additions: for wf in action.additions: new_updates = self.append_wf(wf, [fw_id], detour=False, pull_spec_mods=False) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically adding workflows!") updated_ids.extend(new_updates) return list(set(updated_ids))
def apply_action(self, action, fw_id):
    updated_ids = []

    if action.command in ['CONTINUE', 'BREAK']:
        # Do nothing
        pass

    if action.command == 'DEFUSE':
        # mark all children as defused
        for cfid in self.links[fw_id]:
            self.id_fw[cfid].state = 'DEFUSED'
            updated_ids.append(cfid)

    if action.command in ['MODIFY', 'CREATE']:
        for cfid in self.links[fw_id]:
            for mod in action.mod_spec.get('dict_mods', []):
                apply_mod(mod, self.id_fw[cfid].spec)
            updated_ids.append(cfid)

    if action.command == 'CREATE':
        create_fw = action.mod_spec['create_fw']
        self.links[fw_id].append(create_fw.fw_id)
        self.links[create_fw.fw_id] = []  # TODO: allow this to be children of original FW
        self.id_fw[create_fw.fw_id] = create_fw
        updated_ids.append(create_fw.fw_id)

    # TODO: implement the remaining actions
    return updated_ids
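Both the field-based and the legacy command-based variants delegate the actual spec edits to apply_mod and its DictMod language. A quick sketch of that language follows; the dictionary contents are illustrative, and the import path reflects current FireWorks and may differ in older releases.

from fireworks.utilities.dict_mods import apply_mod

spec = {'params': {'count': 1}, 'tags': ['a']}
apply_mod({'_set': {'params->count': 5}}, spec)  # set a nested key ('->' descends one level)
apply_mod({'_push': {'tags': 'b'}}, spec)        # append to a list
assert spec == {'params': {'count': 5}, 'tags': ['a', 'b']}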
def add_wf_to_fws(self, new_wf, fw_ids, pull_spec_mods=True, detour=False):
    """
    Internal method to add a workflow as a child to a Firework
    Note: detours must have children that have STATE_RANK that is WAITING or below

    :param new_wf: (Workflow) New Workflow to add
    :param fw_ids: ([int]) ids of the parent Fireworks on which to add the Workflow
    :param pull_spec_mods: (bool) pull spec mods of COMPLETED parents
    :param detour: (bool) add children of the current Firework to the Workflow's leaves
    :return: ([int]) list of Firework ids that were updated or new
    """
    updated_ids = []

    root_ids = new_wf.root_fw_ids
    leaf_ids = new_wf.leaf_fw_ids

    for new_fw in new_wf.fws:
        if new_fw.fw_id >= 0:  # note - this is also used later in the 'detour' code
            raise ValueError(
                'FireWorks to add must use a negative fw_id! Got fw_id: {}'.format(new_fw.fw_id))

        self.id_fw[new_fw.fw_id] = new_fw  # add new_fw to id_fw

        for fw_id in fw_ids:
            if new_fw.fw_id in leaf_ids:
                if detour:
                    # make sure all of these links are WAITING, else the DETOUR is not well defined
                    ready_run = [(f >= 0 and Firework.STATE_RANKS[self.fw_states[f]] > 1)
                                 for f in self.links[fw_id]]
                    if any(ready_run):
                        raise ValueError("Detour option only works if all children of detours "
                                         "are not READY to run and have not already run")
                    # add children of current FW to new FW
                    self.links[new_fw.fw_id] = [f for f in self.links[fw_id] if f >= 0]
                else:
                    self.links[new_fw.fw_id] = []
            else:
                self.links[new_fw.fw_id] = new_wf.links[new_fw.fw_id]

        updated_ids.append(new_fw.fw_id)

    for fw_id in fw_ids:
        for root_id in root_ids:
            self.links[fw_id].append(root_id)  # add the root id as my child

            if pull_spec_mods:
                # re-apply some actions of the parent
                m_fw = self.id_fw[fw_id]  # get the parent FW
                m_launch = self._get_representative_launch(m_fw)  # get Launch of parent
                if m_launch:
                    # pull spec update
                    if m_launch.state == 'COMPLETED' and m_launch.action.update_spec:
                        new_wf.id_fw[root_id].spec.update(m_launch.action.update_spec)
                    # pull spec mods
                    if m_launch.state == 'COMPLETED' and m_launch.action.mod_spec:
                        for mod in m_launch.action.mod_spec:
                            apply_mod(mod, new_wf.id_fw[root_id].spec)

    for new_fw in new_wf.fws:
        updated_ids = self.refresh(new_fw.fw_id, set(updated_ids))

    return updated_ids
def apply_action(self, action, fw_id): """ Apply a FWAction on a Firework in the Workflow :param action: (FWAction) action to apply :param fw_id: (int) id of Firework on which to apply the action :return: ([int]) list of Firework ids that were updated or new """ updated_ids = [] # update the spec of the children FireWorks if action.update_spec: for cfid in self.links[fw_id]: self.id_fw[cfid].spec.update(action.update_spec) updated_ids.append(cfid) # update the spec of the children FireWorks using DictMod language if action.mod_spec: for cfid in self.links[fw_id]: for mod in action.mod_spec: apply_mod(mod, self.id_fw[cfid].spec) updated_ids.append(cfid) # defuse children if action.defuse_children: for cfid in self.links[fw_id]: self.id_fw[cfid].state = 'DEFUSED' updated_ids.append(cfid) # add detour FireWorks # this should be done *before* additions if action.detours: for wf in action.detours: new_updates = self.add_wf_to_fws(wf, [fw_id], pull_spec_mods=False, detour=True) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically detouring workflows!" ) updated_ids.extend(new_updates) # add additional FireWorks if action.additions: for wf in action.additions: new_updates = self.add_wf_to_fws(wf, [fw_id], pull_spec_mods=False, detour=False) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically adding workflows!" ) updated_ids.extend(new_updates) return list(set(updated_ids))
def apply_action(self, action, fw_id): """ Apply a FWAction on a Firework in the Workflow :param action: (FWAction) action to apply :param fw_id: (int) id of Firework on which to apply the action :return: ([int]) list of Firework ids that were updated or new """ updated_ids = [] # update the spec of the children FireWorks if action.update_spec: for cfid in self.links[fw_id]: self.id_fw[cfid].spec.update(action.update_spec) updated_ids.append(cfid) # update the spec of the children FireWorks using DictMod language if action.mod_spec: for cfid in self.links[fw_id]: for mod in action.mod_spec: apply_mod(mod, self.id_fw[cfid].spec) updated_ids.append(cfid) # defuse children if action.defuse_children: for cfid in self.links[fw_id]: self.id_fw[cfid].state = 'DEFUSED' updated_ids.append(cfid) # add detour FireWorks # this should be done *before* additions if action.detours: for wf in action.detours: new_updates = self.add_wf_to_fws(wf, [fw_id], pull_spec_mods=False, detour=True) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically detouring workflows!") updated_ids.extend(new_updates) # add additional FireWorks if action.additions: for wf in action.additions: new_updates = self.add_wf_to_fws(wf, [fw_id], pull_spec_mods=False, detour=False) if len(set(updated_ids).intersection(new_updates)) > 0: raise ValueError( "Cannot use duplicated fw_ids when dynamically adding workflows!") updated_ids.extend(new_updates) return list(set(updated_ids))
def run_task(self, fw_spec):
    incar_name = self.get("input_filename", "INCAR")
    incar = Incar.from_file(incar_name)

    incar_update = env_chk(self.get('incar_update'), fw_spec)
    incar_multiply = env_chk(self.get('incar_multiply'), fw_spec)
    incar_dictmod = env_chk(self.get('incar_dictmod'), fw_spec)

    if incar_update:
        incar.update(incar_update)

    if incar_multiply:
        for k in incar_multiply:
            incar[k] = incar[k] * incar_multiply[k]

    if incar_dictmod:
        apply_mod(incar_dictmod, incar)

    incar.write_file(self.get("output_filename", "INCAR"))
def run_task(self, fw_spec):
    # load INCAR
    incar_name = self.get("input_filename", "INCAR")
    incar = Incar.from_file(incar_name)

    # process FireWork env values via env_chk
    incar_update = env_chk(self.get('incar_update'), fw_spec)
    incar_multiply = env_chk(self.get('incar_multiply'), fw_spec)
    incar_dictmod = env_chk(self.get('incar_dictmod'), fw_spec)

    if incar_update:
        incar.update(incar_update)

    if incar_multiply:
        for k in incar_multiply:
            incar[k] = incar[k] * incar_multiply[k]

    if incar_dictmod:
        apply_mod(incar_dictmod, incar)

    # write INCAR
    incar.write_file(self.get("output_filename", "INCAR"))
def run_task(self, fw_spec):
    incar_name = self.get("input_filename", "INCAR")
    incar = Incar.from_file(incar_name)

    incar_update = env_chk(self.get('incar_update'), fw_spec)
    incar_multiply = env_chk(self.get('incar_multiply'), fw_spec)
    incar_dictmod = env_chk(self.get('incar_dictmod'), fw_spec)

    if incar_update:
        incar.update(incar_update)

    if incar_multiply:
        for k in incar_multiply:
            if hasattr(incar[k], '__iter__'):  # is list-like
                incar[k] = list(np.multiply(incar[k], incar_multiply[k]))
            else:
                incar[k] = incar[k] * incar_multiply[k]

    if incar_dictmod:
        apply_mod(incar_dictmod, incar)

    incar.write_file(self.get("output_filename", "INCAR"))
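A hedged sketch of how this task's three parameters might be supplied. The import path assumes the atomate distribution of ModifyIncar; the INCAR keys and values are illustrative. env_chk resolves '>>key<<' placeholders against fw_spec['_fw_env'], so workers can override these values per machine.

from atomate.vasp.firetasks.write_inputs import ModifyIncar  # path is an assumption

# Direct values: set NSW, scale EDIFF by 0.1, set MAGMOM via DictMod.
task = ModifyIncar(incar_update={'NSW': 0},
                   incar_multiply={'EDIFF': 0.1},
                   incar_dictmod={'_set': {'MAGMOM': [0.6, 0.6]}})

# Indirect value: resolved by env_chk from fw_spec['_fw_env']['incar_update'] at run time.
task_env = ModifyIncar(incar_update='>>incar_update<<')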
def append_wf(self, new_wf, fw_ids, detour=False, pull_spec_mods=False):
    """
    Method to add a workflow as a child to a Firework
    Note: detours must have children that have STATE_RANK that is WAITING or below

    Args:
        new_wf (Workflow): New Workflow to add.
        fw_ids ([int]): ids of the parent Fireworks on which to add the Workflow.
        detour (bool): add children of the current Firework to the Workflow's leaves.
        pull_spec_mods (bool): pull spec mods of COMPLETED parents, refreshes the WF states.

    Returns:
        [int]: list of Firework ids that were updated or new
    """
    updated_ids = []

    root_ids = new_wf.root_fw_ids
    leaf_ids = new_wf.leaf_fw_ids

    # make sure detour runs do not link to ready/running/completed/etc. runs
    if detour:
        for fw_id in fw_ids:
            if fw_id in self.links:
                # make sure all of these links are WAITING, else the DETOUR is not well defined
                ready_run = [(f >= 0 and Firework.STATE_RANKS[self.fw_states[f]] > 1)
                             for f in self.links[fw_id]]
                if any(ready_run):
                    raise ValueError("fw_id: {}: Detour option only works if all children "
                                     "of detours are not READY to run and have not "
                                     "already run".format(fw_id))

    # make sure all new child fws have negative fw_id
    for new_fw in new_wf.fws:
        if new_fw.fw_id >= 0:  # note: this is also used later in the 'detour' code
            raise ValueError(
                'FireWorks to add must use a negative fw_id! Got fw_id: {}'.format(new_fw.fw_id))

    # completed checks - go ahead and append
    for new_fw in new_wf.fws:
        self.id_fw[new_fw.fw_id] = new_fw  # add new_fw to id_fw

        if new_fw.fw_id in leaf_ids:
            if detour:
                for fw_id in fw_ids:
                    # add children of current FW to new FW
                    self.links[new_fw.fw_id] = [f for f in self.links[fw_id] if f >= 0]
            else:
                self.links[new_fw.fw_id] = []
        else:
            self.links[new_fw.fw_id] = new_wf.links[new_fw.fw_id]

        updated_ids.append(new_fw.fw_id)

    for fw_id in fw_ids:
        for root_id in root_ids:
            self.links[fw_id].append(root_id)  # add the root id as my child

            if pull_spec_mods:
                # re-apply some actions of the parent
                m_fw = self.id_fw[fw_id]  # get the parent FW
                m_launch = self._get_representative_launch(m_fw)  # get Launch of parent
                if m_launch:
                    # pull spec update
                    if m_launch.state == 'COMPLETED' and m_launch.action.update_spec:
                        new_wf.id_fw[root_id].spec.update(m_launch.action.update_spec)
                    # pull spec mods
                    if m_launch.state == 'COMPLETED' and m_launch.action.mod_spec:
                        for mod in m_launch.action.mod_spec:
                            apply_mod(mod, new_wf.id_fw[root_id].spec)

    for new_fw in new_wf.fws:
        updated_ids = self.refresh(new_fw.fw_id, set(updated_ids))

    return updated_ids
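A minimal usage sketch: in practice this method is usually reached through LaunchPad.append_wf, which loads the Workflow, calls the method above, and persists the result. The parent fw_id (123) and the task are hypothetical.

from fireworks import Firework, LaunchPad, ScriptTask, Workflow

lpad = LaunchPad.auto_load()

# New FWs must use negative fw_ids; real ids are assigned on insertion.
fix_fw = Firework([ScriptTask.from_str('echo "fixup step"')], fw_id=-1)
lpad.append_wf(Workflow([fix_fw]), fw_ids=[123], detour=True)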
def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it)

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on
            a caught exception. Default False.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
            d['started_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except Exception:
                    pass

        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)
        else:
            starting_task = 0
            files_in = m_fw.spec.get("_files_in", {})
            prev_files = m_fw.spec.get("_files_prev", {})
            for f in set(files_in.keys()).intersection(prev_files.keys()):
                # We use zopen for the file objects for transparent handling
                # of zipped files. shutil.copyfileobj does the actual copy
                # in chunks that avoid memory issues.
                with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                    shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                if pdb_on_exception:
                    pdb.post_mortem()
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except Exception:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        d['completed_on'] = datetime.utcnow().isoformat()
                        with zopen(fpath, "wt") as f_out:
                            f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return
            # it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this
            # particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except Exception:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except Exception:
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the database."
                             " Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(
                                 self.fw_id, e, self.fw_id))
            return True
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))
            return True
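Because run() falls back to reading FWAction.json (or FWAction.yaml) when a task cannot return an FWAction object directly, a non-Python task can steer the workflow by writing that file into the launch directory. A sketch of producing one follows; the keys mirror FWAction.to_dict(), though depending on the FireWorks version some may be optional, and the payload values are illustrative.

import json

fwaction = {
    'stored_data': {'parsed_energy': -1.23},  # illustrative payload
    'exit': False,
    'update_spec': {'converged': True},
    'mod_spec': [],
    'additions': [],
    'detours': [],
    'defuse_children': False,
    'defuse_workflow': False,
}
with open('FWAction.json', 'w') as f:
    json.dump(fwaction, f)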
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    if '_launch_dir' in m_fw.spec:
        prev_dir = launch_dir
        os.chdir(m_fw.spec['_launch_dir'])
        launch_dir = os.path.abspath(os.getcwd())

        if lp:
            lp.change_launch_dir(launch_id, launch_dir)

        if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
            try:
                os.rmdir(prev_dir)
            except:
                pass

    if lp:
        message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
        lp.log_message(logging.INFO, message)

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t in m_fw.tasks:
            lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)
            m_action = t.run_task(my_spec)

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except:
        stop_backgrounds(ping_stop, btask_stops)
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        traceback.print_exc()
        try:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': t.to_dict(),
                                             '_exception': traceback.format_exc()},
                                exit=True)
        except:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': None,
                                             '_exception': traceback.format_exc()},
                                exit=True)

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = FireWork.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    if '_launch_dir' in m_fw.spec:
        prev_dir = launch_dir
        os.chdir(m_fw.spec['_launch_dir'])
        launch_dir = os.path.abspath(os.getcwd())

        if lp:
            lp._change_launch_dir(launch_id, launch_dir)

        if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
            try:
                os.rmdir(prev_dir)
            except:
                pass

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for my_task in m_fw.tasks:
            m_action = my_task.run_task(my_spec)

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except:
        stop_backgrounds(ping_stop, btask_stops)
        traceback.print_exc()
        try:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': my_task.to_dict(),
                                             '_exception': traceback.format_exc()},
                                exit=True)
        except:
            m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                             '_task': None,
                                             '_exception': traceback.format_exc()},
                                exit=True)

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self, pdb_on_exception=False):
    """
    Run the rocket (check out a job from the database and execute it)

    Args:
        pdb_on_exception (bool): whether to invoke the debugger on
            a caught exception. Default False.
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())
    logdir = lp.get_logdir() if lp else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                             stream_level=ROCKET_STREAM_LOGLEVEL)

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        fpath = zpath("FW_offline.json")
        with zopen(fpath) as f_in:
            d = json.loads(f_in.read())
            d['started_on'] = datetime.utcnow().isoformat()
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    final_state = None
    ping_stop = None
    btask_stops = []

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except:
                    pass

        recovery = m_fw.spec.get('_recovery', None)
        if recovery:
            recovery_dir = recovery.get('_prev_dir')
            recovery_mode = recovery.get('_mode')
            starting_task = recovery.get('_task_n')
            all_stored_data.update(recovery.get('_all_stored_data'))
            all_update_spec.update(recovery.get('_all_update_spec'))
            all_mod_spec.extend(recovery.get('_all_mod_spec'))
            if lp:
                l_logger.log(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recovery_dir))
            if recovery_mode == 'cp' and launch_dir != recovery_dir:
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                    launch_dir))
                distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)
        else:
            starting_task = 0
            files_in = m_fw.spec.get("_files_in", {})
            prev_files = m_fw.spec.get("_files_prev", {})
            for f in set(files_in.keys()).intersection(prev_files.keys()):
                # We use zopen for the file objects for transparent handling
                # of zipped files. shutil.copyfileobj does the actual copy
                # in chunks that avoid memory issues.
                with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                    shutil.copyfileobj(fin, fout)

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            l_logger.log(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the Firetasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            checkpoint = {'_task_n': t_counter,
                          '_all_stored_data': all_stored_data,
                          '_all_update_spec': all_update_spec,
                          '_all_mod_spec': all_mod_spec}
            Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

            if lp:
                l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.fw_id = m_fw.fw_id
                if FWData().MULTIPROCESSING:
                    # hack because AutoProxy manager can't access attributes
                    t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                else:
                    t.launchpad = self.launchpad

            if my_spec.get("_add_fworker"):
                t.fworker = self.fworker

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                if pdb_on_exception:
                    pdb.post_mortem()
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        l_logger.log(logging.WARNING,
                                     "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    final_state = 'FIZZLED'
                    lp.complete_launch(launch_id, m_action, final_state)
                else:
                    fpath = zpath("FW_offline.json")
                    with zopen(fpath) as f_in:
                        d = json.loads(f_in.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        d['completed_on'] = datetime.utcnow().isoformat()
                        with zopen(fpath, "wt") as f_out:
                            f_out.write(json.dumps(d, ensure_ascii=False))

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return
            # it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this
            # particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            final_state = 'COMPLETED'
            lp.complete_launch(launch_id, m_action, final_state)
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))

        return True

    except LockedWorkflowError as e:
        l_logger.log(logging.DEBUG, traceback.format_exc())
        l_logger.log(logging.WARNING,
                     "Firework {} reached final state {} but couldn't complete the update of "
                     "the database. Reason: {}\nRefresh the WF to recover the result "
                     "(lpad admin refresh -i {}).".format(
                         self.fw_id, final_state, e, self.fw_id))
        return True

    except:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except:
            traceback.print_exc()

        if lp:
            try:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            except LockedWorkflowError as e:
                l_logger.log(logging.DEBUG, traceback.format_exc())
                l_logger.log(logging.WARNING,
                             "Firework {} fizzled but couldn't complete the update of the database."
                             " Reason: {}\nRefresh the WF to recover the result "
                             "(lpad admin refresh -i {}).".format(
                                 self.fw_id, e, self.fw_id))
            return True
        else:
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                d['completed_on'] = datetime.utcnow().isoformat()
                with zopen(fpath, "wt") as f_out:
                    f_out.write(json.dumps(d, ensure_ascii=False))
            return True
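The '_background_tasks' handling in these run() variants (start at launch, stop at the end, optionally run once more when run_on_finish is set) is driven by BackgroundTask objects in the spec. A sketch follows, assuming the fireworks.features.background_task module path; the commands and intervals are illustrative.

from fireworks import Firework, ScriptTask
from fireworks.features.background_task import BackgroundTask  # module path is an assumption

# Echo every 30 s while the FW runs, and once more at the end
# (run_on_finish=True feeds the "last background monitors" block above).
monitor = BackgroundTask(ScriptTask.from_str('echo "still alive"'),
                         sleep_time=30, run_on_finish=True)
fw = Firework([ScriptTask.from_str('sleep 90')],
              spec={'_background_tasks': [monitor]})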
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    if lp:
        message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
        lp.log_message(logging.INFO, message)

    # write FW.json and/or FW.yaml to the directory
    if PRINT_FW_JSON:
        m_fw.to_file('FW.json', indent=4)
    if PRINT_FW_YAML:
        m_fw.to_file('FW.yaml')

    try:
        if '_launch_dir' in m_fw.spec:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)
            launch_dir = os.path.abspath(os.getcwd())

            if lp:
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except:
                    pass

        if m_fw.spec.get('_recover_launch', None):
            launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
            starting_task = launch_to_recover.action.stored_data.get(
                '_exception', {}).get('_failed_task_n', 0)
            recover_launch_dir = launch_to_recover.launch_dir
            if lp:
                lp.log_message(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recover_launch_dir))
            if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(
                            recover_launch_dir, launch_dir))
                distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)
        else:
            starting_task = 0

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        lp.log_message(logging.WARNING,
                                       "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details,
                                                                '_failed_task_n': t_counter}},
                                    exit=True)

                if lp:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                else:
                    with open('FW_offline.json', 'r+') as f:
                        d = json.loads(f.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        f.seek(0)
                        f.write(json.dumps(d))
                        f.truncate()

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
def run(self):
    """
    Run the rocket (check out a job from the database and execute it)
    """
    all_stored_data = {}  # combined stored data for *all* the Tasks
    all_update_spec = {}  # combined update_spec for *all* the Tasks
    all_mod_spec = []  # combined mod_spec for *all* the Tasks

    lp = self.launchpad
    launch_dir = os.path.abspath(os.getcwd())

    # check a FW job out of the launchpad
    if lp:
        m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
    else:  # offline mode
        m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

        # set the run start time
        with open('FW_offline.json', 'r+') as f:
            d = json.loads(f.read())
            d['started_on'] = datetime.utcnow().isoformat()
            f.seek(0)
            f.write(json.dumps(d))
            f.truncate()

        launch_id = None  # we don't need this in offline mode...

    if not m_fw:
        print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
        return False

    try:
        if '_launch_dir' in m_fw.spec and lp:
            prev_dir = launch_dir
            launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
            if not os.path.isabs(launch_dir):
                launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))

            # thread-safe "mkdir -p"
            try:
                os.makedirs(launch_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

            os.chdir(launch_dir)

            if not os.path.samefile(launch_dir, prev_dir):
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                try:
                    os.rmdir(prev_dir)
                except:
                    pass

        if m_fw.spec.get('_recover_launch', None):
            launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
            starting_task = launch_to_recover.action.stored_data.get(
                '_exception', {}).get('_failed_task_n', 0)
            recover_launch_dir = launch_to_recover.launch_dir
            if lp:
                lp.log_message(
                    logging.INFO,
                    'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                          recover_launch_dir))
            if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Copying data from recovery folder {} to folder {}.'.format(
                            recover_launch_dir, launch_dir))
                distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)
        else:
            starting_task = 0

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.format(m_fw.fw_id, os.getcwd())
            lp.log_message(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
        my_spec["_fw_env"] = self.fworker.env

        # set up heartbeat (pinging the server that we're still alive)
        ping_stop = start_ping_launch(lp, launch_id)

        # start background tasks
        btask_stops = []
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                btask_stops.append(start_background_task(bt, m_fw.spec))

        # execute the FireTasks!
        for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
            if lp:
                lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

            if my_spec.get("_add_launchpad_and_fw_id"):
                t.launchpad = self.launchpad
                t.fw_id = m_fw.fw_id

            try:
                m_action = t.run_task(my_spec)
            except BaseException as e:
                traceback.print_exc()
                tb = traceback.format_exc()
                stop_backgrounds(ping_stop, btask_stops)
                do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                # If the exception is serializable, save its details
                try:
                    exception_details = e.to_dict()
                except AttributeError:
                    exception_details = None
                except BaseException as e:
                    if lp:
                        lp.log_message(logging.WARNING,
                                       "Exception couldn't be serialized: %s " % e)
                    exception_details = None

                try:
                    m_task = t.to_dict()
                except:
                    m_task = None

                m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                 '_task': m_task,
                                                 '_exception': {'_stacktrace': tb,
                                                                '_details': exception_details,
                                                                '_failed_task_n': t_counter}},
                                    exit=True)
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                if lp:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                else:
                    with open('FW_offline.json', 'r+') as f:
                        d = json.loads(f.read())
                        d['fwaction'] = m_action.to_dict()
                        d['state'] = 'FIZZLED'
                        f.seek(0)
                        f.write(json.dumps(d))
                        f.truncate()

                return True

            # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
            if os.path.exists('FWAction.json'):
                m_action = FWAction.from_file('FWAction.json')
            elif os.path.exists('FWAction.yaml'):
                m_action = FWAction.from_file('FWAction.yaml')

            if not m_action:
                m_action = FWAction()

            # update the global stored data with the data to store and update from this particular Task
            all_stored_data.update(m_action.stored_data)
            all_update_spec.update(m_action.update_spec)
            all_mod_spec.extend(m_action.mod_spec)

            # update spec for next task as well
            my_spec.update(m_action.update_spec)
            for mod in m_action.mod_spec:
                apply_mod(mod, my_spec)
            if lp:
                lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)

            if m_action.skip_remaining_tasks:
                break

        # add job packing info if this is needed
        if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
            all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

        # perform finishing operation
        stop_backgrounds(ping_stop, btask_stops)
        for b in btask_stops:
            b.set()
        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # last background monitors
        if '_background_tasks' in my_spec:
            for bt in my_spec['_background_tasks']:
                if bt.run_on_finish:
                    for task in bt.tasks:
                        task.run_task(m_fw.spec)

        m_action.stored_data = all_stored_data
        m_action.mod_spec = all_mod_spec
        m_action.update_spec = all_update_spec

        m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

        if lp:
            lp.complete_launch(launch_id, m_action, 'COMPLETED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'COMPLETED'
                d['completed_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True

    except:
        # problems while processing the results. high probability of malformed data.
        traceback.print_exc()
        stop_backgrounds(ping_stop, btask_stops)
        # restore initial state to prevent the raise of further exceptions
        if lp:
            lp.restore_backup_data(launch_id, m_fw.fw_id)

        do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
        # the action produced by the task is discarded
        m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                         '_task': None,
                                         '_exception': {'_stacktrace': traceback.format_exc(),
                                                        '_details': None}},
                            exit=True)

        try:
            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
        except:
            traceback.print_exc()

        if lp:
            lp.complete_launch(launch_id, m_action, 'FIZZLED')
        else:
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['fwaction'] = m_action.to_dict()
                d['state'] = 'FIZZLED'
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        return True
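All of these run() variants share the offline branch: with no LaunchPad available, the rocket reads FW.json from the current directory and journals its progress into FW_offline.json. A hedged sketch of driving that branch directly follows; it assumes FW.json and FW_offline.json were staged beforehand (normally done by the reservation tooling), and the fw_id is illustrative.

from fireworks.core.fworker import FWorker
from fireworks.core.rocket import Rocket

# launchpad=None selects the offline code path in run().
rocket = Rocket(launchpad=None, fworker=FWorker(), fw_id=1)
rocket.run()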
def add_wf_to_fws(self, new_wf, fw_ids, pull_spec_mods=True, detour=False):
    """
    Internal method to add a workflow as a child to a Firework
    Note: detours must have children that have STATE_RANK that is WAITING or below

    :param new_wf: (Workflow) New Workflow to add
    :param fw_ids: ([int]) ids of the parent Fireworks on which to add the Workflow
    :param pull_spec_mods: (bool) pull spec mods of COMPLETED parents
    :param detour: (bool) add children of the current Firework to the Workflow's leaves
    :return: ([int]) list of Firework ids that were updated or new
    """
    updated_ids = []

    root_ids = new_wf.root_fw_ids
    leaf_ids = new_wf.leaf_fw_ids

    for new_fw in new_wf.fws:
        if new_fw.fw_id >= 0:  # note - this is also used later in the 'detour' code
            raise ValueError(
                'FireWorks to add must use a negative fw_id! Got fw_id: {}'.format(new_fw.fw_id))

        self.id_fw[new_fw.fw_id] = new_fw  # add new_fw to id_fw

        for fw_id in fw_ids:
            if new_fw.fw_id in leaf_ids:
                if detour:
                    # make sure all of these links are WAITING, else the DETOUR is not well defined
                    ready_run = [(f >= 0 and Firework.STATE_RANKS[self.fw_states[f]] > 1)
                                 for f in self.links[fw_id]]
                    if any(ready_run):
                        raise ValueError(
                            "fw_id: {}: Detour option only works if all children of detours "
                            "are not READY to run and have not already run".format(fw_id))
                    # add children of current FW to new FW
                    self.links[new_fw.fw_id] = [f for f in self.links[fw_id] if f >= 0]
                else:
                    self.links[new_fw.fw_id] = []
            else:
                self.links[new_fw.fw_id] = new_wf.links[new_fw.fw_id]

        updated_ids.append(new_fw.fw_id)

    for fw_id in fw_ids:
        for root_id in root_ids:
            self.links[fw_id].append(root_id)  # add the root id as my child

            if pull_spec_mods:
                # re-apply some actions of the parent
                m_fw = self.id_fw[fw_id]  # get the parent FW
                m_launch = self._get_representative_launch(m_fw)  # get Launch of parent
                if m_launch:
                    # pull spec update
                    if m_launch.state == 'COMPLETED' and m_launch.action.update_spec:
                        new_wf.id_fw[root_id].spec.update(m_launch.action.update_spec)
                    # pull spec mods
                    if m_launch.state == 'COMPLETED' and m_launch.action.mod_spec:
                        for mod in m_launch.action.mod_spec:
                            apply_mod(mod, new_wf.id_fw[root_id].spec)

    for new_fw in new_wf.fws:
        updated_ids = self.refresh(new_fw.fw_id, set(updated_ids))

    return updated_ids