def runcmd(cmd, args=None, inputs=[], outputs=[], clobber=False):
    """
    Runs a command, checking for inputs and outputs

    Args:
        cmd : command string to run with os.system(), or a callable function
        args : optional list of arguments to pass to cmd if it is callable
        inputs : list of filename inputs that must exist before running
        outputs : list of output filenames that should be created
        clobber : if True, run even if outputs already exist

    Returns:
        error code from command or input/output checking; 0 is good

    TODO:
        Should it raise an exception instead?

    Notes:
        If any inputs are missing, don't run cmd.
        If outputs exist and have timestamps after all inputs, don't run cmd.
    """
    log = desispec.log.get_logger()

    #- Check that inputs exist
    err = 0
    input_time = 0  #- timestamp of latest input file
    for x in inputs:
        if not os.path.exists(x):
            log.error("missing input " + x)
            err = 1
        else:
            input_time = max(input_time, os.stat(x).st_mtime)
    if err > 0:
        return err

    #- Check if outputs already exist and that their timestamp is after
    #- the last input timestamp
    already_done = (not clobber) and (len(outputs) > 0)
    if not clobber:
        for x in outputs:
            if not os.path.exists(x):
                already_done = False
                break
            if len(inputs) > 0 and os.stat(x).st_mtime < input_time:
                already_done = False
                break

    if already_done:
        log.info("SKIPPING: {}".format(cmd))
        return 0

    #- Green light to go; print input/output info
    #- Use log.level to decide verbosity, but avoid long prefixes
    log.info(time.asctime())
    log.info("RUNNING: {}".format(cmd))
    if log.level <= desispec.log.INFO:
        if len(inputs) > 0:
            print("  Inputs")
            for x in inputs:
                print("   ", x)
        if len(outputs) > 0:
            print("  Outputs")
            for x in outputs:
                print("   ", x)

    #- run command
    if callable(cmd):
        if args is None:
            return cmd()
        else:
            return cmd(*args)
    else:
        if args is None:
            err = os.system(cmd)
        else:
            raise ValueError("Don't provide args unless cmd is function")

    log.info(time.asctime())
    if err > 0:
        log.critical("FAILED {}".format(cmd))
        return err

    #- Check for outputs
    err = 0
    for x in outputs:
        if not os.path.exists(x):
            log.error("missing output " + x)
            err = 2
    if err > 0:
        return err

    log.info("SUCCESS: {}".format(cmd))
    return 0
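# --- Illustrative sketch (not part of the original pipeline code) ---
# A minimal example of how runcmd() is typically invoked, in both its shell
# and callable forms.  The file names and the placeholder callable below are
# assumptions for illustration only, not real desispec APIs.
def _example_runcmd_usage():
    def preproc_exposure(rawfile, outfile):
        # Placeholder standing in for a real pipeline function.
        with open(outfile, 'w') as f:
            f.write("processed {}\n".format(rawfile))
        return 0

    # Shell-command form: the command is skipped if out.fits already exists
    # and is newer than raw.fits (unless clobber=True).
    err = runcmd("cp raw.fits out.fits",
                 inputs=["raw.fits"], outputs=["out.fits"])

    # Callable form: args are forwarded to the function and its return value
    # is passed back directly; output checking is skipped in this branch.
    err = runcmd(preproc_exposure, args=["raw.fits", "out.fits"],
                 inputs=["raw.fits"], outputs=["out.fits"], clobber=True)
    return err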
def retry_task(failpath, newopts=None):
    '''
    Attempt to re-run a failed task.

    This takes the path to a yaml file containing the information about a
    failed task (such a file is written by run_step() when a task fails).
    This yaml file contains the truncated dependency graph for the single
    task, as well as the options that were used when running the task.
    It also contains information about the number of processes that were
    being used.

    This function attempts to load mpi4py and use the MPI.COMM_WORLD
    communicator to re-run the task.  If COMM_WORLD has a different number
    of processes than were originally used, a warning is printed.  A warning
    is also printed if the options are being overridden.

    If the task completes successfully, the failed yaml file is deleted.

    Args:
        failpath (str): the path to the failure yaml file.
        newopts (dict): the dictionary of options to use in place of the
            original ones.

    Returns:
        Nothing.
    '''
    log = get_logger()

    if not os.path.isfile(failpath):
        raise RuntimeError("failure yaml file {} does not exist".format(failpath))

    fyml = None
    with open(failpath, 'r') as f:
        fyml = yaml.load(f)

    step = fyml['step']
    rawdir = fyml['rawdir']
    proddir = fyml['proddir']
    name = fyml['task']
    grph = fyml['graph']
    origopts = fyml['opts']
    nproc = fyml['procs']

    comm = None
    rank = 0

    if nproc > 1:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        nworld = comm.size
        rank = comm.rank
        if nworld != nproc:
            if rank == 0:
                log.warning("WARNING: original task was run with {} processes, re-running with {} instead".format(nproc, nworld))

    opts = origopts
    if newopts is not None:
        log.warning("WARNING: overriding original options")
        opts = newopts

    try:
        run_task(step, rawdir, proddir, grph, opts, comm=comm)
    except:
        log.error("Retry Failed")
        raise
    else:
        if rank == 0:
            os.remove(failpath)
    return
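# --- Illustrative sketch (not part of the original pipeline code) ---
# A hypothetical driver for retry_task().  The yaml path and the option
# override are placeholders; the failure file itself is whatever run_step()
# wrote when the task failed.  If the original task used more than one
# process, this script would be launched under MPI (e.g. "mpirun -np 20").
def _example_retry_failed_task():
    failpath = "prod/run/failed/20200101/extract_sometask.yaml"  # placeholder
    newopts = {"regularize": 0.0}  # illustrative override, not a real default

    # The warning about overriding the original options is printed inside
    # retry_task(); on success the failure yaml file is removed.
    retry_task(failpath, newopts=newopts)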
def run_step(step, rawdir, proddir, grph, opts, comm=None, taskproc=1):
    '''
    Run a whole single step of the pipeline.

    This function first takes the communicator and the requested processes
    per task and splits the communicator to form groups of processes of
    the desired size.

    It then takes the full dependency graph and extracts all the tasks for
    a given step.  These tasks are then distributed among the groups of
    processes.

    Each process group loops over its assigned tasks.  For each task, it
    redirects stdout/stderr to a per-task file and calls run_task().  If
    any process in the group throws an exception, then the traceback and
    all information (graph and options) needed to re-run the task are
    written to disk.

    After all process groups have finished, the state of the full graph is
    merged from all processes.  This way a failure of one process on one
    task will be propagated as a failed task to all processes.

    Args:
        step (str): the pipeline step to process.
        rawdir (str): the path to the raw data directory.
        proddir (str): the path to the production directory.
        grph (dict): the dependency graph.
        opts (dict): the global options.
        comm (mpi4py.Comm): the full communicator to use for whole step.
        taskproc (int): the number of processes to use for a single task.

    Returns:
        the updated dependency graph, with the tasks of this step marked
        as done or failed.
    '''
    log = get_logger()

    nproc = 1
    rank = 0
    if comm is not None:
        nproc = comm.size
        rank = comm.rank

    if taskproc > nproc:
        raise RuntimeError("cannot have {} processes per task with only {} processes".format(taskproc, nproc))

    # Get the tasks that need to be done for this step.  Mark all completed
    # tasks as done.
    tasks = None
    if rank == 0:
        # For this step, compute all the tasks that we need to do
        alltasks = []
        for name, nd in sorted(grph.items()):
            if nd['type'] in step_file_types[step]:
                alltasks.append(name)

        # For each task, prune if it is finished
        tasks = []
        for t in alltasks:
            if 'state' in grph[t]:
                if grph[t]['state'] != 'done':
                    tasks.append(t)
            else:
                tasks.append(t)

    if comm is not None:
        tasks = comm.bcast(tasks, root=0)
        grph = comm.bcast(grph, root=0)

    ntask = len(tasks)

    # Get the options for this step.
    options = opts[step]

    # Now every process has the full list of tasks.  If we have multiple
    # processes for each task, split the communicator.
    comm_group = comm
    comm_rank = None
    group = rank
    ngroup = nproc
    group_rank = 0
    if comm is not None:
        if taskproc > 1:
            ngroup = int(nproc / taskproc)
            group = int(rank / taskproc)
            group_rank = rank % taskproc
            comm_group = comm.Split(color=group, key=group_rank)
            comm_rank = comm.Split(color=group_rank, key=group)
        else:
            comm_group = None
            comm_rank = comm

    # Now we divide up the tasks among the groups of processes as
    # equally as possible.
    group_ntask = 0
    group_firsttask = 0

    if group < ngroup:
        # only assign tasks to whole groups
        if ntask < ngroup:
            if group < ntask:
                group_ntask = 1
                group_firsttask = group
            else:
                group_ntask = 0
        else:
            if step == 'zfind':
                # We load balance the bricks across process groups based
                # on the number of targets per brick.  All bricks with
                # < taskproc targets are weighted the same.
                if ntask <= ngroup:
                    # distribute uniform in this case
                    group_firsttask, group_ntask = dist_uniform(ntask, ngroup, group)
                else:
                    bricksizes = [grph[x]['ntarget'] for x in tasks]
                    worksizes = [taskproc if (x < taskproc) else x for x in bricksizes]
                    if rank == 0:
                        log.debug("zfind {} groups".format(ngroup))
                        workstr = ""
                        for w in worksizes:
                            workstr = "{}{} ".format(workstr, w)
                        log.debug("zfind work sizes = {}".format(workstr))
                    group_firsttask, group_ntask = dist_discrete(worksizes, ngroup, group)
                    if group_rank == 0:
                        worksum = np.sum(worksizes[group_firsttask:group_firsttask+group_ntask])
                        log.debug("group {} has tasks {}-{} sum = {}".format(group, group_firsttask, group_firsttask+group_ntask-1, worksum))
            else:
                group_firsttask, group_ntask = dist_uniform(ntask, ngroup, group)

    # every group goes and does its tasks...

    faildir = os.path.join(proddir, 'run', 'failed')
    logdir = os.path.join(proddir, 'run', 'logs')

    if group_ntask > 0:
        for t in range(group_firsttask, group_firsttask + group_ntask):
            # slice out just the graph for this task
            (night, gname) = graph_name_split(tasks[t])
            nfaildir = os.path.join(faildir, night)
            nlogdir = os.path.join(logdir, night)

            tgraph = graph_slice(grph, names=[tasks[t]], deps=True)
            ffile = os.path.join(nfaildir, "{}_{}.yaml".format(step, tasks[t]))

            # For this task, we will temporarily redirect stdout and stderr
            # to a task-specific log file.
            with stdouterr_redirected(to=os.path.join(nlogdir, "{}.log".format(gname)), comm=comm_group):
                try:
                    # if the step previously failed, clear that file now
                    if group_rank == 0:
                        if os.path.isfile(ffile):
                            os.remove(ffile)

                    log.debug("running step {} task {} (group {}/{} with {} processes)".format(step, tasks[t], (group+1), ngroup, taskproc))
                    run_task(step, rawdir, proddir, tgraph, options, comm=comm_group)

                    # mark step as done in our group's graph
                    graph_mark(grph, tasks[t], state='done', descend=False)

                except:
                    # The task threw an exception.  We want to dump all
                    # information that will be needed to re-run the
                    # run_task() function on just this task.
                    msg = "FAILED: step {} task {} (group {}/{} with {} processes)".format(step, tasks[t], (group+1), ngroup, taskproc)
                    log.error(msg)
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    log.error(''.join(lines))

                    fyml = {}
                    fyml['step'] = step
                    fyml['rawdir'] = rawdir
                    fyml['proddir'] = proddir
                    fyml['task'] = tasks[t]
                    fyml['graph'] = tgraph
                    fyml['opts'] = options
                    fyml['procs'] = taskproc
                    if not os.path.isfile(ffile):
                        log.error('Dumping yaml graph to ' + ffile)
                        # we are the first process to hit this
                        with open(ffile, 'w') as f:
                            yaml.dump(fyml, f, default_flow_style=False)

                    # mark the step as failed in our group's local graph
                    graph_mark(grph, tasks[t], state='fail', descend=True)

    if comm_group is not None:
        comm_group.barrier()

    # Now we take the graphs from all groups and merge their states
    if comm is not None:
        if group_rank == 0:
            graph_merge_state(grph, comm=comm_rank)
        if comm_group is not None:
            grph = comm_group.bcast(grph, root=0)

    return grph
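# --- Illustrative sketch (not part of the original pipeline code) ---
# The docstring above describes splitting the communicator into groups of
# taskproc processes and spreading tasks across those groups.  The helper
# below mimics the assumed semantics of dist_uniform() (return the first
# task index and the task count for one group); it is a standalone
# illustration of the group arithmetic, not the pipeline's own helper.
def _example_task_distribution(nproc=8, taskproc=2, ntask=10):
    def dist_uniform_sketch(ntask, ngroup, group):
        # Spread ntask contiguous items over ngroup groups as evenly as possible.
        base, leftover = divmod(ntask, ngroup)
        if group < leftover:
            return group * (base + 1), base + 1
        return leftover * (base + 1) + (group - leftover) * base, base

    ngroup = nproc // taskproc
    for rank in range(nproc):
        group = rank // taskproc      # which group of taskproc processes
        group_rank = rank % taskproc  # rank within that group
        first, n = dist_uniform_sketch(ntask, ngroup, group)
        if n > 0:
            print("rank {}: group {} (group_rank {}) handles tasks {}-{}".format(
                rank, group, group_rank, first, first + n - 1))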
def retry_task(failpath, newopts=None):
    '''
    Attempt to re-run a failed task.

    This takes the path to a yaml file containing the information about a
    failed task (such a file is written by run_step() when a task fails).
    This yaml file contains the truncated dependency graph for the single
    task, as well as the options that were used when running the task.
    It also contains information about the number of processes that were
    being used.

    This function attempts to load mpi4py and use the MPI.COMM_WORLD
    communicator to re-run the task.  If COMM_WORLD has a different number
    of processes than were originally used, a warning is printed.  A warning
    is also printed if the options are being overridden.

    If the task completes successfully, the failed yaml file is deleted.

    Args:
        failpath (str): the path to the failure yaml file.
        newopts (dict): the dictionary of options to use in place of the
            original ones.

    Returns:
        Nothing.
    '''
    log = get_logger()

    if not os.path.isfile(failpath):
        raise RuntimeError("failure yaml file {} does not exist".format(failpath))

    fyml = None
    with open(failpath, 'r') as f:
        fyml = yaml.load(f)

    step = fyml['step']
    rawdir = fyml['rawdir']
    proddir = fyml['proddir']
    name = fyml['task']
    grph = fyml['graph']
    origopts = fyml['opts']
    nproc = fyml['procs']

    comm = None
    rank = 0
    nworld = 1

    if nproc > 1:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        nworld = comm.size
        rank = comm.rank
        if nworld != nproc:
            if rank == 0:
                log.warning("WARNING: original task was run with {} processes, re-running with {} instead".format(nproc, nworld))

    opts = origopts
    if newopts is not None:
        log.warning("WARNING: overriding original options")
        opts = newopts

    logdir = os.path.join(proddir, 'run', 'logs')
    (night, gname) = graph_name_split(name)
    nlogdir = os.path.join(logdir, night)

    # For this task, we will temporarily redirect stdout and stderr
    # to a task-specific log file.
    tasklog = os.path.join(nlogdir, "{}.log".format(gname))
    if rank == 0:
        if os.path.isfile(tasklog):
            os.remove(tasklog)
    if comm is not None:
        comm.barrier()

    failcount = 0
    with stdouterr_redirected(to=tasklog, comm=comm):
        try:
            log.debug("re-trying step {}, task {} with {} processes".format(step, name, nworld))
            run_task(step, rawdir, proddir, grph, opts, comm=comm)
        except:
            failcount += 1
            msg = "FAILED: step {} task {} process {}".format(step, name, rank)
            log.error(msg)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
            log.error(''.join(lines))

    if comm is not None:
        comm.barrier()
        failcount = comm.allreduce(failcount)

    if rank == 0:
        if failcount > 0:
            log.error("{} of {} processes raised an exception".format(failcount, nworld))
        else:
            # success, clear failure file now
            if os.path.isfile(failpath):
                os.remove(failpath)

    return
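# --- Illustrative sketch (not part of the original pipeline code) ---
# For reference, the failure file consumed by retry_task() is the yaml dump
# written inside run_step() when a task raises an exception.  The keys below
# are exactly the ones run_step() writes; every value here is a placeholder.
def _example_failure_file(path="example_fail.yaml"):
    fail_example = {
        'step': 'extract',                    # pipeline step name (placeholder)
        'rawdir': '/data/raw',                # raw data directory (placeholder)
        'proddir': '/data/prod',              # production directory (placeholder)
        'task': 'extract_placeholder_task',   # name of the failed task (placeholder)
        'graph': {},                          # truncated dependency graph for this task
        'opts': {},                           # options in effect for the step
        'procs': 20,                          # number of processes originally used
    }
    with open(path, 'w') as f:
        yaml.dump(fail_example, f, default_flow_style=False)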
def run_step(step, rawdir, proddir, grph, opts, comm=None, taskproc=1):
    '''
    Run a whole single step of the pipeline.

    This function first takes the communicator and the requested processes
    per task and splits the communicator to form groups of processes of
    the desired size.

    It then takes the full dependency graph and extracts all the tasks for
    a given step.  These tasks are then distributed among the groups of
    processes.

    Each process group loops over its assigned tasks.  For each task, it
    redirects stdout/stderr to a per-task file and calls run_task().  If
    any process in the group throws an exception, then the traceback and
    all information (graph and options) needed to re-run the task are
    written to disk.

    After all process groups have finished, the state of the full graph is
    merged from all processes.  This way a failure of one process on one
    task will be propagated as a failed task to all processes.

    Args:
        step (str): the pipeline step to process.
        rawdir (str): the path to the raw data directory.
        proddir (str): the path to the production directory.
        grph (dict): the dependency graph.
        opts (dict): the global options.
        comm (mpi4py.Comm): the full communicator to use for whole step.
        taskproc (int): the number of processes to use for a single task.

    Returns:
        tuple: the updated dependency graph, the number of tasks in this
        step, and the number of tasks that failed.
    '''
    log = get_logger()

    nproc = 1
    rank = 0
    if comm is not None:
        nproc = comm.size
        rank = comm.rank

    if taskproc > nproc:
        raise RuntimeError("cannot have {} processes per task with only {} processes".format(taskproc, nproc))

    # Get the tasks that need to be done for this step.  Mark all completed
    # tasks as done.
    tasks = None
    if rank == 0:
        # For this step, compute all the tasks that we need to do
        alltasks = []
        for name, nd in sorted(grph.items()):
            if nd['type'] in step_file_types[step]:
                alltasks.append(name)

        # For each task, prune if it is finished
        tasks = []
        for t in alltasks:
            if 'state' in grph[t]:
                if grph[t]['state'] != 'done':
                    tasks.append(t)
            else:
                tasks.append(t)

    if comm is not None:
        tasks = comm.bcast(tasks, root=0)
        grph = comm.bcast(grph, root=0)

    ntask = len(tasks)

    # Get the options for this step.
    options = opts[step]

    # Now every process has the full list of tasks.  If we have multiple
    # processes for each task, split the communicator.
    comm_group = comm
    comm_rank = None
    group = rank
    ngroup = nproc
    group_rank = 0
    if comm is not None:
        if taskproc > 1:
            ngroup = int(nproc / taskproc)
            group = int(rank / taskproc)
            group_rank = rank % taskproc
            comm_group = comm.Split(color=group, key=group_rank)
            comm_rank = comm.Split(color=group_rank, key=group)
        else:
            comm_group = None
            comm_rank = comm

    # Now we divide up the tasks among the groups of processes as
    # equally as possible.
    group_ntask = 0
    group_firsttask = 0

    if group < ngroup:
        # only assign tasks to whole groups
        if ntask < ngroup:
            if group < ntask:
                group_ntask = 1
                group_firsttask = group
            else:
                group_ntask = 0
        else:
            if step == 'zfind':
                # We load balance the bricks across process groups based
                # on the number of targets per brick.  All bricks with
                # < taskproc targets are weighted the same.
                if ntask <= ngroup:
                    # distribute uniform in this case
                    group_firsttask, group_ntask = dist_uniform(ntask, ngroup, group)
                else:
                    bricksizes = [grph[x]['ntarget'] for x in tasks]
                    worksizes = [taskproc if (x < taskproc) else x for x in bricksizes]
                    if rank == 0:
                        log.debug("zfind {} groups".format(ngroup))
                        workstr = ""
                        for w in worksizes:
                            workstr = "{}{} ".format(workstr, w)
                        log.debug("zfind work sizes = {}".format(workstr))
                    group_firsttask, group_ntask = dist_discrete(worksizes, ngroup, group)
                    if group_rank == 0:
                        worksum = np.sum(worksizes[group_firsttask:group_firsttask+group_ntask])
                        log.debug("group {} has tasks {}-{} sum = {}".format(group, group_firsttask, group_firsttask+group_ntask-1, worksum))
            else:
                group_firsttask, group_ntask = dist_uniform(ntask, ngroup, group)

    # every group goes and does its tasks...

    faildir = os.path.join(proddir, 'run', 'failed')
    logdir = os.path.join(proddir, 'run', 'logs')

    failcount = 0
    group_failcount = 0

    if group_ntask > 0:
        for t in range(group_firsttask, group_firsttask + group_ntask):
            # slice out just the graph for this task
            (night, gname) = graph_name_split(tasks[t])

            # check if all inputs exist
            missing = 0
            if group_rank == 0:
                for iname in grph[tasks[t]]['in']:
                    ind = grph[iname]
                    fspath = graph_path(rawdir, proddir, iname, ind['type'])
                    if not os.path.exists(fspath):
                        missing += 1
                        log.error("skipping step {} task {} due to missing input {}".format(step, tasks[t], fspath))
            if comm_group is not None:
                missing = comm_group.bcast(missing, root=0)
            if missing > 0:
                if group_rank == 0:
                    group_failcount += 1
                continue

            nfaildir = os.path.join(faildir, night)
            nlogdir = os.path.join(logdir, night)

            tgraph = graph_slice(grph, names=[tasks[t]], deps=True)
            ffile = os.path.join(nfaildir, "{}_{}.yaml".format(step, tasks[t]))

            # For this task, we will temporarily redirect stdout and stderr
            # to a task-specific log file.
            tasklog = os.path.join(nlogdir, "{}.log".format(gname))
            if group_rank == 0:
                if os.path.isfile(tasklog):
                    os.remove(tasklog)
            if comm_group is not None:
                comm_group.barrier()

            with stdouterr_redirected(to=tasklog, comm=comm_group):
                try:
                    # if the step previously failed, clear that file now
                    if group_rank == 0:
                        if os.path.isfile(ffile):
                            os.remove(ffile)

                    log.debug("running step {} task {} (group {}/{} with {} processes)".format(step, tasks[t], (group+1), ngroup, taskproc))

                    # All processes in comm_group will either return from
                    # this or ALL will raise an exception
                    run_task(step, rawdir, proddir, tgraph, options, comm=comm_group)

                    # mark step as done in our group's graph
                    graph_mark(grph, tasks[t], state='done', descend=False)

                except:
                    # The task threw an exception.  We want to dump all
                    # information that will be needed to re-run the
                    # run_task() function on just this task.
                    if group_rank == 0:
                        group_failcount += 1

                    msg = "FAILED: step {} task {} (group {}/{} with {} processes)".format(step, tasks[t], (group+1), ngroup, taskproc)
                    log.error(msg)
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
                    log.error(''.join(lines))

                    fyml = {}
                    fyml['step'] = step
                    fyml['rawdir'] = rawdir
                    fyml['proddir'] = proddir
                    fyml['task'] = tasks[t]
                    fyml['graph'] = tgraph
                    fyml['opts'] = options
                    fyml['procs'] = taskproc
                    if not os.path.isfile(ffile):
                        log.error('Dumping yaml graph to ' + ffile)
                        # we are the first process to hit this
                        with open(ffile, 'w') as f:
                            yaml.dump(fyml, f, default_flow_style=False)

                    # mark the step as failed in our group's local graph
                    graph_mark(grph, tasks[t], state='fail', descend=True)

    if comm_group is not None:
        group_failcount = comm_group.bcast(group_failcount, root=0)

    # Now we take the graphs from all groups and merge their states
    failcount = group_failcount
    if comm is not None:
        if group_rank == 0:
            graph_merge_state(grph, comm=comm_rank)
            failcount = comm_rank.allreduce(failcount)
        if comm_group is not None:
            grph = comm_group.bcast(grph, root=0)
            failcount = comm_group.bcast(failcount, root=0)

    return grph, ntask, failcount
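# --- Illustrative sketch (not part of the original pipeline code) ---
# A hypothetical driver that runs a sequence of steps with run_step() and
# stops as soon as one step reports failures.  The list of step names is an
# assumption for illustration; only the (grph, ntask, failcount) return
# signature comes from the function above.
def _example_run_steps(steps, rawdir, proddir, grph, opts, comm=None, taskproc=1):
    log = get_logger()
    for step in steps:
        grph, ntask, failcount = run_step(step, rawdir, proddir, grph, opts,
                                          comm=comm, taskproc=taskproc)
        if failcount > 0:
            log.error("step {}: {} of {} tasks failed; stopping".format(step, failcount, ntask))
            break
        log.info("step {}: {} tasks completed".format(step, ntask))
    return grph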