import os
import sys
import time
import logging
from contextlib import contextmanager

import desispec.log
from desispec.log import get_logger

# Note: the pipeline helpers used below (graph_read_prod, prod_state,
# read_options, run_step, run_step_types, step_file_types, graph_mark,
# graph_write, graph_dot) are assumed to be provided by the surrounding
# desispec.pipeline module.


@contextmanager
def stdouterr_redirected(to=os.devnull, comm=None):
    '''
    Redirect stdout and stderr to a file, capturing both Python and
    C-level output.

    Based on http://stackoverflow.com/questions/5081657

    Example usage:

        import os

        with stdouterr_redirected(to=filename):
            print("from Python")
            os.system("echo non-Python applications are also supported")
    '''
    fd = sys.stdout.fileno()
    fde = sys.stderr.fileno()

    ##### assert that Python and C stdio write using the same file descriptor
    ####assert libc.fileno(ctypes.c_void_p.in_dll(libc, "stdout")) == fd == 1

    def _redirect_stdout(to):
        sys.stdout.close()                # + implicit flush()
        os.dup2(to.fileno(), fd)          # fd writes to 'to' file
        sys.stdout = os.fdopen(fd, 'w')   # Python writes to fd
        sys.stderr.close()                # + implicit flush()
        os.dup2(to.fileno(), fde)         # fde writes to 'to' file
        sys.stderr = os.fdopen(fde, 'w')  # Python writes to fde
        # update desi logging to use new stdout
        log = get_logger()
        while len(log.handlers) > 0:
            h = log.handlers[0]
            log.removeHandler(h)
        # Add the current stdout.
        ch = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            '%(levelname)s:%(filename)s:%(lineno)s:%(funcName)s: %(message)s')
        ch.setFormatter(formatter)
        log.addHandler(ch)

    with os.fdopen(os.dup(fd), 'w') as old_stdout:
        if comm is None:
            with open(to, 'w') as file:
                _redirect_stdout(to=file)
        else:
            # serialize the redirection across ranks
            for p in range(comm.size):
                if p == comm.rank:
                    with open(to, 'w') as file:
                        _redirect_stdout(to=file)
                comm.barrier()
        # _redirect_stdout() re-pointed the desispec logger at the new
        # stdout; fetch the (singleton) logger again in this scope
        log = get_logger()
        try:
            if (comm is None) or (comm.rank == 0):
                log.info("Begin log redirection to {} at {}".format(to, time.asctime()))
            sys.stdout.flush()
            yield  # allow code to be run with the redirected stdout
        finally:
            if (comm is None) or (comm.rank == 0):
                log.info("End log redirection to {} at {}".format(to, time.asctime()))
            sys.stdout.flush()
            _redirect_stdout(to=old_stdout)  # restore stdout.
            # buffering and flags such as CLOEXEC may be different
    return
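# A minimal usage sketch for the MPI case (an illustration, not part of the
# original module; the per-rank filename pattern below is hypothetical):
#
#   from mpi4py import MPI
#
#   comm = MPI.COMM_WORLD
#   logfile = "rank_{:03d}.log".format(comm.rank)   # hypothetical pattern
#   with stdouterr_redirected(to=logfile, comm=comm):
#       print("rank {} writes here".format(comm.rank))
#       os.system("echo C-level output is captured too")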
def runcmd(cmd, args=None, inputs=[], outputs=[], clobber=False):
    """
    Runs a command, checking for inputs and outputs

    Args:
        cmd : command string to run with os.system(), or a callable function
        args : list of arguments to pass to cmd if it is callable; must be
            None if cmd is a command string
        inputs : list of filename inputs that must exist before running
        outputs : list of output filenames that should be created
        clobber : if True, run even if outputs already exist

    Returns:
        error code from command or input/output checking; 0 is good

    TODO:
        Should it raise an exception instead?

    Notes:
        If any inputs are missing, don't run cmd.
        If outputs exist and have timestamps after all inputs, don't run cmd.
    """
    log = desispec.log.get_logger()

    #- Check that inputs exist
    err = 0
    input_time = 0  #- timestamp of latest input file
    for x in inputs:
        if not os.path.exists(x):
            log.error("missing input " + x)
            err = 1
        else:
            input_time = max(input_time, os.stat(x).st_mtime)

    if err > 0:
        return err

    #- Check if outputs already exist and that their timestamp is after
    #- the last input timestamp
    already_done = (not clobber) and (len(outputs) > 0)
    if not clobber:
        for x in outputs:
            if not os.path.exists(x):
                already_done = False
                break
            if len(inputs) > 0 and os.stat(x).st_mtime < input_time:
                already_done = False
                break

    if already_done:
        log.info("SKIPPING: {}".format(cmd))
        return 0

    #- Green light to go; print input/output info
    #- Use log.level to decide verbosity, but avoid long prefixes
    log.info(time.asctime())
    log.info("RUNNING: {}".format(cmd))
    if log.level <= desispec.log.INFO:
        if len(inputs) > 0:
            print("  Inputs")
            for x in inputs:
                print("   ", x)
        if len(outputs) > 0:
            print("  Outputs")
            for x in outputs:
                print("   ", x)

    #- run command
    if callable(cmd):
        if args is None:
            return cmd()
        else:
            return cmd(*args)
    else:
        if args is None:
            err = os.system(cmd)
        else:
            raise ValueError("Don't provide args unless cmd is a function")

    log.info(time.asctime())
    if err > 0:
        log.critical("FAILED {}".format(cmd))
        return err

    #- Check for outputs
    err = 0
    for x in outputs:
        if not os.path.exists(x):
            log.error("missing output " + x)
            err = 2
    if err > 0:
        return err

    log.info("SUCCESS: {}".format(cmd))
    return 0
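# A short usage sketch (an illustration, not part of the original module;
# the filenames are hypothetical).  runcmd() accepts either a shell command
# string or a Python callable:
#
#   # shell command: skipped if raw.dat.gz already exists and is newer
#   # than raw.dat
#   err = runcmd("gzip -k raw.dat", inputs=["raw.dat"], outputs=["raw.dat.gz"])
#
#   # callable form: args is only allowed here, and the callable's return
#   # value is passed through as the error code (my_function is hypothetical)
#   err = runcmd(my_function, args=[arg1, arg2])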
def run_steps(first, last, rawdir, proddir, spectrographs=None, nightstr=None, comm=None):
    '''
    Run multiple sequential pipeline steps.

    For each step, this function takes the communicator and the requested
    processes per task and splits the communicator to form groups of
    processes of the desired size.  It then takes the full dependency
    graph and extracts all the tasks for that step.  These tasks are then
    distributed among the groups of processes.

    Each process group loops over its assigned tasks.  For each task, it
    redirects stdout/stderr to a per-task file and calls run_task().  If
    any process in the group throws an exception, then the traceback and
    all information (graph and options) needed to re-run the task are
    written to disk.

    After all process groups have finished, the state of the full graph
    is merged from all processes.  This way a failure of one process on
    one task will be propagated as a failed task to all processes.

    Args:
        first (str): the first pipeline step to run, or None to start at
            the beginning.
        last (str): the last pipeline step to run, or None to run through
            the end.
        rawdir (str): the path to the raw data directory.
        proddir (str): the path to the production directory.
        spectrographs: the spectrographs to include, or None for all.
        nightstr (str): the night(s) to include, or None for all.
        comm (mpi4py.Comm): the full communicator to use for all steps.

    Returns:
        Nothing.
    '''
    log = get_logger()

    rank = 0
    nproc = 1
    if comm is not None:
        rank = comm.rank
        nproc = comm.size

    # get the full graph
    grph = None
    if rank == 0:
        grph = graph_read_prod(proddir, nightstr=nightstr, spectrographs=spectrographs)
        prod_state(rawdir, proddir, grph)
    if comm is not None:
        grph = comm.bcast(grph, root=0)

    # read run options from disk
    rundir = os.path.join(proddir, "run")
    optfile = os.path.join(rundir, "options.yaml")
    opts = None
    if rank == 0:
        opts = read_options(optfile)
    if comm is not None:
        opts = comm.bcast(opts, root=0)

    # compute the ordered list of steps to run
    firststep = None
    if first is None:
        firststep = 0
    else:
        s = 0
        for st in run_step_types:
            if st == first:
                firststep = s
            s += 1

    laststep = None
    if last is None:
        laststep = len(run_step_types)
    else:
        s = 1
        for st in run_step_types:
            if st == last:
                laststep = s
            s += 1

    if rank == 0:
        log.info("running steps {} to {}".format(run_step_types[firststep],
            run_step_types[laststep - 1]))

    # Assign the desired number of processes per task
    steptaskproc = {}
    for st in run_step_types:
        steptaskproc[st] = 1
    steptaskproc['bootcalib'] = 1
    steptaskproc['specex'] = 20
    steptaskproc['psfcombine'] = 1
    steptaskproc['extract'] = 20
    steptaskproc['fiberflat'] = 1
    steptaskproc['sky'] = 1
    steptaskproc['stdstars'] = 1
    steptaskproc['fluxcal'] = 1
    steptaskproc['procexp'] = 1
    steptaskproc['zfind'] = 48

    jobid = None
    if rank == 0:
        if 'SLURM_JOBID' in os.environ:
            jobid = "slurm-{}".format(os.environ['SLURM_JOBID'])
        else:
            jobid = os.getpid()

    statefile = None
    statedot = None
    if rank == 0:
        stateroot = "state_{}-{}_{}".format(run_step_types[firststep],
            run_step_types[laststep - 1], jobid)
        statefile = os.path.join(rundir, "{}.yaml".format(stateroot))
        statedot = os.path.join(rundir, "{}.dot".format(stateroot))

    # Mark our steps as in progress
    for st in range(firststep, laststep):
        for name, nd in grph.items():
            if nd['type'] in step_file_types[run_step_types[st]]:
                if 'state' in nd:
                    if nd['state'] != 'done':
                        graph_mark(grph, name, 'wait')
                else:
                    graph_mark(grph, name, 'wait')

    if rank == 0:
        graph_write(statefile, grph)
        with open(statedot, 'w') as f:
            graph_dot(grph, f)

    # Run the steps.  Each step updates the graph in place to track
    # the state of all nodes.
    for st in range(firststep, laststep):
        if rank == 0:
            log.info("starting step {} at {}".format(run_step_types[st],
                time.asctime()))
        taskproc = steptaskproc[run_step_types[st]]
        if taskproc > nproc:
            taskproc = nproc
        grph, ntask, failtask = run_step(run_step_types[st], rawdir, proddir,
            grph, opts, comm=comm, taskproc=taskproc)
        if rank == 0:
            log.info("completed step {} at {}".format(run_step_types[st],
                time.asctime()))
            log.info("  {} total tasks, {} failures".format(ntask, failtask))
            graph_write(statefile, grph)
            with open(statedot, 'w') as f:
                graph_dot(grph, f)
        if ntask == failtask:
            if rank == 0:
                log.info("step {}: all tasks failed, quitting at {}".format(
                    run_step_types[st], time.asctime()))
            break
        if comm is not None:
            comm.barrier()

    if rank == 0:
        log.info("finished steps {} to {}".format(run_step_types[firststep],
            run_step_types[laststep - 1]))

    return
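# A minimal driver sketch (an illustration of one possible invocation, not
# part of the original module; the paths and night are hypothetical, and
# comm may be None for serial runs).  'bootcalib' and 'zfind' are the first
# and last step names configured in steptaskproc above:
#
#   from mpi4py import MPI
#
#   comm = MPI.COMM_WORLD
#   run_steps('bootcalib', 'zfind', '/data/raw', '/data/prod',
#             nightstr='20200101', comm=comm)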