def postprocess(self):
    """Submit a postprocessing script after collation"""
    assert self.postscript

    envmod.setup()
    envmod.module('load', 'pbs')

    cmd = 'qsub {script}'.format(script=self.postscript)

    cmd = shlex.split(cmd)
    rc = sp.call(cmd)
    assert rc == 0, 'Postprocessing script submission failed.'
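# NOTE: If the job ID of the postprocessing job were ever needed (e.g. to
# chain qsub dependencies), the submission could capture qsub's stdout
# instead of discarding it.  A minimal sketch, assuming only the standard
# PBS behaviour of printing the new job ID on success;
# `submit_and_get_jobid` is a hypothetical helper, not part of payu.
import shlex
import subprocess as sp


def submit_and_get_jobid(script):
    # qsub prints the new job ID (e.g. '1234567.gadi-pbs') on stdout;
    # check_output raises CalledProcessError on a nonzero exit status.
    out = sp.check_output(shlex.split('qsub {}'.format(script)))
    return out.decode().strip()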
def repython(version, script_path):
    """Update the Python environment modules to the specified ``version``
    and replace the current process with an updated Python execution
    running the script specified by ``script_path``.
    """
    # Establish the environment modules
    envmod.setup()

    # NOTE: Indexing os.environ directly would raise KeyError when modules
    #       are unavailable, which is exactly the case this check handles.
    if not os.environ.get('MODULESHOME'):
        print('payu: warning: Environment modules unavailable; aborting '
              'reversion.')
        return

    # Ensure that payu is loaded
    try:
        envmod.module('use', os.environ['PAYU_MODULEPATH'])
        envmod.module('load', os.environ['PAYU_MODULENAME'])
    except KeyError:
        pass

    # NOTE: Older versions (<2.7) require the version as a tuple
    version_tuple = tuple(int(i) for i in version.split('.'))

    module_name = os.path.join('python', version)

    python_modules = [
        m for m in os.environ['LOADEDMODULES'].split(':')
        if m.startswith('python')
    ]

    if sys.version_info < version_tuple or module_name not in python_modules:
        # First unload all python (and supporting) modules
        for mod in python_modules:
            envmod.module('unload', mod)

        # Replace with specified version
        envmod.module('load', module_name)

        # Replace the current python process with the updated version
        os.execl(script_path, *sys.argv)
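# NOTE: The os.execl call above is what makes reversion work: it replaces
# the running interpreter in place rather than spawning a child, so the
# script restarts under the newly loaded Python with the same PID and
# arguments.  A toy illustration of the same pattern (the interpreter name
# here is hypothetical):
import os
import sys

if sys.version_info < (3, 8):
    # Re-exec this same script under a newer interpreter found on PATH.
    # Nothing after this line runs in the old process if exec succeeds.
    os.execlp('python3.8', 'python3.8', *sys.argv)

print('Running under Python {}.{}'.format(*sys.version_info[:2]))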
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode / 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '"{0}"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: `list.append` returns None, so its result must not
                #       be assigned back to `model_prog`.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    #       correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)

            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])

            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
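# NOTE: `mkdir_p` and `make_symlink` are called by run() above but not
# defined in this listing.  The sketches below are plausible
# implementations consistent with how they are used; the actual payu
# utilities may differ.
import errno
import os


def mkdir_p(path):
    # Create a directory tree, ignoring 'already exists' errors
    # (the equivalent of `mkdir -p`).
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise


def make_symlink(src, link_name):
    # Point `link_name` at `src`, replacing any stale existing link.
    try:
        os.symlink(src, link_name)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.islink(link_name):
            os.remove(link_name)
            os.symlink(src, link_name)
        else:
            raise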
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript to the scheduler."""

    # Initialisation
    if pbs_vars is None:
        pbs_vars = {}

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {queue}'.format(queue=pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {project}'.format(project=pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{key}={val}'.format(key=res_key, val=res_val))
        if res_flags:
            pbs_flags.append('-l {res}'.format(res=','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {name}'.format(name=pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {priority}'.format(priority=pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {join}'.format(join=pbs_join))

    # Append environment variables to qsub command
    # TODO: Support full export of environment variables: `qsub -V`
    pbs_vstring = ','.join('{0}={1}'.format(k, v)
                           for k, v in pbs_vars.items())
    pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    if not os.path.isabs(pbs_script):
        # NOTE: PAYU_PATH is always set if `set_env_vars` was always called.
        #       This is currently always true, but is not explicitly
        #       enforced, so this conditional check is a bit redundant.
        payu_bin = pbs_vars.get('PAYU_PATH', os.path.dirname(sys.argv[0]))
        pbs_script = os.path.join(payu_bin, pbs_script)
        assert os.path.isfile(pbs_script)

    # Set up environment modules here for PBS.
    envmod.setup()
    envmod.module('load', 'pbs')

    # Construct job submission command
    cmd = 'qsub {flags} -- {python} {script}'.format(
        flags=' '.join(pbs_flags),
        python=sys.executable,
        script=pbs_script
    )
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
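# NOTE: A usage sketch for submit_job above.  All configuration values here
# are hypothetical, and the call really submits a job; the comment shows
# the shape of the command it prints first.
pbs_config = {
    'queue': 'express',
    'project': 'x77',
    'walltime': '02:00:00',
    'ncpus': 48,
    'mem': '190GB',
    'jobname': 'ocean-control-experiment',  # truncated to 'ocean-control-e'
}
pbs_vars = {'PAYU_CURRENT_RUN': 3}

submit_job('payu-run', pbs_config, pbs_vars)
# Expected shape of the printed command (paths abbreviated):
#   qsub -q express -P x77 -l walltime=02:00:00 -l ncpus=48 -l mem=190GB
#       -N ocean-control-e -l wd -j n -v PAYU_CURRENT_RUN=3
#       -- /path/to/python /path/to/bin/payu-run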
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(
                model.exec_path_local,
                'libmpi.so'
            )

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode / 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '"{0}"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: `list.append` returns None, so its result must not
                #       be assigned back to `model_prog`.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to symlinked exec_name in work as some
        # older MPI libraries complained executable was not in PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ), default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    #       correctly updated.  Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = get_job_id(short=False)

        if job_id == '':
            job_id = str(self.run_id)[:6]

        for fname in self.output_fnames:

            src = os.path.join(self.control_path, fname)

            stem, suffix = os.path.splitext(fname)
            dest = os.path.join(error_log_dir,
                                ".".join((stem, job_id)) + suffix)

            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        error_script = self.userscripts.get('error')
        if error_script:
            self.run_userscript(error_script)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in self.output_fnames:
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
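# NOTE: `enable_core_dump` is called by run() above but not shown in this
# listing.  A plausible implementation (an assumption, not the verified
# payu helper) simply lifts the core-file size limit before the MPI launch:
import resource


def enable_core_dump():
    # Raise the core file size limit so a crashing model process can write
    # a core dump; RLIM_INFINITY removes the limit entirely.  Raising the
    # hard limit may require privilege on some systems.
    resource.setrlimit(resource.RLIMIT_CORE,
                       (resource.RLIM_INFINITY, resource.RLIM_INFINITY))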
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript to the scheduler."""

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {}'.format(pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {}'.format(pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{}={}'.format(res_key, res_val))
        if res_flags:
            pbs_flags.append('-l {}'.format(','.join(res_flags)))

    pbs_jobname = pbs_config.get('jobname')
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {}'.format(pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {}'.format(pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'oe')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {}'.format(pbs_join))

    if pbs_vars:
        # NOTE: `dict.iteritems` is Python 2 only; `items` works everywhere
        pbs_vstring = ','.join('{}={}'.format(k, v)
                               for k, v in pbs_vars.items())
        pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    # Enable PBS, in case it's not available
    envmod.setup()
    envmod.module('load', 'pbs')

    # If the script path is not absolute, then check the PATH directories
    if not os.path.isabs(pbs_script):
        for path in os.environ['PATH'].split(':'):
            if os.path.isdir(path) and pbs_script in os.listdir(path):
                pbs_script = os.path.join(path, pbs_script)
                break

    # Construct full command
    cmd = 'qsub {} {}'.format(' '.join(pbs_flags), pbs_script)
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
def collate(self):
    # Set the stacksize to be unlimited
    res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

    collate_config = self.expt.config.get('collate', {})

    # The mpi flag implies using mppnccombine-fast
    mpi = collate_config.get('mpi', False)

    if mpi:
        # Must use envmod to be able to load mpi modules for collation
        envmod.setup()
        self.expt.load_modules()
        default_exe = 'mppnccombine-fast'
    else:
        default_exe = 'mppnccombine'

    # Locate the FMS collation tool
    # Check config for collate executable
    mppnc_path = collate_config.get('exe')
    if mppnc_path is None:
        for f in os.listdir(self.expt.lab.bin_path):
            if f == default_exe:
                mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                break
    else:
        if not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

    assert mppnc_path, 'No mppnccombine program found'

    # Check config for collate command line options
    collate_flags = collate_config.get('flags')
    if collate_flags is None:
        if mpi:
            collate_flags = '-r'
        else:
            collate_flags = '-n4 -z -m -r'

    if mpi:
        # The output file is the first argument after the flags
        # and mppnccombine-fast uses an explicit -o flag to specify
        # the output
        collate_flags = " ".join([collate_flags, '-o'])
        envmod.lib_update(mppnc_path, 'libmpi.so')

    # Import list of collated files to ignore
    collate_ignore = collate_config.get('ignore')
    if collate_ignore is None:
        collate_ignore = []
    elif not isinstance(collate_ignore, list):
        collate_ignore = [collate_ignore]

    # Generate collated file list and identify the first tile
    tile_fnames = {}
    fnames = Fms.get_uncollated_files(self.output_path)
    tile_fnames[self.output_path] = fnames
    print(tile_fnames)

    if (collate_config.get('restart', False) and
            self.prior_restart_path is not None):
        # Add uncollated restart files
        fnames = Fms.get_uncollated_files(self.prior_restart_path)
        tile_fnames[self.prior_restart_path] = fnames

    # mnc_tiles = defaultdict(list)
    mnc_tiles = defaultdict(defaultdict(list).copy)
    for t_dir in tile_fnames:
        for t_fname in tile_fnames[t_dir]:
            t_base, t_ext = os.path.splitext(t_fname)
            t_ext = t_ext.lstrip('.')

            # Skip any files listed in the ignore list
            if t_base in collate_ignore:
                continue

            mnc_tiles[t_dir][t_base].append(t_fname)

    # print(mnc_tiles)

    if mpi and collate_config.get('glob', True):
        # NOTE: `mnc_tiles` is keyed by directory and then by file stem,
        #       so both levels must be traversed, and each directory must
        #       be listed in turn (not just `self.output_path`).
        for t_dir in mnc_tiles:
            for t_base in mnc_tiles[t_dir]:
                globstr = "{}.*".format(t_base)
                # Try an equivalent glob and check the same files are
                # returned
                mnc_glob = fnmatch.filter(os.listdir(t_dir), globstr)
                if mnc_tiles[t_dir][t_base] == sorted(mnc_glob):
                    mnc_tiles[t_dir][t_base] = [globstr, ]
                    print("Note: using globstr ({}) for collating {}"
                          .format(globstr, t_base))
                else:
                    print("Warning: cannot use globstr {} to collate {}"
                          .format(globstr, t_base))
                    if (len(mnc_tiles[t_dir][t_base]) >
                            MPI_FORK_MAX_FILE_LIMIT):
                        print("Warning: large number of tiles: {} "
                              .format(len(mnc_tiles[t_dir][t_base])))
                        print("Warning: collation will be slow and may fail")

    cpucount = int(collate_config.get('ncpus',
                   multiprocessing.cpu_count()))

    if mpi:
        # Default to one for mpi
        nprocesses = int(collate_config.get('threads', 1))
    else:
        nprocesses = int(collate_config.get('threads', cpucount))

    ncpusperprocess = int(cpucount / nprocesses)

    if ncpusperprocess == 1 and mpi:
        print("Warning: running collate with mpirun on a single processor")

    pool = multiprocessing.Pool(processes=nprocesses)

    # Collate each tileset into a single file
    results = []
    codes = []
    outputs = []
    for output_path in mnc_tiles:
        for nc_fname in mnc_tiles[output_path]:
            nc_path = os.path.join(output_path, nc_fname)

            # Remove the collated file if it already exists, since it is
            # probably from a failed collation attempt
            # TODO: Validate this somehow
            if os.path.isfile(nc_path):
                os.remove(nc_path)

            cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                            ' '.join(mnc_tiles[output_path][nc_fname])])
            if mpi:
                cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)

            print(cmd)
            results.append(
                pool.apply_async(cmdthread, args=(cmd, output_path)))

    pool.close()
    pool.join()

    for result in results:
        rc, op = result.get()
        codes.append(rc)
        outputs.append(op)

    # TODO: Categorise the return codes
    if any(rc is not None for rc in codes):
        for p, rc, op in zip(count(), codes, outputs):
            if rc is not None:
                print('payu: error: Thread {p} crashed with error code '
                      '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                print(' Error message:', file=sys.stderr)
                print(op.decode(), file=sys.stderr)
        sys.exit(-1)
def collate(self):
    # Set the stacksize to be unlimited
    res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

    collate_config = self.expt.config.get('collate', {})

    # The mpi flag implies using mppnccombine-fast
    mpi = collate_config.get('mpi', False)

    if mpi:
        # Must use envmod to be able to load mpi modules for collation
        envmod.setup()
        self.expt.load_modules()
        default_exe = 'mppnccombine-fast'
    else:
        default_exe = 'mppnccombine'

    # Locate the FMS collation tool
    # Check config for collate executable
    mppnc_path = collate_config.get('exe')
    if mppnc_path is None:
        for f in os.listdir(self.expt.lab.bin_path):
            if f == default_exe:
                mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                break
    else:
        if not os.path.isabs(mppnc_path):
            mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

    assert mppnc_path, 'No mppnccombine program found'

    # Check config for collate command line options
    collate_flags = collate_config.get('flags')
    if collate_flags is None:
        if mpi:
            collate_flags = '-r'
        else:
            collate_flags = '-n4 -z -m -r'

    if mpi:
        # The output file is the first argument after the flags
        # and mppnccombine-fast uses an explicit -o flag to specify
        # the output
        collate_flags = " ".join([collate_flags, '-o'])
        envmod.lib_update(mppnc_path, 'libmpi.so')

    # Import list of collated files to ignore
    collate_ignore = collate_config.get('ignore')
    if collate_ignore is None:
        collate_ignore = []
    elif not isinstance(collate_ignore, list):
        collate_ignore = [collate_ignore]

    # Generate collated file list and identify the first tile
    tile_fnames = [f for f in os.listdir(self.output_path)
                   if f[-4:].isdigit() and f[-8:-4] == '.nc.']
    tile_fnames.sort()

    mnc_tiles = defaultdict(list)
    for t_fname in tile_fnames:
        t_base, t_ext = os.path.splitext(t_fname)
        t_ext = t_ext.lstrip('.')

        # Skip any files listed in the ignore list
        if t_base in collate_ignore:
            continue

        mnc_tiles[t_base].append(t_fname)

    if mpi and collate_config.get('glob', True):
        for t_base in mnc_tiles:
            globstr = "{}.*".format(t_base)
            # Try an equivalent glob and check the same files are returned
            # NOTE: `os.listdir` takes no pattern argument; the pattern
            #       must be applied by `fnmatch.filter` itself.
            mnc_glob = fnmatch.filter(os.listdir(self.output_path), globstr)
            if mnc_tiles[t_base] == sorted(mnc_glob):
                mnc_tiles[t_base] = [globstr, ]
                print("Note: using globstr ({}) for collating {}"
                      .format(globstr, t_base))
            else:
                print("Warning: cannot use globstr {} to collate {}"
                      .format(globstr, t_base))
                if len(mnc_tiles[t_base]) > MPI_FORK_MAX_FILE_LIMIT:
                    print("Warning: large number of tiles: {} "
                          .format(len(mnc_tiles[t_base])))
                    print("Warning: collation will be slow and may fail")

    cpucount = int(collate_config.get('ncpus',
                   multiprocessing.cpu_count()))

    if mpi:
        # Default to one for mpi
        nprocesses = int(collate_config.get('threads', 1))
    else:
        nprocesses = int(collate_config.get('threads', cpucount))

    ncpusperprocess = int(cpucount / nprocesses)

    if ncpusperprocess == 1 and mpi:
        print("Warning: running collate with mpirun on a single processor")

    pool = multiprocessing.Pool(processes=nprocesses)

    # Collate each tileset into a single file
    results = []
    codes = []
    outputs = []
    for nc_fname in mnc_tiles:
        nc_path = os.path.join(self.output_path, nc_fname)

        # Remove the collated file if it already exists, since it is
        # probably from a failed collation attempt
        # TODO: Validate this somehow
        if os.path.isfile(nc_path):
            os.remove(nc_path)

        cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                        ' '.join(mnc_tiles[nc_fname])])
        if mpi:
            cmd = "mpirun -n {n} {cmd}".format(
                n=ncpusperprocess,
                cmd=cmd
            )
        print(cmd)
        results.append(
            pool.apply_async(cmdthread, args=(cmd, self.output_path)))

    pool.close()
    pool.join()

    for result in results:
        rc, op = result.get()
        codes.append(rc)
        outputs.append(op)

    # TODO: Categorise the return codes
    if any(rc is not None for rc in codes):
        for p, rc, op in zip(count(), codes, outputs):
            if rc is not None:
                print('payu: error: Thread {p} crashed with error code '
                      '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                print(' Error message:', file=sys.stderr)
                print(op.decode(), file=sys.stderr)
        sys.exit(-1)
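# NOTE: `cmdthread` is the worker both collate() variants dispatch to the
# multiprocessing pool, but it is not defined in this listing.  The sketch
# below is an assumption consistent with how its return value is consumed
# above (rc is None on success, and op.decode() is printed on failure); the
# real payu helper may differ.
import shlex
import subprocess as sp


def cmdthread(cmd, cwd):
    # Run `cmd` in directory `cwd`, merging stderr into the captured
    # output.  Returns (None, output) on success, or (returncode, output)
    # if the command exits nonzero.
    try:
        output = sp.check_output(shlex.split(cmd), cwd=cwd,
                                 stderr=sp.STDOUT)
        return None, output
    except sp.CalledProcessError as exc:
        return exc.returncode, exc.output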
def generate_command(pbs_script, pbs_config, pbs_vars=None, python_exe=None):
    """Prepare a correct PBS command string"""

    pbs_env_init()

    # Initialisation
    if pbs_vars is None:
        pbs_vars = {}

    # Necessary for testing
    if python_exe is None:
        python_exe = sys.executable

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {queue}'.format(queue=pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {project}'.format(project=pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{key}={val}'.format(key=res_key, val=res_val))
        if res_flags:
            pbs_flags.append('-l {res}'.format(res=','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {name}'.format(name=pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {priority}'.format(priority=pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {join}'.format(join=pbs_join))

    # Append environment variables to qsub command
    # TODO: Support full export of environment variables: `qsub -V`
    pbs_vstring = ','.join('{0}={1}'.format(k, v)
                           for k, v in pbs_vars.items())
    pbs_flags.append('-v ' + pbs_vstring)

    storages = set()
    storage_config = pbs_config.get('storage', {})
    mounts = set(['/scratch', '/g/data'])
    for mount in storage_config:
        mounts.add(mount)
        for project in storage_config[mount]:
            storages.add(make_mount_string(encode_mount(mount), project))

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    payu_path = pbs_vars.get('PAYU_PATH', os.path.dirname(sys.argv[0]))
    pbs_script = check_exe_path(payu_path, pbs_script)

    # Check for storage paths that might need to be mounted in the
    # python and script paths
    extra_search_paths = [python_exe, payu_path, pbs_script]

    laboratory_path = pbs_config.get('laboratory', None)
    if laboratory_path is not None:
        extra_search_paths.append(laboratory_path)
    short_path = pbs_config.get('shortpath', None)
    if short_path is not None:
        extra_search_paths.append(short_path)

    storages.update(find_mounts(extra_search_paths, mounts))
    storages.update(find_mounts(get_manifest_paths(), mounts))

    # Add storage flags.  Note that these are sorted to get predictable
    # behaviour for testing
    pbs_flags_extend = '+'.join(sorted(storages))
    if pbs_flags_extend:
        pbs_flags.append("-l storage={}".format(pbs_flags_extend))

    # Set up environment modules here for PBS.
    envmod.setup()
    envmod.module('load', 'pbs')

    # Construct job submission command
    cmd = 'qsub {flags} -- {python} {script}'.format(
        flags=' '.join(pbs_flags),
        python=python_exe,
        script=pbs_script
    )

    return cmd
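# NOTE: generate_command relies on `encode_mount` and `make_mount_string`
# to build the NCI-style storage flag (e.g. `-l storage=gdata/x77+
# scratch/v45`).  Minimal sketches consistent with that flag syntax follow;
# they are assumptions, and the real payu helpers may differ.
import os


def encode_mount(mount):
    # Drop the path separators: '/g/data' -> 'gdata', '/scratch' -> 'scratch'
    return ''.join(mount.split(os.sep))


def make_mount_string(mount, project):
    # e.g. ('gdata', 'x77') -> 'gdata/x77'
    return '{mount}/{project}'.format(mount=mount, project=project)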
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript to the scheduler."""

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {}'.format(pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {}'.format(pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{}={}'.format(res_key, res_val))
        if res_flags:
            pbs_flags.append('-l {}'.format(','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {}'.format(pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {}'.format(pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {}'.format(pbs_join))

    if pbs_vars:
        # NOTE: `dict.iteritems` is Python 2 only; `items` works everywhere
        pbs_vstring = ','.join('{}={}'.format(k, v)
                               for k, v in pbs_vars.items())
        pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    # Enable PBS, in case it's not available
    envmod.setup()
    envmod.module('load', 'pbs')

    # If the script path is not absolute, then check the PATH directories
    if not os.path.isabs(pbs_script):
        for path in os.environ['PATH'].split(':'):
            if os.path.isdir(path) and pbs_script in os.listdir(path):
                pbs_script = os.path.join(path, pbs_script)
                break

    # Construct full command
    cmd = 'qsub {} {}'.format(' '.join(pbs_flags), pbs_script)
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if not isinstance(mpi_flags, list):
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['"{0}"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path_local:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path_local,
                                           'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode / 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '"{0}"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: `list.append` returns None, so its result must not
                #       be assigned back to `model_prog`.
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the exec_name (without path) as this is now linked in work
        model_prog.append(model.exec_name)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    #       correctly updated.  Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)

            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])

            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {0}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
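# NOTE: For reference, with two models configured the loop in run() above
# produces a colon-separated MPMD command.  A toy reconstruction with
# hypothetical paths, executables, and CPU counts:
mpi_progs = [
    '-wdir /scratch/v45/expt/work/atmosphere -np 192 matm.exe',
    '-wdir /scratch/v45/expt/work/ocean -np 960 fms_MOM.exe',
]
cmd = '{runcmd} {flags} {exes}'.format(
    runcmd='mpirun',
    flags='--report-bindings',
    exes=' : '.join(mpi_progs),
)
print(cmd)
# mpirun --report-bindings -wdir .../atmosphere -np 192 matm.exe :
#     -wdir .../ocean -np 960 fms_MOM.exe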