def make_link(self, filepath):
    """
    Payu integration function for creating symlinks in work
    directories which point back to the original file.
    """
    # Check file exists. It may have been deleted but still be in the
    # manifest
    if not os.path.exists(self.fullpath(filepath)):
        print('File not found: {filepath}'.format(
            filepath=self.fullpath(filepath)))
        if self.contains(filepath):
            print('removing from manifest')
            self.delete(filepath)
            self.needsync = True
    else:
        try:
            destdir = os.path.dirname(filepath)
            # Make the destination directory if it does not already
            # exist. Necessary because sometimes this is called before
            # individual model setup
            if destdir and not os.path.exists(destdir):
                os.makedirs(destdir)
            if self.copy_file(filepath):
                shutil.copy(self.fullpath(filepath), filepath)
                perm = (stat.S_IRUSR | stat.S_IRGRP
                        | stat.S_IROTH | stat.S_IWUSR)
                os.chmod(filepath, perm)
            else:
                make_symlink(self.fullpath(filepath), filepath)
        except Exception:
            # copy_file is a method: call it, otherwise the test is
            # always true
            action = 'copying' if self.copy_file(filepath) else 'linking'
            print('payu: error: {action} orig: {orig} '
                  'local: {local}'.format(action=action,
                                          orig=self.fullpath(filepath),
                                          local=filepath))
            raise
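# `make_link` above delegates the actual linking to a `make_symlink` helper
# defined elsewhere in payu (in its filesystem-ops module). A minimal sketch
# of the behaviour it is assumed to provide: create the link, and if a link
# already exists but points elsewhere, replace it.
import errno
import os


def make_symlink(src_path, lnk_path):
    """Create a symlink at lnk_path pointing at src_path, replacing any
    stale existing link. (Sketch only; the real helper may differ.)"""
    try:
        os.symlink(src_path, lnk_path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        if not os.path.islink(lnk_path) or os.readlink(lnk_path) != src_path:
            os.remove(lnk_path)
            os.symlink(src_path, lnk_path)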
def make_link(self, filepath):
    """
    Payu integration function for creating symlinks in work
    directories which point back to the original file.
    """
    # Check file exists. It may have been deleted but still be in the
    # manifest
    if not os.path.exists(self.fullpath(filepath)):
        print('File not found: {filepath}'.format(
            filepath=self.fullpath(filepath)))
        if self.contains(filepath):
            print('removing from manifest')
            self.delete(filepath)
            self.needsync = True
    else:
        try:
            if self.copy_file(filepath):
                shutil.copy(self.fullpath(filepath), filepath)
            else:
                make_symlink(self.fullpath(filepath), filepath)
        except Exception:
            # copy_file is a method: call it, otherwise the test is
            # always true
            action = 'copying' if self.copy_file(filepath) else 'linking'
            print('payu: error: {action} orig: {orig} '
                  'local: {local}'.format(action=action,
                                          orig=self.fullpath(filepath),
                                          local=filepath))
            raise
def setup(self): super(Oasis, self).setup() # Copy OASIS data to the other submodels # TODO: Parse namcouple to determine filelist # TODO: Let users map files to models input_files = [f for f in os.listdir(self.work_path) if f not in self.config_files] for model in self.expt.models: # Skip the oasis self-reference if model == self: continue # Skip models without a work_path (like access) if not hasattr(model, 'work_path'): continue mkdir_p(model.work_path) for f_name in (self.config_files + input_files): f_path = os.path.join(self.work_path, f_name) f_sympath = os.path.join(model.work_path, f_name) make_symlink(f_path, f_sympath) if self.expt.runtime: # TODO: Implement runtime patch to namcouple pass
def setup(self, force_archive=False):
    # Confirm that no output path already exists
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    mkdir_p(self.work_path)

    if force_archive:
        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

    # Archive the payu config
    # TODO: This just copies the existing config.yaml file, but we should
    # reconstruct a new file including default values
    config_src = os.path.join(self.control_path, 'config.yaml')
    config_dst = self.work_path
    shutil.copy(config_src, config_dst)

    # Stripe directory in Lustre
    # TODO: Make this more configurable
    do_stripe = self.config.get('stripedio', False)
    if do_stripe:
        cmd = 'lfs setstripe -c 8 -s 8m {0}'.format(self.work_path)
        sp.check_call(shlex.split(cmd))

    make_symlink(self.work_path, self.work_sym_path)

    # Set up all file manifests
    self.manifest.setup()

    for model in self.models:
        model.setup()

    # Call the macro-model setup
    if len(self.models) > 1:
        self.model.setup()

    self.manifest.check_manifests()

    # Copy manifests to the work directory so they are archived on
    # completion
    manifest_path = os.path.join(self.work_path, 'manifests')
    self.manifest.copy_manifests(manifest_path)

    setup_script = self.userscripts.get('setup')
    if setup_script:
        self.run_userscript(setup_script)

    # Profiler setup
    expt_profs = self.config.get('profilers', [])
    if not isinstance(expt_profs, list):
        expt_profs = [expt_profs]

    for prof_name in expt_profs:
        ProfType = payu.profilers.index[prof_name]
        prof = ProfType(self)
        self.profilers.append(prof)

        # Testing
        prof.setup()
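# The setup and archive routines in this file rely on a small `mkdir_p`
# helper with `mkdir -p` semantics. The real helper lives in payu's
# filesystem-ops module; this is only a minimal sketch of the assumed
# behaviour.
import errno
import os


def mkdir_p(path):
    """Create path and any missing parents; do nothing if it already
    exists. (Sketch only.)"""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise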
def setup(self, force_archive=False):
    # Confirm that no output path already exists
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    mkdir_p(self.work_path)

    if force_archive:
        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

    # Archive the payu config
    # TODO: This just copies the existing config.yaml file, but we should
    # reconstruct a new file including default values
    config_src = os.path.join(self.control_path, 'config.yaml')
    config_dst = self.work_path
    shutil.copy(config_src, config_dst)

    # Stripe directory in Lustre
    # TODO: Make this more configurable
    do_stripe = self.config.get('stripedio', False)
    if do_stripe:
        cmd = 'lfs setstripe -c 8 -s 8m {0}'.format(self.work_path)
        sp.check_call(shlex.split(cmd))

    make_symlink(self.work_path, self.work_sym_path)

    # Set up all file manifests
    self.manifest.setup()

    for model in self.models:
        model.setup()

    # Call the macro-model setup
    if len(self.models) > 1:
        self.model.setup()

    # Use manifest to populate work directory
    self.manifest.make_links()

    # Copy manifests to the work directory so they are archived on
    # completion
    self.manifest.copy_manifests(os.path.join(self.work_path, 'manifests'))

    setup_script = self.userscripts.get('setup')
    if setup_script:
        self.run_userscript(setup_script)

    # Profiler setup
    expt_profs = self.config.get('profilers', [])
    if not isinstance(expt_profs, list):
        expt_profs = [expt_profs]

    for prof_name in expt_profs:
        ProfType = payu.profilers.index[prof_name]
        prof = ProfType(self)
        self.profilers.append(prof)

        # Testing
        prof.setup()
def archive(self): mkdir_p(self.archive_path) make_symlink(self.archive_path, self.archive_sym_path) # Remove work symlink if os.path.islink(self.work_sym_path): os.remove(self.work_sym_path) mkdir_p(self.restart_path) for model in self.models: model.archive() # Postprocess the model suite if len(self.models) > 1: self.model.archive() # Double-check that the run path does not exist if os.path.exists(self.output_path): sys.exit('payu: error: Output path already exists.') cmd = 'mv {} {}'.format(self.work_path, self.output_path) sp.check_call(shlex.split(cmd)) # Remove old restart files # TODO: Move to subroutine restart_freq = self.config.get('restart_freq', default_restart_freq) restart_history = self.config.get('restart_history', default_restart_history) # Remove any outdated restart files prior_restart_dirs = [ d for d in os.listdir(self.archive_path) if d.startswith('restart') ] for res_dir in prior_restart_dirs: res_idx = int(res_dir.lstrip('restart')) if (self.repeat_run or (not res_idx % restart_freq == 0 and res_idx <= (self.counter - restart_history))): res_path = os.path.join(self.archive_path, res_dir) # Only delete real directories; ignore symbolic restart links if os.path.isdir(res_path): shutil.rmtree(res_path) if self.config.get('collate', True): cmd = 'payu collate -i {}'.format(self.counter) sp.check_call(shlex.split(cmd)) if self.config.get('hpctoolkit', False): cmd = 'payu profile -i {}'.format(self.counter) sp.check_call(shlex.split(cmd)) archive_script = self.userscripts.get('archive') if archive_script: self.run_userscript(archive_script)
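# A worked example of the restart pruning rule used in archive() above (for
# a non-repeat run): an index survives if it is a multiple of restart_freq,
# or if it falls within the trailing restart_history window. The numbers
# here are hypothetical.
restart_freq, restart_history, counter = 5, 5, 12

pruned = [idx for idx in range(counter)
          if idx % restart_freq != 0 and idx <= counter - restart_history]
print(pruned)  # -> [1, 2, 3, 4, 6, 7]; restart0, restart5 and
               #    restart8..restart11 are kept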
def archive(self): mkdir_p(self.archive_path) make_symlink(self.archive_path, self.archive_sym_path) # Remove work symlink if os.path.islink(self.work_sym_path): os.remove(self.work_sym_path) mkdir_p(self.restart_path) for model in self.models: model.archive() # Postprocess the model suite if len(self.models) > 1: self.model.archive() # Double-check that the run path does not exist if os.path.exists(self.output_path): sys.exit('payu: error: Output path already exists.') cmd = 'mv {} {}'.format(self.work_path, self.output_path) sp.check_call(shlex.split(cmd)) # Remove old restart files # TODO: Move to subroutine restart_freq = self.config.get('restart_freq', default_restart_freq) restart_history = self.config.get('restart_history', default_restart_history) # Remove any outdated restart files prior_restart_dirs = [d for d in os.listdir(self.archive_path) if d.startswith('restart')] for res_dir in prior_restart_dirs: res_idx = int(res_dir.lstrip('restart')) if (not res_idx % restart_freq == 0 and res_idx <= (self.counter - restart_history)): res_path = os.path.join(self.archive_path, res_dir) shutil.rmtree(res_path) if self.config.get('collate', True): cmd = 'payu collate -i {} -l {}'.format(self.counter, self.lab.basepath) sp.check_call(shlex.split(cmd)) if self.config.get('hpctoolkit', False): cmd = 'payu profile -i {}'.format(self.counter) sp.check_call(shlex.split(cmd)) archive_script = self.userscripts.get('archive') if archive_script: self.run_userscript(archive_script)
def setup(self): # Create experiment directory structure mkdir_p(self.work_input_path) mkdir_p(self.work_restart_path) mkdir_p(self.work_output_path) # Copy configuration files from control path for f_name in self.config_files: f_path = os.path.join(self.control_path, f_name) shutil.copy(f_path, self.work_path) for f_name in self.optional_config_files: f_path = os.path.join(self.control_path, f_name) try: shutil.copy(f_path, self.work_path) except IOError as exc: if exc.errno == errno.ENOENT: pass else: raise # Link restart files from prior run if self.prior_restart_path and not self.expt.repeat_run: restart_files = self.get_prior_restart_files() for f_name in restart_files: f_restart = os.path.join(self.prior_restart_path, f_name) f_input = os.path.join(self.work_init_path, f_name) if self.copy_restarts: shutil.copy(f_restart, f_input) else: make_symlink(f_restart, f_input) # Link input data for input_path in self.input_paths: input_files = os.listdir(input_path) for f_name in input_files: f_input = os.path.join(input_path, f_name) f_work_input = os.path.join(self.work_input_path, f_name) # Do not use input file if it is in RESTART if not os.path.exists(f_work_input): if self.copy_inputs: shutil.copy(f_input, f_work_input) else: make_symlink(f_input, f_work_input) # Some models overwrite their own input for restarts # (e.g. OASIS) if self.make_inputs_writeable: os.chmod(f_work_input, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP) t_step = self.config.get('timestep') if t_step: self.set_timestep(t_step)
def setup(self): # FMS initialisation super(Mom, self).setup() if not self.top_level_model: # Make log dir mkdir_p(os.path.join(self.work_path, 'log')) input_nml_path = os.path.join(self.work_path, 'input.nml') input_nml = f90nml.read(input_nml_path) # Set the runtime if self.expt.runtime: ocean_solo_nml = input_nml['ocean_solo_nml'] ocean_solo_nml['years'] = self.expt.runtime['years'] ocean_solo_nml['months'] = self.expt.runtime['months'] ocean_solo_nml['days'] = self.expt.runtime['days'] ocean_solo_nml['seconds'] = self.expt.runtime.get('seconds', 0) input_nml.write(input_nml_path, force=True) # Construct the land CPU mask if self.expt.config.get('mask_table', False): # NOTE: This function actually creates a mask table using the # `check_mask` command line tool. But it is not very usable # since you need to know the number of masked CPUs to submit # the job. It needs a rethink of the submission process. self.create_mask_table(input_nml) # NOTE: Don't expect this to be here forever... # Attempt to set a mask table from the input if self.config.get('mask', False): mask_path = os.path.join(self.work_input_path, 'ocean_mask_table') # Remove any existing mask # (If no reference mask is available, then we will not use one) if os.path.isfile(mask_path): os.remove(mask_path) # Reference mask table assert ('layout' in input_nml['ocean_model_nml']) nx, ny = input_nml['ocean_model_nml'].get('layout') n_masked_cpus = nx * ny - self.config.get('ncpus') mask_table_fname = 'mask_table.{nmask}.{nx}x{ny}'.format( nmask=n_masked_cpus, nx=nx, ny=ny) ref_mask_path = os.path.join(self.work_input_path, mask_table_fname) # Set (or replace) mask table if reference is available if os.path.isfile(ref_mask_path): make_symlink(ref_mask_path, mask_path)
def setup(self): # Create experiment directory structure mkdir_p(self.work_input_path) mkdir_p(self.work_restart_path) mkdir_p(self.work_output_path) # Copy configuration files from control path for f_name in self.config_files: f_path = os.path.join(self.control_path, f_name) shutil.copy(f_path, self.work_path) for f_name in self.optional_config_files: f_path = os.path.join(self.control_path, f_name) try: shutil.copy(f_path, self.work_path) except IOError as exc: if exc.errno == errno.ENOENT: pass else: raise # Link restart files from prior run if self.prior_restart_path and not self.expt.repeat_run: restart_files = self.get_prior_restart_files() for f_name in restart_files: f_restart = os.path.join(self.prior_restart_path, f_name) f_input = os.path.join(self.work_init_path, f_name) if self.copy_restarts: shutil.copy(f_restart, f_input) else: make_symlink(f_restart, f_input) # Link input data for input_path in self.input_paths: input_files = os.listdir(input_path) for f_name in input_files: f_input = os.path.join(input_path, f_name) f_work_input = os.path.join(self.work_input_path, f_name) # Do not use input file if it is in RESTART if not os.path.exists(f_work_input): if self.copy_inputs: shutil.copy(f_input, f_work_input) else: make_symlink(f_input, f_work_input) timestep = self.config.get('timestep') if timestep: self.set_timestep(timestep)
def link_restart(self, fpath):
    input_work_path = os.path.join(self.work_path, fpath)

    # Exit if the restart file already exists
    if os.path.isfile(input_work_path):
        return

    input_path = None
    for i_path in self.input_paths:
        test_path = os.path.join(i_path, fpath)
        if os.path.isfile(test_path):
            input_path = test_path
            break
    assert input_path is not None, \
        'Restart file {0} not found in input paths'.format(fpath)

    make_symlink(input_path, input_work_path)
def make_links(self): """ Payu integration function for creating symlinks in work directories which point back to the original file """ delete_list = [] for filepath in self: # Check file exists. It may have been deleted but still in manifest if not os.path.exists(self.fullpath(filepath)): delete_list.append(filepath) continue if self.copy_file(filepath): shutil.copy(self.fullpath(filepath), filepath) else: make_symlink(self.fullpath(filepath), filepath) for filepath in delete_list: print("File not found: {} removing from manifest".format( self.fullpath(filepath))) self.delete(filepath) self.needsync = True
def setup(self): super(UnifiedModel, self).setup() # Stage the UM restart file. if self.prior_restart_path and not self.expt.repeat_run: f_src = os.path.join(self.prior_restart_path, self.restart) f_dst = os.path.join(self.work_input_path, self.restart) if os.path.isfile(f_src): make_symlink(f_src, f_dst) # Set up environment variables needed to run UM. # Look for a python file in the config directory. um_env = imp.load_source("um_env", os.path.join(self.control_path, "um_env.py")) um_vars = um_env.vars assert len(self.input_paths) == 1 # Set paths in environment variables. for k in um_vars.keys(): um_vars[k] = um_vars[k].format(input_path=self.input_paths[0], work_path=self.work_path) os.environ.update(um_vars) # The above needs to be done in parexe also. # FIXME: a better way to do this or remove. parexe = os.path.join(self.work_path, "parexe") for line in fileinput.input(parexe, inplace=True): line = line.format(input_path=self.input_paths[0], work_path=self.work_path) print(line, end="") work_nml_path = os.path.join(self.work_path, "namelists") work_nml = f90nml.read(work_nml_path) # Modify namelists for a continuation run. if self.prior_output_path and not self.expt.repeat_run: prior_nml_path = os.path.join(self.prior_output_path, "namelists") prior_nml = f90nml.read(prior_nml_path) basis_time = prior_nml["NLSTCALL"]["MODEL_BASIS_TIME"] init_date = um_date_to_date(basis_time) resubmit_inc = prior_nml["NLSTCALL"]["RUN_RESUBMIT_INC"] runtime = um_time_to_time(resubmit_inc) run_start_date = cal.date_plus_seconds(init_date, runtime, cal.GREGORIAN) # Write out and save new calendar information. run_start_date_um = date_to_um_date(run_start_date) work_nml["NLSTCALL"]["MODEL_BASIS_TIME"] = run_start_date_um work_nml["NLSTCALL"]["ANCIL_REFTIME"] = run_start_date_um # Tell CABLE that this is a continuation run. cable_nml_path = os.path.join(self.work_path, "cable.nml") cable_nml = f90nml.read(cable_nml_path) cable_nml["cable"]["cable_user"]["CABLE_RUNTIME_COUPLED"] = False cable_nml.write(cable_nml_path, force=True) else: run_start_date = work_nml["NLSTCALL"]["MODEL_BASIS_TIME"] run_start_date = um_date_to_date(run_start_date) # Set the runtime for this run. if self.expt.runtime: run_runtime = cal.runtime_from_date( run_start_date, self.expt.runtime["years"], self.expt.runtime["months"], self.expt.runtime["days"], self.expt.runtime.get("seconds", 0), cal.GREGORIAN, ) run_runtime = time_to_um_time(run_runtime) work_nml["NLSTCALL"]["RUN_RESUBMIT_INC"] = run_runtime work_nml["NLSTCALL"]["RUN_TARGET_END"] = run_runtime work_nml["STSHCOMP"]["RUN_TARGET_END"] = run_runtime work_nml.write(work_nml_path, force=True)
def setup(self): super(UnifiedModel, self).setup() # Set up environment variables needed to run UM. # Look for a python file in the config directory. um_env = imp.load_source('um_env', os.path.join(self.control_path, 'um_env.py')) um_vars = um_env.vars # Stage the UM restart file. if self.prior_restart_path and not self.expt.repeat_run: f_src = os.path.join(self.prior_restart_path, self.restart) f_dst = os.path.join(self.work_input_path, self.restart) if os.path.isfile(f_src): make_symlink(f_src, f_dst) # every run is an NRUN with an updated ASTART file um_vars['ASTART'] = self.restart um_vars['TYPE'] = 'NRUN' # Set paths in environment variables. for k in um_vars.keys(): um_vars[k] = um_vars[k].format(input_path=self.input_paths[0], work_path=self.work_path) os.environ.update(um_vars) # The above needs to be done in parexe also. # FIXME: a better way to do this or remove. parexe = os.path.join(self.work_path, 'parexe') for line in fileinput.input(parexe, inplace=True): line = line.format(input_path=self.input_paths[0], work_path=self.work_path) print(line, end='') work_nml_path = os.path.join(self.work_path, 'namelists') work_nml = f90nml.read(work_nml_path) # Modify namelists for a continuation run. if self.prior_output_path and not self.expt.repeat_run: prior_nml_path = os.path.join(self.prior_output_path, 'namelists') prior_nml = f90nml.read(prior_nml_path) basis_time = prior_nml['NLSTCALL']['MODEL_BASIS_TIME'] init_date = um_date_to_date(basis_time) resubmit_inc = prior_nml['NLSTCALL']['RUN_RESUBMIT_INC'] runtime = um_time_to_time(resubmit_inc) run_start_date = cal.date_plus_seconds(init_date, runtime, cal.GREGORIAN) # Write out and save new calendar information. run_start_date_um = date_to_um_date(run_start_date) work_nml['NLSTCALL']['MODEL_BASIS_TIME'] = run_start_date_um work_nml['NLSTCALL']['ANCIL_REFTIME'] = run_start_date_um # Tell CABLE that this is a continuation run. cable_nml_path = os.path.join(self.work_path, 'cable.nml') cable_nml = f90nml.read(cable_nml_path) cable_nml['cable']['cable_user']['CABLE_RUNTIME_COUPLED'] = False cable_nml.write(cable_nml_path, force=True) else: run_start_date = work_nml['NLSTCALL']['MODEL_BASIS_TIME'] run_start_date = um_date_to_date(run_start_date) # Set the runtime for this run. if self.expt.runtime: run_runtime = cal.runtime_from_date( run_start_date, self.expt.runtime['years'], self.expt.runtime['months'], self.expt.runtime['days'], self.expt.runtime.get('seconds', 0), cal.GREGORIAN) run_runtime = time_to_um_time(run_runtime) work_nml['NLSTCALL']['RUN_RESUBMIT_INC'] = run_runtime work_nml['NLSTCALL']['RUN_TARGET_END'] = run_runtime work_nml['STSHCOMP']['RUN_TARGET_END'] = run_runtime work_nml.write(work_nml_path, force=True)
def setup(self):
    super(UnifiedModel, self).setup()

    # Set up environment variables needed to run UM.
    # Look for a python file in the config directory.
    um_env = imp.load_source('um_env',
                             os.path.join(self.control_path, 'um_env.py'))
    um_vars = um_env.vars

    # Stage the UM restart file.
    if self.prior_restart_path and not self.expt.repeat_run:
        f_src = os.path.join(self.prior_restart_path, self.restart)
        f_dst = os.path.join(self.work_input_path, self.restart)

        if os.path.isfile(f_src):
            make_symlink(f_src, f_dst)
            # every run is an NRUN with an updated ASTART file
            um_vars['ASTART'] = self.restart
            um_vars['TYPE'] = 'NRUN'

    # Set paths in environment variables.
    for k in um_vars.keys():
        um_vars[k] = um_vars[k].format(input_path=self.input_paths[0],
                                       work_path=self.work_path)
    os.environ.update(um_vars)

    # The above needs to be done in parexe also.
    # FIXME: a better way to do this or remove.
    parexe = os.path.join(self.work_path, 'parexe')

    for line in fileinput.input(parexe, inplace=True):
        line = line.format(input_path=self.input_paths[0],
                           work_path=self.work_path)
        print(line, end='')

    work_nml_path = os.path.join(self.work_path, 'namelists')
    work_nml = f90nml.read(work_nml_path)

    restart_calendar_path = os.path.join(self.work_init_path,
                                         self.restart_calendar_file)

    # Modify namelists for a continuation run.
    if self.prior_restart_path and not self.expt.repeat_run \
            and os.path.exists(restart_calendar_path):
        with open(restart_calendar_path, 'r') as restart_file:
            # safe_load: the calendar file is plain data, and yaml.load
            # without an explicit Loader is deprecated
            restart_info = yaml.safe_load(restart_file)
        run_start_date = restart_info['end_date']

        # Write out and save new calendar information.
        run_start_date_um = date_to_um_date(run_start_date)
        work_nml['NLSTCALL']['MODEL_BASIS_TIME'] = run_start_date_um
        work_nml['NLSTCALL']['ANCIL_REFTIME'] = run_start_date_um
    else:
        run_start_date = work_nml['NLSTCALL']['MODEL_BASIS_TIME']
        run_start_date = um_date_to_date(run_start_date)

    # Set the runtime for this run.
    if self.expt.runtime:
        run_runtime = cal.runtime_from_date(
            run_start_date,
            self.expt.runtime['years'],
            self.expt.runtime['months'],
            self.expt.runtime['days'],
            self.expt.runtime.get('seconds', 0),
            cal.GREGORIAN)
        run_runtime = time_to_um_time(run_runtime)
        work_nml['NLSTCALL']['RUN_RESUBMIT_INC'] = run_runtime
        work_nml['NLSTCALL']['RUN_TARGET_END'] = run_runtime
        work_nml['STSHCOMP']['RUN_TARGET_END'] = run_runtime

    work_nml.write(work_nml_path, force=True)
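# A hedged illustration of the restart calendar file consumed above: it is
# assumed to be a small YAML mapping carrying the previous run's end date,
# which PyYAML resolves directly to a datetime.
import yaml

restart_yaml = 'end_date: 1979-01-01 00:00:00\n'  # hypothetical contents
restart_info = yaml.safe_load(restart_yaml)
print(restart_info['end_date'])  # -> datetime.datetime(1979, 1, 1, 0, 0)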
def run(self, *user_flags):
    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if type(mpi_flags) != list:
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: '-map-by' expects a whole rank count
                npernode_flag = ('-map-by ppr:{}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None; do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{} {} {}'.format(mpi_runcmd,
                            ' '.join(mpi_flags),
                            ' : '.join(mpi_progs))

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)
    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated. Need to look into this.
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    if self.runlog:
        self.runlog.commit()

    f_out.close()
    f_err.close()

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
    for prof in self.profilers:
        prof.postprocess()

    # TODO: Need a model-specific cleanup method call here
    # NOTE: This does not appear to catch hanging jobs killed by PBS
    if rc != 0:
        # Backup logs for failed runs
        error_log_dir = os.path.join(self.archive_path, 'error_logs')
        mkdir_p(error_log_dir)

        # NOTE: This is PBS-specific
        job_id = os.environ.get('PBS_JOBID', '')

        for fname in (self.stdout_fname, self.stderr_fname):
            src = os.path.join(self.control_path, fname)
            # NOTE: This assumes standard .out/.err extensions
            dest = os.path.join(error_log_dir,
                                fname[:-4] + '.' + job_id + fname[-4:])
            print(src, dest)
            shutil.copyfile(src, dest)

        # Create the symlink to the logs if it does not exist
        make_symlink(self.archive_path, self.archive_sym_path)

        # Terminate payu
        sys.exit('payu: Model exited with error code {}; aborting.'
                 ''.format(rc))

    # Decrement run counter on successful run
    stop_file_path = os.path.join(self.control_path, 'stop_run')
    if os.path.isfile(stop_file_path):
        assert os.stat(stop_file_path).st_size == 0
        os.remove(stop_file_path)
        print('payu: Stop file detected; terminating resubmission.')
        self.n_runs = 0
    else:
        self.n_runs -= 1

    # Move logs to archive (or delete if empty)
    for f in (self.stdout_fname, self.stderr_fname):
        f_path = os.path.join(self.control_path, f)
        if os.path.getsize(f_path) == 0:
            os.remove(f_path)
        else:
            shutil.move(f_path, self.work_path)

    run_script = self.userscripts.get('run')
    if run_script:
        self.run_userscript(run_script)
def setup(self): # FMS initialisation super(Mom, self).setup() if not self.top_level_model: # Make log dir mkdir_p(os.path.join(self.work_path, 'log')) input_nml_path = os.path.join(self.work_path, 'input.nml') input_nml = f90nml.read(input_nml_path) use_core2iaf = self.config.get('core2iaf') if use_core2iaf: self.core2iaf_setup() # Set the runtime if self.expt.runtime: ocean_solo_nml = input_nml['ocean_solo_nml'] ocean_solo_nml['years'] = self.expt.runtime['years'] ocean_solo_nml['months'] = self.expt.runtime['months'] ocean_solo_nml['days'] = self.expt.runtime['days'] ocean_solo_nml['seconds'] = self.expt.runtime.get('seconds', 0) input_nml.write(input_nml_path, force=True) # Construct the land CPU mask if self.expt.config.get('mask_table', False): # NOTE: This function actually creates a mask table using the # `check_mask` command line tool. But it is not very usable # since you need to know the number of masked CPUs to submit # the job. It needs a rethink of the submission process. self.create_mask_table(input_nml) # NOTE: Don't expect this to be here forever... # Attempt to set a mask table from the input if self.config.get('mask', False): mask_path = os.path.join(self.work_input_path, 'ocean_mask_table') # Remove any existing mask # (If no reference mask is available, then we will not use one) if os.path.isfile(mask_path): os.remove(mask_path) # Reference mask table assert('layout' in input_nml['ocean_model_nml']) nx, ny = input_nml['ocean_model_nml'].get('layout') n_masked_cpus = nx * ny - self.config.get('ncpus') mask_table_fname = 'mask_table.{nmask}.{nx}x{ny}'.format( nmask=n_masked_cpus, nx=nx, ny=ny ) ref_mask_path = os.path.join(self.work_input_path, mask_table_fname) # Set (or replace) mask table if reference is available if os.path.isfile(ref_mask_path): make_symlink(ref_mask_path, mask_path)
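# A toy, self-contained illustration of the f90nml pattern used above:
# build (or read) a namelist, patch the run-length fields, and write it
# back in place. The file name and runtime values are hypothetical.
import f90nml

nml = f90nml.Namelist({'ocean_solo_nml': {'years': 0, 'months': 0,
                                          'days': 0, 'seconds': 0}})
runtime = {'years': 0, 'months': 1, 'days': 0}

nml['ocean_solo_nml']['years'] = runtime['years']
nml['ocean_solo_nml']['months'] = runtime['months']
nml['ocean_solo_nml']['days'] = runtime['days']
nml['ocean_solo_nml']['seconds'] = runtime.get('seconds', 0)
nml.write('input.nml', force=True)  # force=True overwrites an existing file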
def archive(self): if not self.config.get('archive', True): print('payu: not archiving due to config.yaml setting.') return # Check there is a work directory, otherwise bail if not os.path.exists(self.work_sym_path): sys.exit('payu: error: No work directory to archive.') mkdir_p(self.archive_path) make_symlink(self.archive_path, self.archive_sym_path) # Remove work symlink if os.path.islink(self.work_sym_path): os.remove(self.work_sym_path) mkdir_p(self.restart_path) for model in self.models: model.archive() # Postprocess the model suite if len(self.models) > 1: self.model.archive() # Double-check that the run path does not exist if os.path.exists(self.output_path): sys.exit('payu: error: Output path already exists.') cmd = 'mv {work} {output}'.format( work=self.work_path, output=self.output_path ) sp.check_call(shlex.split(cmd)) # Remove old restart files # TODO: Move to subroutine restart_freq = self.config.get('restart_freq', default_restart_freq) restart_history = self.config.get('restart_history', default_restart_history) # Remove any outdated restart files prior_restart_dirs = [d for d in os.listdir(self.archive_path) if d.startswith('restart')] for res_dir in prior_restart_dirs: res_idx = int(res_dir.lstrip('restart')) if (self.repeat_run or (not res_idx % restart_freq == 0 and res_idx <= (self.counter - restart_history))): res_path = os.path.join(self.archive_path, res_dir) # Only delete real directories; ignore symbolic restart links if os.path.isdir(res_path): shutil.rmtree(res_path) collate_config = self.config.get('collate', {}) if collate_config.get('enable', True): cmd = '{python} {payu} collate -i {expt}'.format( python=sys.executable, payu=self.payu_path, expt=self.counter ) sp.check_call(shlex.split(cmd)) if self.config.get('hpctoolkit', False): cmd = '{python} {payu} profile -i {expt}'.format( python=sys.executable, payu=self.payu_path, expt=self.counter ) sp.check_call(shlex.split(cmd)) archive_script = self.userscripts.get('archive') if archive_script: self.run_userscript(archive_script)
def archive(self):
    if not self.config.get('archive', True):
        print('payu: not archiving due to config.yaml setting.')
        return

    # Check there is a work directory, otherwise bail
    if not os.path.exists(self.work_sym_path):
        sys.exit('payu: error: No work directory to archive.')

    mkdir_p(self.archive_path)
    make_symlink(self.archive_path, self.archive_sym_path)

    # Remove work symlink
    if os.path.islink(self.work_sym_path):
        os.remove(self.work_sym_path)

    mkdir_p(self.restart_path)

    for model in self.models:
        model.archive()

    # Postprocess the model suite
    if len(self.models) > 1:
        self.model.archive()

    # Double-check that the run path does not exist
    if os.path.exists(self.output_path):
        sys.exit('payu: error: Output path already exists.')

    movetree(self.work_path, self.output_path)

    # Remove old restart files
    # TODO: Move to subroutine
    restart_freq = self.config.get('restart_freq', default_restart_freq)
    restart_history = self.config.get('restart_history',
                                      default_restart_history)

    # Remove any outdated restart files
    prior_restart_dirs = [d for d in os.listdir(self.archive_path)
                          if d.startswith('restart')]

    for res_dir in prior_restart_dirs:
        res_idx = int(res_dir.lstrip('restart'))
        if (self.repeat_run or
                (not res_idx % restart_freq == 0 and
                 res_idx <= (self.counter - restart_history))):
            res_path = os.path.join(self.archive_path, res_dir)
            # Only delete real directories; ignore symbolic restart links
            if (os.path.isdir(res_path) and
                    not os.path.islink(res_path)):
                shutil.rmtree(res_path)

    # Ensure dynamic library support for subsequent python calls
    # Use .get() with a default: LD_LIBRARY_PATH may be unset, and a
    # plain lookup would raise KeyError
    ld_libpaths = os.environ.get('LD_LIBRARY_PATH', '')
    py_libpath = sysconfig.get_config_var('LIBDIR')
    if py_libpath not in ld_libpaths.split(':'):
        os.environ['LD_LIBRARY_PATH'] = ':'.join([py_libpath, ld_libpaths])

    collate_config = self.config.get('collate', {})
    if collate_config.get('enable', True):
        cmd = '{python} {payu} collate -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    if self.config.get('hpctoolkit', False):
        cmd = '{python} {payu} profile -i {expt}'.format(
            python=sys.executable,
            payu=self.payu_path,
            expt=self.counter
        )
        sp.check_call(shlex.split(cmd))

    archive_script = self.userscripts.get('archive')
    if archive_script:
        self.run_userscript(archive_script)
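# `movetree` above replaces the earlier `mv` subprocess call. The helper is
# defined elsewhere in payu; one plausible sketch is rename-with-fallback,
# so the move also works across filesystem boundaries where os.rename fails.
import os
import shutil


def movetree(src, dst):
    """Move a directory tree, copying and deleting when a plain rename is
    not possible. (Sketch only; the real helper may differ.)"""
    try:
        os.rename(src, dst)
    except OSError:
        shutil.copytree(src, dst, symlinks=True)
        shutil.rmtree(src)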
def run(self, *user_flags): # XXX: This was previously done in reversion envmod.setup() self.load_modules() f_out = open(self.stdout_fname, 'w') f_err = open(self.stderr_fname, 'w') # Set MPI environment variables env = self.config.get('env') # Explicitly check for `None`, in case of an empty `env:` entry if env is None: env = {} for var in env: if env[var] is None: env_value = '' else: env_value = str(env[var]) os.environ[var] = env_value mpi_config = self.config.get('mpi', {}) mpi_runcmd = mpi_config.get('runcmd', 'mpirun') if self.config.get('scalasca', False): mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd]) # MPI runtime flags mpi_flags = mpi_config.get('flags', []) if not mpi_flags: mpi_flags = self.config.get('mpirun', []) # TODO: Legacy config removal warning if type(mpi_flags) != list: mpi_flags = [mpi_flags] # TODO: More uniform support needed here if self.config.get('scalasca', False): mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags] # XXX: I think this may be broken if user_flags: mpi_flags.extend(list(user_flags)) if self.debug: mpi_flags.append('--debug') mpi_progs = [] for model in self.models: # Skip models without executables (e.g. couplers) if not model.exec_path_local: continue mpi_config = self.config.get('mpi', {}) mpi_module = mpi_config.get('module', None) # Update MPI library module (if not explicitly set) # TODO: Check for MPI library mismatch across multiple binaries if mpi_module is None: mpi_module = envmod.lib_update( model.exec_path_local, 'libmpi.so' ) model_prog = [] # Our MPICH wrapper does not support a working directory flag if not mpi_module.startswith('mvapich'): model_prog.append('-wdir {0}'.format(model.work_path)) # Append any model-specific MPI flags model_flags = model.config.get('mpiflags', []) if not isinstance(model_flags, list): model_prog.append(model_flags) else: model_prog.extend(model_flags) model_ncpus = model.config.get('ncpus') if model_ncpus: model_prog.append('-np {0}'.format(model_ncpus)) model_npernode = model.config.get('npernode') # TODO: New Open MPI format? 
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: '-map-by' expects a whole rank count
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None; do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to the symlinked exec_name in work, as some
        # older MPI libraries complained that the executable was not in
        # PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ), default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated. Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
for prof in self.profilers: prof.postprocess() # TODO: Need a model-specific cleanup method call here # NOTE: This does not appear to catch hanging jobs killed by PBS if rc != 0: # Backup logs for failed runs error_log_dir = os.path.join(self.archive_path, 'error_logs') mkdir_p(error_log_dir) # NOTE: This is PBS-specific job_id = get_job_id(short=False) if job_id == '': job_id = str(self.run_id)[:6] for fname in self.output_fnames: src = os.path.join(self.control_path, fname) stem, suffix = os.path.splitext(fname) dest = os.path.join(error_log_dir, ".".join((stem, job_id)) + suffix) print(src, dest) shutil.copyfile(src, dest) # Create the symlink to the logs if it does not exist make_symlink(self.archive_path, self.archive_sym_path) error_script = self.userscripts.get('error') if error_script: self.run_userscript(error_script) # Terminate payu sys.exit('payu: Model exited with error code {0}; aborting.' ''.format(rc)) # Decrement run counter on successful run stop_file_path = os.path.join(self.control_path, 'stop_run') if os.path.isfile(stop_file_path): assert os.stat(stop_file_path).st_size == 0 os.remove(stop_file_path) print('payu: Stop file detected; terminating resubmission.') self.n_runs = 0 else: self.n_runs -= 1 # Move logs to archive (or delete if empty) for f in self.output_fnames: f_path = os.path.join(self.control_path, f) if os.path.getsize(f_path) == 0: os.remove(f_path) else: shutil.move(f_path, self.work_path) run_script = self.userscripts.get('run') if run_script: self.run_userscript(run_script)
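# `enable_core_dump` (invoked when the 'coredump' config entry is true) is
# defined elsewhere in payu; a minimal sketch of the assumed behaviour is to
# lift the core-file size limit before the MPI processes are launched.
import resource


def enable_core_dump():
    """Allow unlimited core dumps in this process and its children.
    (Sketch only.)"""
    resource.setrlimit(resource.RLIMIT_CORE,
                       (resource.RLIM_INFINITY, resource.RLIM_INFINITY))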
def setup(self):
    if not self.top_level_model:
        return

    cpl_keys = {'cice': ('input_ice.nml', 'coupling', 'runtime0'),
                'matm': ('input_atm.nml', 'coupling', 'truntime0')}

    # Keep track of this in order to set the oasis runtime.
    run_runtime = 0

    for model in self.expt.models:
        if model.model_type == 'cice' or model.model_type == 'cice5':
            # Horrible hack to make a link to o2i.nc in the
            # work/ice/RESTART directory
            f_name = 'o2i.nc'
            f_src = os.path.join(model.work_path, f_name)
            f_dst = os.path.join(model.work_restart_path, f_name)

            if os.path.isfile(f_src):
                make_symlink(f_src, f_dst)

        if model.model_type == 'cice5':
            # Stage the supplemental input files
            if model.prior_restart_path:
                for f_name in model.access_restarts:
                    f_src = os.path.join(model.prior_restart_path, f_name)
                    f_dst = os.path.join(model.work_input_path, f_name)

                    if os.path.isfile(f_src):
                        make_symlink(f_src, f_dst)

        if model.model_type in ('cice', 'matm'):

            # Update the supplemental OASIS namelists
            cpl_fname, cpl_group, runtime0_key = cpl_keys[model.model_type]

            cpl_fpath = os.path.join(model.work_path, cpl_fname)
            cpl_nml = f90nml.read(cpl_fpath)

            # Which calendar are we using, noleap or Gregorian.
            caltype = cpl_nml[cpl_group]['caltype']
            init_date = cal.int_to_date(cpl_nml[cpl_group]['init_date'])

            # Get time info about the beginning of this run. We're
            # interested in:
            #   1. start date of run
            #   2. total runtime of all previous runs.
            if model.prior_restart_path and not self.expt.repeat_run:
                prior_cpl_fpath = os.path.join(model.prior_restart_path,
                                               cpl_fname)

                # With later versions this file exists in the prior restart
                # path, but this was not always the case, so check, and if
                # not there use prior output path
                if not os.path.exists(prior_cpl_fpath):
                    print('payu: warning: {0} missing from prior restart '
                          'path; checking prior output.'.format(cpl_fname),
                          file=sys.stderr)
                    if not os.path.isdir(model.prior_output_path):
                        print('payu: error: No prior output path; '
                              'aborting run.')
                        sys.exit(errno.ENOENT)
                    prior_cpl_fpath = os.path.join(model.prior_output_path,
                                                   cpl_fname)

                try:
                    prior_cpl_nml = f90nml.read(prior_cpl_fpath)
                except IOError as exc:
                    if exc.errno == errno.ENOENT:
                        print('payu: error: {0} does not exist; aborting.'
                              ''.format(prior_cpl_fpath), file=sys.stderr)
                        sys.exit(exc.errno)
                    else:
                        raise

                cpl_nml_grp = prior_cpl_nml[cpl_group]

                # The total time in seconds since the beginning of
                # the experiment.
                total_runtime = int(cpl_nml_grp[runtime0_key] +
                                    cpl_nml_grp['runtime'])
                run_start_date = cal.date_plus_seconds(init_date,
                                                       total_runtime,
                                                       caltype)
            else:
                total_runtime = 0
                run_start_date = init_date

            # Get new runtime for this run. We get this from either the
            # 'runtime' part of the payu config, or from the namelist
            if self.expt.runtime:
                run_runtime = cal.runtime_from_date(
                    run_start_date,
                    self.expt.runtime['years'],
                    self.expt.runtime['months'],
                    self.expt.runtime['days'],
                    self.expt.runtime.get('seconds', 0),
                    caltype)
            else:
                run_runtime = cpl_nml[cpl_group]['runtime']

            # Now write out new run start date and total runtime.
            cpl_nml[cpl_group]['inidate'] = cal.date_to_int(run_start_date)
            cpl_nml[cpl_group][runtime0_key] = total_runtime
            cpl_nml[cpl_group]['runtime'] = int(run_runtime)

            if model.model_type == 'cice':
                if self.expt.counter and not self.expt.repeat_run:
                    cpl_nml[cpl_group]['jobnum'] = 1 + self.expt.counter
                else:
                    cpl_nml[cpl_group]['jobnum'] = 1

            nml_work_path = os.path.join(model.work_path, cpl_fname)
            f90nml.write(cpl_nml, nml_work_path + '~')
            shutil.move(nml_work_path + '~', nml_work_path)

    # Now change the oasis runtime. This needs to be done after the
    # others.
    for model in self.expt.models:
        if model.model_type == 'oasis':
            namcouple = os.path.join(model.work_path, 'namcouple')

            s = ''
            with open(namcouple, 'r+') as f:
                s = f.read()

            m = re.search(r"^[ \t]*\$RUNTIME.*?^[ \t]*(\d+)", s,
                          re.MULTILINE | re.DOTALL)
            assert(m is not None)

            s = s[:m.start(1)] + str(run_runtime) + s[m.end(1):]

            with open(namcouple, 'w') as f:
                f.write(s)
def run(self, *user_flags): # XXX: This was previously done in reversion envmod.setup() self.load_modules() f_out = open(self.stdout_fname, 'w') f_err = open(self.stderr_fname, 'w') # Set MPI environment variables env = self.config.get('env') # Explicitly check for `None`, in case of an empty `env:` entry if env is None: env = {} for var in env: if env[var] is None: env_value = '' else: env_value = str(env[var]) os.environ[var] = env_value mpi_config = self.config.get('mpi', {}) mpi_runcmd = mpi_config.get('runcmd', 'mpirun') if self.config.get('scalasca', False): mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd]) # MPI runtime flags mpi_flags = mpi_config.get('flags', []) if not mpi_flags: mpi_flags = self.config.get('mpirun', []) # TODO: Legacy config removal warning if type(mpi_flags) != list: mpi_flags = [mpi_flags] # TODO: More uniform support needed here if self.config.get('scalasca', False): mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags] # XXX: I think this may be broken if user_flags: mpi_flags.extend(list(user_flags)) if self.debug: mpi_flags.append('--debug') mpi_progs = [] for model in self.models: # Skip models without executables (e.g. couplers) if not model.exec_path_local: continue mpi_config = self.config.get('mpi', {}) mpi_module = mpi_config.get('module', None) # Update MPI library module (if not explicitly set) # TODO: Check for MPI library mismatch across multiple binaries if mpi_module is None: mpi_module = envmod.lib_update( model.exec_path_local, 'libmpi.so' ) model_prog = [] # Our MPICH wrapper does not support a working directory flag if not mpi_module.startswith('mvapich'): model_prog.append('-wdir {0}'.format(model.work_path)) # Append any model-specific MPI flags model_flags = model.config.get('mpiflags', []) if not isinstance(model_flags, list): model_prog.append(model_flags) else: model_prog.extend(model_flags) model_ncpus = model.config.get('ncpus') if model_ncpus: model_prog.append('-np {0}'.format(model_ncpus)) model_npernode = model.config.get('npernode') # TODO: New Open MPI format? 
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: '-map-by' expects a whole rank count
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None; do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)

        # Use the full path to the symlinked exec_name in work, as some
        # older MPI libraries complained that the executable was not in
        # PATH
        model_prog.append(os.path.join(model.work_path, model.exec_name))

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)

    # TODO: Consider making this default
    if self.config.get('coredump', False):
        enable_core_dump()

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # Dump out environment
    with open(self.env_fname, 'w') as file:
        file.write(yaml.dump(dict(os.environ), default_flow_style=False))

    self.runlog.create_manifest()
    if self.runlog.enabled:
        self.runlog.commit()

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated. Need to look into this.
    print(cmd)
    if env:
        # TODO: Replace with mpirun -x flag inputs
        proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                        env=os.environ.copy())
        proc.wait()
        rc = proc.returncode
    else:
        rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

    # Return to control directory
    if curdir:
        os.chdir(curdir)

    f_out.close()
    f_err.close()

    self.finish_time = datetime.datetime.now()

    info = get_job_info()

    if info is None:
        # Not being run under PBS, reverse engineer environment
        info = {
            'PAYU_PATH': os.path.dirname(self.payu_path)
        }

    # Add extra information to save to jobinfo
    info.update(
        {
            'PAYU_CONTROL_DIR': self.control_path,
            'PAYU_RUN_ID': self.run_id,
            'PAYU_CURRENT_RUN': self.counter,
            'PAYU_N_RUNS': self.n_runs,
            'PAYU_JOB_STATUS': rc,
            'PAYU_START_TIME': self.start_time.isoformat(),
            'PAYU_FINISH_TIME': self.finish_time.isoformat(),
            'PAYU_WALLTIME': "{0} s".format(
                (self.finish_time - self.start_time).total_seconds()
            ),
        }
    )

    # Dump job info
    with open(self.job_fname, 'w') as file:
        file.write(yaml.dump(info, default_flow_style=False))

    # Remove any empty output files (e.g. logs)
    for fname in os.listdir(self.work_path):
        fpath = os.path.join(self.work_path, fname)
        if os.path.getsize(fpath) == 0:
            os.remove(fpath)

    # Clean up any profiling output
    # TODO: Move after `rc` code check?
for prof in self.profilers: prof.postprocess() # TODO: Need a model-specific cleanup method call here # NOTE: This does not appear to catch hanging jobs killed by PBS if rc != 0: # Backup logs for failed runs error_log_dir = os.path.join(self.archive_path, 'error_logs') mkdir_p(error_log_dir) # NOTE: This is PBS-specific job_id = get_job_id(short=False) if job_id == '': job_id = self.run_id[:6] for fname in self.output_fnames: src = os.path.join(self.control_path, fname) stem, suffix = os.path.splitext(fname) dest = os.path.join(error_log_dir, ".".join((stem, job_id)) + suffix) print(src, dest) shutil.copyfile(src, dest) # Create the symlink to the logs if it does not exist make_symlink(self.archive_path, self.archive_sym_path) # Terminate payu sys.exit('payu: Model exited with error code {0}; aborting.' ''.format(rc)) # Decrement run counter on successful run stop_file_path = os.path.join(self.control_path, 'stop_run') if os.path.isfile(stop_file_path): assert os.stat(stop_file_path).st_size == 0 os.remove(stop_file_path) print('payu: Stop file detected; terminating resubmission.') self.n_runs = 0 else: self.n_runs -= 1 # Move logs to archive (or delete if empty) for f in self.output_fnames: f_path = os.path.join(self.control_path, f) if os.path.getsize(f_path) == 0: os.remove(f_path) else: shutil.move(f_path, self.work_path) run_script = self.userscripts.get('run') if run_script: self.run_userscript(run_script)
def run(self, *user_flags):
    # XXX: This was previously done in reversion
    envmod.setup()

    self.load_modules()

    f_out = open(self.stdout_fname, 'w')
    f_err = open(self.stderr_fname, 'w')

    # Set MPI environment variables
    env = self.config.get('env')

    # Explicitly check for `None`, in case of an empty `env:` entry
    if env is None:
        env = {}

    for var in env:
        if env[var] is None:
            env_value = ''
        else:
            env_value = str(env[var])
        os.environ[var] = env_value

    mpi_config = self.config.get('mpi', {})
    mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

    if self.config.get('scalasca', False):
        mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

    # MPI runtime flags
    mpi_flags = mpi_config.get('flags', [])
    if not mpi_flags:
        mpi_flags = self.config.get('mpirun', [])
        # TODO: Legacy config removal warning

    if type(mpi_flags) != list:
        mpi_flags = [mpi_flags]

    # TODO: More uniform support needed here
    if self.config.get('scalasca', False):
        mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

    # XXX: I think this may be broken
    if user_flags:
        mpi_flags.extend(list(user_flags))

    if self.debug:
        mpi_flags.append('--debug')

    mpi_progs = []
    for model in self.models:

        # Skip models without executables (e.g. couplers)
        if not model.exec_path:
            continue

        mpi_config = self.config.get('mpi', {})
        mpi_module = mpi_config.get('module', None)

        # Update MPI library module (if not explicitly set)
        # TODO: Check for MPI library mismatch across multiple binaries
        if mpi_module is None:
            mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

        model_prog = []

        # Our MPICH wrapper does not support a working directory flag
        if not mpi_module.startswith('mvapich'):
            model_prog.append('-wdir {0}'.format(model.work_path))

        # Append any model-specific MPI flags
        model_flags = model.config.get('mpiflags', [])
        if not isinstance(model_flags, list):
            model_prog.append(model_flags)
        else:
            model_prog.extend(model_flags)

        model_ncpus = model.config.get('ncpus')
        if model_ncpus:
            model_prog.append('-np {0}'.format(model_ncpus))

        model_npernode = model.config.get('npernode')
        # TODO: New Open MPI format?
        if model_npernode:
            if model_npernode % 2 == 0:
                # Integer division: '-map-by' expects a whole rank count
                npernode_flag = ('-map-by ppr:{0}:socket'
                                 ''.format(model_npernode // 2))
            else:
                npernode_flag = ('-map-by ppr:{0}:node'
                                 ''.format(model_npernode))

            if self.config.get('scalasca', False):
                npernode_flag = '\"{0}\"'.format(npernode_flag)
            model_prog.append(npernode_flag)

        if self.config.get('hpctoolkit', False):
            os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
            model_prog.append('hpcrun')

        for prof in self.profilers:
            if prof.runscript:
                # NOTE: list.append returns None; do not reassign
                model_prog.append(prof.runscript)

        model_prog.append(model.exec_prefix)
        model_prog.append(model.exec_path)

        mpi_progs.append(' '.join(model_prog))

    cmd = '{runcmd} {flags} {exes}'.format(
        runcmd=mpi_runcmd,
        flags=' '.join(mpi_flags),
        exes=' : '.join(mpi_progs)
    )

    for prof in self.profilers:
        cmd = prof.wrapper(cmd)

    # Expand shell variables inside flags
    if self.expand_shell_vars:
        cmd = os.path.expandvars(cmd)
    print(cmd)

    # Our MVAPICH wrapper does not support working directories
    if mpi_module.startswith('mvapich'):
        curdir = os.getcwd()
        os.chdir(self.work_path)
    else:
        curdir = None

    # NOTE: This may not be necessary, since env seems to be getting
    # correctly updated. Need to look into this.
if env: # TODO: Replace with mpirun -x flag inputs proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err, env=os.environ.copy()) proc.wait() rc = proc.returncode else: rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err) # Return to control directory if curdir: os.chdir(curdir) if self.runlog: self.runlog.commit() f_out.close() f_err.close() # Remove any empty output files (e.g. logs) for fname in os.listdir(self.work_path): fpath = os.path.join(self.work_path, fname) if os.path.getsize(fpath) == 0: os.remove(fpath) # Clean up any profiling output # TODO: Move after `rc` code check? for prof in self.profilers: prof.postprocess() # TODO: Need a model-specific cleanup method call here # NOTE: This does not appear to catch hanging jobs killed by PBS if rc != 0: # Backup logs for failed runs error_log_dir = os.path.join(self.archive_path, 'error_logs') mkdir_p(error_log_dir) # NOTE: This is PBS-specific job_id = os.environ.get('PBS_JOBID', '') for fname in (self.stdout_fname, self.stderr_fname): src = os.path.join(self.control_path, fname) # NOTE: This assumes standard .out/.err extensions dest = os.path.join(error_log_dir, fname[:-4] + '.' + job_id + fname[-4:]) print(src, dest) shutil.copyfile(src, dest) # Create the symlink to the logs if it does not exist make_symlink(self.archive_path, self.archive_sym_path) # Terminate payu sys.exit('payu: Model exited with error code {0}; aborting.' ''.format(rc)) # Decrement run counter on successful run stop_file_path = os.path.join(self.control_path, 'stop_run') if os.path.isfile(stop_file_path): assert os.stat(stop_file_path).st_size == 0 os.remove(stop_file_path) print('payu: Stop file detected; terminating resubmission.') self.n_runs = 0 else: self.n_runs -= 1 # Move logs to archive (or delete if empty) for f in (self.stdout_fname, self.stderr_fname): f_path = os.path.join(self.control_path, f) if os.path.getsize(f_path) == 0: os.remove(f_path) else: shutil.move(f_path, self.work_path) run_script = self.userscripts.get('run') if run_script: self.run_userscript(run_script)
def setup(self): cpl_keys = { 'cice': ('input_ice.nml', 'coupling_nml', 'runtime0'), 'matm': ('input_atm.nml', 'coupling', 'truntime0') } # Keep track of this in order to set the oasis runtime. run_runtime = 0 for model in self.expt.models: if model.model_type == 'cice': # Stage the supplemental input files if model.prior_restart_path: for f_name in model.access_restarts: f_src = os.path.join(model.prior_restart_path, f_name) f_dst = os.path.join(model.work_input_path, f_name) if os.path.isfile(f_src): make_symlink(f_src, f_dst) if model.model_type in ('cice', 'matm'): # Update the supplemental OASIS namelists cpl_fname, cpl_group, runtime0_key = cpl_keys[model.model_type] cpl_fpath = os.path.join(model.work_path, cpl_fname) cpl_nml = f90nml.read(cpl_fpath) # Which calendar are we using, noleap or Gregorian. caltype = cpl_nml[cpl_group]['caltype'] init_date = cal.int_to_date(cpl_nml[cpl_group]['init_date']) # Get time info about the beginning of this run. We're # interested in: # 1. start date of run # 2. total runtime of all previous runs. if model.prior_output_path and not self.expt.repeat_run: prior_cpl_fpath = os.path.join(model.prior_output_path, cpl_fname) prior_cpl_nml = f90nml.read(prior_cpl_fpath) cpl_nml_grp = prior_cpl_nml[cpl_group] # The total time in seconds since the beginning of # the experiment. total_runtime = int(cpl_nml_grp[runtime0_key] + cpl_nml_grp['runtime']) run_start_date = cal.date_plus_seconds( init_date, total_runtime, caltype) else: total_runtime = 0 run_start_date = init_date # Get new runtime for this run. We get this from either the # 'runtime' part of the payu config, or from the namelist if self.expt.runtime: run_runtime = cal.runtime_from_date( run_start_date, self.expt.runtime['years'], self.expt.runtime['months'], self.expt.runtime['days'], self.expt.runtime.get('seconds', 0), caltype) else: run_runtime = cpl_nml[cpl_group]['runtime'] # Now write out new run start date and total runtime. cpl_nml[cpl_group]['inidate'] = cal.date_to_int(run_start_date) cpl_nml[cpl_group][runtime0_key] = total_runtime cpl_nml[cpl_group]['runtime'] = int(run_runtime) if model.model_type == 'cice': if self.expt.counter and not self.expt.repeat_run: cpl_nml[cpl_group]['jobnum'] = 1 + self.expt.counter else: cpl_nml[cpl_group]['jobnum'] = 1 nml_work_path = os.path.join(model.work_path, cpl_fname) f90nml.write(cpl_nml, nml_work_path + '~') shutil.move(nml_work_path + '~', nml_work_path) # Now change the oasis runtime. This needs to be done after the others. for model in self.expt.models: if model.model_type == 'oasis': namcouple = os.path.join(model.work_path, 'namcouple') s = '' with open(namcouple, 'r+') as f: s = f.read() m = re.search(r"^[ \t]*\$RUNTIME.*?^[ \t]*(\d+)", s, re.MULTILINE | re.DOTALL) assert (m is not None) s = s[:m.start(1)] + str(run_runtime) + s[m.end(1):] with open(namcouple, 'w') as f: f.write(s)
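# The $RUNTIME patch above can be exercised in isolation: the regex captures
# the first integer on a line after the $RUNTIME keyword, spanning lines via
# DOTALL. The namcouple fragment below is a made-up example.
import re

s = ' $RUNTIME\n# total simulated time, in seconds\n  86400\n $END\n'
m = re.search(r"^[ \t]*\$RUNTIME.*?^[ \t]*(\d+)", s,
              re.MULTILINE | re.DOTALL)
assert m is not None
s = s[:m.start(1)] + str(172800) + s[m.end(1):]
# s now carries 172800 in place of 86400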