# Imports used by this section; the project-internal names referenced below
# (api, get_n_processors, get_run_desc_value, load_run_desc, log, SYSTEM,
# SEPARATE_DEFLATE_JOBS) are assumed to be defined elsewhere in the package,
# as are the module helpers _td2hms(), _modules(), _execute(),
# _fix_permissions(), and _cleanup() that are defined outside this section.
import copy
import datetime
import math
import os
import textwrap
from pathlib import Path

import arrow
import f90nml
import yaml


def _write_segment_desc_file(run_desc, desc_file, restart_dir, segment_namrun,
                             tmp_run_desc_dir):
    """
    :param dict run_desc: Run description dictionary.

    :param str desc_file: Name of the run description YAML file for the
                          segment.

    :param restart_dir: Directory path in which to find the restart file(s)
                        for the segment.
                        Use :py:obj:`None` for segment 0 to avoid replacing
                        the restart directory path in the base run
                        description YAML file.
    :type restart_dir: :py:class:`pathlib.Path` or None

    :param segment_namrun: File path and name of the namelist section file
                           containing namrun for the segment.
    :type segment_namrun: :py:class:`pathlib.Path`

    :param tmp_run_desc_dir: Temporary directory where the namelists and run
                             description files for segments are stored.
    :type tmp_run_desc_dir: :py:class:`pathlib.Path`

    :return: Run description dict updated with namrun namelist section and
             restart file(s) paths,
             and file path and name of the temporary run description file
             for the segment.
    :rtype: 2-tuple
    """
    # namrun namelist for segment
    namelist_namrun = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namrun"))
    namelist_namrun_index = run_desc["namelists"]["namelist_cfg"].index(
        namelist_namrun)
    run_desc["namelists"]["namelist_cfg"][namelist_namrun_index] = os.fspath(
        segment_namrun)
    # restart file(s) for segment
    if restart_dir is not None:
        nml = f90nml.read(segment_namrun)
        restart_timestep = nml["namrun"]["nn_it000"] - 1
        for name, path in get_run_desc_value(run_desc, ("restart",)).items():
            path = Path(path)
            name_head = path.name.split("_")[0]
            name_tail = path.name.split("_", 2)[-1]
            restart_path = (
                restart_dir /
                "{name_head}_{restart_timestep:08d}_{name_tail}".format(
                    name_head=name_head,
                    restart_timestep=restart_timestep,
                    name_tail=name_tail,
                ))
            run_desc["restart"][name] = os.fspath(restart_path)
    # walltime for segment
    segment_walltime = get_run_desc_value(
        run_desc, ("segmented run", "segment walltime"))
    run_desc["walltime"] = segment_walltime
    # write temporary run description file for segment
    with (tmp_run_desc_dir / desc_file).open("wt") as f:
        yaml.safe_dump(run_desc, f, default_flow_style=False)
    return run_desc, tmp_run_desc_dir / desc_file
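
# Illustrative example of the restart path rewriting above (hypothetical
# file names): with nn_it000 = 2160001 in the segment's namrun namelist,
# restart_timestep is 2160000, and a restart entry like
#   restart.nc: /results/SalishSea/SalishSea_02151360_restart.nc
# is rewritten to
#   <restart_dir>/SalishSea_02160000_restart.nc
# because name_head is "SalishSea" and name_tail is "restart.nc".
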
def _calc_n_segments(run_desc):
    """Calculate the number of segments in a segmented run.

    :param dict run_desc: Run description dictionary.

    :return: Number of segments.
    :rtype: int
    """
    run_start_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "start date")))
    run_end_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "end date")))
    days_per_segment = get_run_desc_value(
        run_desc, ("segmented run", "days per segment"))
    # +1 day because the run end date is inclusive; any partial final
    # segment counts as a whole segment
    n_segments_delta = (run_end_date.shift(days=+1) -
                        run_start_date) / days_per_segment
    n_segments = n_segments_delta.days + math.ceil(n_segments_delta.seconds /
                                                   (60 * 60 * 24))
    return n_segments
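
# Illustrative doctest-style example (not executed; assumes a minimal run
# description): a 10 day run split into 3 day segments needs 4 segments
# because the 1 day remainder gets its own segment:
#
#   >>> run_desc = {"segmented run": {"start date": "2019-01-01",
#   ...                               "end date": "2019-01-10",
#   ...                               "days per segment": 3}}
#   >>> _calc_n_segments(run_desc)
#   4
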
def _definitions(run_desc, run_desc_file, run_dir, results_dir, deflate):
    """Return a block of bash variable definitions (run id, directories,
    and salishsea sub-command invocations) for inclusion in the batch
    script.
    """
    salishsea_cmd = {
        "beluga": Path("${HOME}", ".local", "bin", "salishsea"),
        "cedar": Path("${HOME}", ".local", "bin", "salishsea"),
        "delta": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "graham": Path("${HOME}", ".local", "bin", "salishsea"),
        "omega": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "orcinus": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "sigma": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "salish": Path("${HOME}", ".local", "bin", "salishsea"),
        "seawolf1": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "seawolf2": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "seawolf3": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "sockeye": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
    }.get(SYSTEM, Path("${HOME}", ".local", "bin", "salishsea"))
    defns = ('RUN_ID="{run_id}"\n'
             'RUN_DESC="{run_dir}/{run_desc_file}"\n'
             'WORK_DIR="{run_dir}"\n'
             'RESULTS_DIR="{results_dir}"\n'
             'COMBINE="{salishsea_cmd} combine"\n').format(
                 run_id=get_run_desc_value(run_desc, ("run_id",)),
                 run_desc_file=run_desc_file.name,
                 run_dir=run_dir,
                 results_dir=results_dir,
                 salishsea_cmd=salishsea_cmd,
             )
    if deflate:
        defns += 'DEFLATE="{salishsea_cmd} deflate"\n'.format(
            salishsea_cmd=salishsea_cmd)
    defns += 'GATHER="{salishsea_cmd} gather"\n'.format(
        salishsea_cmd=salishsea_cmd)
    return defns
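
# Illustrative output (hypothetical run id and paths) for a deflate run on
# a ${HOME}-style system:
#
#   RUN_ID="my_run"
#   RUN_DESC="/tmp/run_dir/my_run.yaml"
#   WORK_DIR="/tmp/run_dir"
#   RESULTS_DIR="/results/my_run"
#   COMBINE="${HOME}/.local/bin/salishsea combine"
#   DEFLATE="${HOME}/.local/bin/salishsea deflate"
#   GATHER="${HOME}/.local/bin/salishsea gather"
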
def _build_tmp_run_dir(
        run_desc,
        desc_file,
        results_dir,
        cedar_broadwell,
        deflate,
        max_deflate_jobs,
        separate_deflate,
        nocheck_init,
        quiet,
):
    """Create the temporary run directory and write the batch script(s)
    into it.

    :return: Path of the temporary run directory,
             and path of the :file:`SalishSeaNEMO.sh` batch script.
    :rtype: 2-tuple
    """
    run_dir = api.prepare(desc_file, nocheck_init)
    if not quiet:
        log.info("Created run directory {}".format(run_dir))
    nemo_processors = get_n_processors(run_desc, run_dir)
    separate_xios_server = get_run_desc_value(
        run_desc, ("output", "separate XIOS server"))
    if separate_xios_server:
        xios_processors = get_run_desc_value(run_desc,
                                             ("output", "XIOS servers"))
    else:
        xios_processors = 0
    batch_script = _build_batch_script(
        run_desc,
        desc_file,
        nemo_processors,
        xios_processors,
        max_deflate_jobs,
        results_dir,
        run_dir,
        deflate,
        separate_deflate,
        cedar_broadwell,
    )
    batch_file = run_dir / "SalishSeaNEMO.sh"
    with batch_file.open("wt") as f:
        f.write(batch_script)
    if separate_deflate:
        for deflate_job, pattern in SEPARATE_DEFLATE_JOBS.items():
            deflate_script = _build_deflate_script(run_desc, pattern,
                                                   deflate_job, results_dir)
            script_file = run_dir / "deflate_{}.sh".format(deflate_job)
            with script_file.open("wt") as f:
                f.write(deflate_script)
    return run_dir, batch_file
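
# After this function runs, the temporary run directory contains
# SalishSeaNEMO.sh (the batch script for the NEMO run) and, when
# separate_deflate is True, one deflate_<result_type>.sh script per entry
# in SEPARATE_DEFLATE_JOBS (result types are 'grid', 'ptrc', and 'dia').
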
def _calc_run_segments(desc_file, results_dir):
    """Calculate the list of run segments for a segmented run.

    For a run description without a "segmented run" section, the list
    contains a single item for the whole run and an empty namelist patch.

    :return: List of (run description dict, YAML file name, results
             directory, namrun namelist patch) tuples, one per segment,
             and the first segment number.
    :rtype: 2-tuple
    """
    run_desc = load_run_desc(desc_file)
    if "segmented run" not in run_desc:
        first_seg_no = 0
        return [(run_desc, desc_file, results_dir, {})], first_seg_no
    base_run_id = get_run_desc_value(run_desc, ("run_id",))
    start_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "start date")))
    start_timestep = get_run_desc_value(run_desc,
                                        ("segmented run", "start time step"))
    end_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "end date")))
    days_per_segment = get_run_desc_value(
        run_desc, ("segmented run", "days per segment"))
    namelist_namdom = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namdom"), expand_path=True)
    rn_rdt = f90nml.read(namelist_namdom)["namdom"]["rn_rdt"]
    timesteps_per_day = 24 * 60 * 60 / rn_rdt
    n_segments = _calc_n_segments(run_desc)
    run_segments = []
    first_seg_no = get_run_desc_value(
        run_desc, ("segmented run", "first segment number"))
    for i, seg_no in enumerate(range(first_seg_no,
                                     first_seg_no + n_segments)):
        segment_run_id = "{seg_no}_{base_run_id}".format(
            seg_no=seg_no, base_run_id=base_run_id)
        segment_run_desc = copy.deepcopy(run_desc)
        segment_run_desc["run_id"] = segment_run_id
        nn_it000 = int(start_timestep +
                       i * days_per_segment * timesteps_per_day)
        date0 = min(start_date.shift(days=+i * days_per_segment), end_date)
        segment_days = min(
            days_per_segment,
            (end_date -
             start_date.shift(days=+i * days_per_segment)).days + 1,
        )
        nn_itend = int(nn_it000 + segment_days * timesteps_per_day - 1)
        run_segments.append((
            # Run description dict for the segment
            segment_run_desc,
            # Run description YAML file name for the segment
            "{file_stem}_{seg_no}{suffix}".format(file_stem=desc_file.stem,
                                                  seg_no=seg_no,
                                                  suffix=desc_file.suffix),
            # Results directory for the segment
            results_dir.parent / "{dir_name}_{seg_no}".format(
                dir_name=results_dir.name, seg_no=seg_no),
            # f90nml namelist patch for the segment for the namelist
            # containing namrun
            {
                "namrun": {
                    "nn_it000": nn_it000,
                    "nn_itend": nn_itend,
                    "nn_date0": int(date0.format("YYYYMMDD")),
                }
            },
        ))
    return run_segments, first_seg_no
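
# Worked example (hypothetical values): for start date 2019-01-01, start
# time step 1, end date 2019-01-10, 3 days per segment, and rn_rdt = 40.0
# (2160 time steps per day), the namrun patches are:
#
#   segment 0: nn_it000 =     1, nn_itend =  6480, nn_date0 = 20190101
#   segment 1: nn_it000 =  6481, nn_itend = 12960, nn_date0 = 20190104
#   segment 2: nn_it000 = 12961, nn_itend = 19440, nn_date0 = 20190107
#   segment 3: nn_it000 = 19441, nn_itend = 21600, nn_date0 = 20190110
#
# The final segment is 1 day long because it holds the remainder of the
# 10 day run.
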
def _write_segment_namrun_namelist(run_desc, namelist_namrun_patch,
                                   tmp_run_desc_dir):
    """
    :param dict run_desc: Run description dictionary.

    :param dict namelist_namrun_patch: f90nml patch for the namrun namelist
                                       for the segment.

    :param tmp_run_desc_dir: Temporary directory where the namelists and run
                             description files for segments are stored.
    :type tmp_run_desc_dir: :py:class:`pathlib.Path`

    :return: File path and name of the namelist section file containing the
             namrun namelist for the segment.
    :rtype: :py:class:`pathlib.Path`
    """
    namelist_namrun = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namrun"), expand_path=True)
    f90nml.patch(namelist_namrun, namelist_namrun_patch,
                 tmp_run_desc_dir / namelist_namrun.name)
    return tmp_run_desc_dir / namelist_namrun.name
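
# f90nml.patch() copies the base namelist file, replacing only the values
# named in the patch dict; e.g. a patch like
#   {"namrun": {"nn_it000": 6481, "nn_itend": 12960, "nn_date0": 20190104}}
# (as produced by _calc_run_segments() above) updates those 3 variables and
# leaves the rest of the namrun namelist unchanged.
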
def _build_deflate_script(run_desc, pattern, result_type, results_dir):
    script = "#!/bin/bash\n"
    try:
        email = get_run_desc_value(run_desc, ("email",), fatal=False)
    except KeyError:
        email = "{user}@eoas.ubc.ca".format(user=os.getenv("USER"))
    # ptrc (tracer) results get more memory per processor
    pmem = "2500mb" if result_type == "ptrc" else "2000mb"
    script = "\n".join((
        script,
        "{pbs_directives}\n".format(pbs_directives=_pbs_directives(
            run_desc,
            1,
            email,
            results_dir,
            pmem=pmem,
            deflate=True,
            result_type=result_type,
        )),
    ))
    script += ('RESULTS_DIR="{results_dir}"\n'
               'DEFLATE="${{PBS_O_HOME}}/.local/bin/salishsea deflate"\n'
               "\n"
               "{modules}\n"
               "cd ${{RESULTS_DIR}}\n"
               'echo "Results deflation started at $(date)"\n'
               "${{DEFLATE}} {pattern} --jobs 1 --debug\n"
               "DEFLATE_EXIT_CODE=$?\n"
               'echo "Results deflation ended at $(date)"\n'
               "\n"
               "chmod g+rw ${{RESULTS_DIR}}/*\n"
               "chmod o+r ${{RESULTS_DIR}}/*\n"
               "\n"
               "exit ${{DEFLATE_EXIT_CODE}}\n").format(
                   results_dir=results_dir,
                   modules=_modules(),
                   pattern=pattern)
    return script
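
# Illustrative excerpt of a generated deflate script body (hypothetical
# results directory; PBS directives and module loads omitted, and
# <pattern> stands for the file glob supplied from SEPARATE_DEFLATE_JOBS):
#
#   RESULTS_DIR="/results/my_run"
#   DEFLATE="${PBS_O_HOME}/.local/bin/salishsea deflate"
#
#   cd ${RESULTS_DIR}
#   echo "Results deflation started at $(date)"
#   ${DEFLATE} <pattern> --jobs 1 --debug
#   DEFLATE_EXIT_CODE=$?
#   echo "Results deflation ended at $(date)"
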
def _pbs_directives(
        run_desc,
        n_processors,
        email,
        results_dir,
        procs_per_node=0,
        pmem="2000mb",
        deflate=False,
        result_type="",
        stderr_stdout=True,
):
    """Return the PBS directives used to run NEMO on a cluster that uses the
    TORQUE resource manager for job scheduling.

    The string that is returned is intended for inclusion in a bash script
    that will be submitted to the cluster queue manager via the
    :command:`qsub` command.

    :param dict run_desc: Run description dictionary.

    :param int n_processors: Number of processors that the run will be
                             executed on;
                             i.e. the sum of NEMO and XIOS processors.

    :param str email: Email address to send job begin, end & abort
                      notifications to.

    :param results_dir: Directory to store results into.
    :type results_dir: :py:class:`pathlib.Path`

    :param int procs_per_node: Number of processors per node.
                               Defaults to 0 to produce the
                               :kbd:`#PBS -l procs=n_processors` directive.
                               Otherwise produces the
                               :kbd:`#PBS -l nodes=n:ppn=procs_per_node`
                               directive.

    :param str pmem: Memory per processor.

    :param boolean deflate: Return directives for a run results deflation
                            job when :py:obj:`True`.

    :param str result_type: Run result type ('grid', 'ptrc', or 'dia') for
                            deflation job.

    :param boolean stderr_stdout: When :py:obj:`False`, don't include
                                  directives to put stderr and stdout in the
                                  results directory.
                                  Added for use in run scripts generated by
                                  the :kbd:`run_NEMO` worker that do
                                  per-command redirection to stderr and
                                  stdout.

    :returns: PBS directives for run script.
    :rtype: Unicode str
    """
    run_id = get_run_desc_value(run_desc, ("run_id",))
    if not procs_per_node:
        procs_directive = "#PBS -l procs={procs}".format(procs=n_processors)
    else:
        nodes = math.ceil(n_processors / procs_per_node)
        if SYSTEM == "sockeye":
            procs_directive = (
                "#PBS -l select={nodes}:ncpus={procs_per_node}"
                ":mpiprocs={procs_per_node}:mem=186gb").format(
                    nodes=nodes, procs_per_node=procs_per_node)
        else:
            procs_directive = (
                "#PBS -l nodes={nodes}:ppn={procs_per_node}").format(
                    nodes=nodes, procs_per_node=procs_per_node)
    if deflate:
        run_id = "{result_type}_{run_id}_deflate".format(
            run_id=run_id, result_type=result_type)
    # walltime may be given as an int number of seconds or as a
    # HH:MM:SS string
    try:
        td = datetime.timedelta(
            seconds=get_run_desc_value(run_desc, ("walltime",)))
    except TypeError:
        t = datetime.datetime.strptime(
            get_run_desc_value(run_desc, ("walltime",)), "%H:%M:%S").time()
        td = datetime.timedelta(hours=t.hour, minutes=t.minute,
                                seconds=t.second)
    walltime = _td2hms(td)
    pbs_directives = textwrap.dedent("""\
        #PBS -N {run_id}
        #PBS -S /bin/bash
        #PBS -l walltime={walltime}
        # email when the job [b]egins and [e]nds, or is [a]borted
        #PBS -m bea
        #PBS -M {email}
        """).format(run_id=run_id, walltime=walltime, email=email)
    if SYSTEM == "sockeye":
        pbs_directives += textwrap.dedent("""\
            #PBS -A st-sallen1-1
            {procs_directive}
            """).format(procs_directive=procs_directive)
    else:
        if SYSTEM == "orcinus" or SYSTEM.startswith("seawolf"):
            pbs_directives += "#PBS -l partition=QDR\n"
        if SYSTEM == "salish":
            pbs_directives += textwrap.dedent("""\
                {procs_directive}
                # total memory for job
                #PBS -l mem=64gb
                """).format(procs_directive=procs_directive)
        else:
            pbs_directives += textwrap.dedent("""\
                {procs_directive}
                # memory per processor
                #PBS -l pmem={pmem}
                """).format(procs_directive=procs_directive, pmem=pmem)
    if stderr_stdout:
        stdout = ("stdout_deflate_{result_type}".format(
            result_type=result_type) if deflate else "stdout")
        stderr = ("stderr_deflate_{result_type}".format(
            result_type=result_type) if deflate else "stderr")
        pbs_directives += ("# stdout and stderr file paths/names\n"
                           "#PBS -o {results_dir}/{stdout}\n"
                           "#PBS -e {results_dir}/{stderr}\n").format(
                               results_dir=results_dir,
                               stdout=stdout,
                               stderr=stderr)
    return pbs_directives
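
# Illustrative output (hypothetical run id, email, and results directory)
# for a 92 processor run with procs_per_node=0 on a pmem-style system:
#
#   #PBS -N my_run
#   #PBS -S /bin/bash
#   #PBS -l walltime=10:00:00
#   # email when the job [b]egins and [e]nds, or is [a]borted
#   #PBS -m bea
#   #PBS -M someone@eoas.ubc.ca
#   #PBS -l procs=92
#   # memory per processor
#   #PBS -l pmem=2000mb
#   # stdout and stderr file paths/names
#   #PBS -o /results/my_run/stdout
#   #PBS -e /results/my_run/stderr
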
def _sbatch_directives(
        run_desc,
        n_processors,
        cedar_broadwell,
        email,
        results_dir,
        mem="0",
        deflate=False,
        result_type="",
):
    """Return the SBATCH directives used to run NEMO on a cluster that uses
    the Slurm Workload Manager for job scheduling.

    The string that is returned is intended for inclusion in a bash script
    that will be submitted to the cluster queue manager via the
    :command:`sbatch` command.

    :param dict run_desc: Run description dictionary.

    :param int n_processors: Number of processors that the run will be
                             executed on;
                             the sum of NEMO and XIOS processors.

    :param boolean cedar_broadwell: Use broadwell (32 cores/node) nodes on
                                    cedar.

    :param str email: Email address to send job begin, end & abort
                      notifications to.

    :param results_dir: Directory to store results into.
    :type results_dir: :py:class:`pathlib.Path`

    :param str mem: Memory per node.

    :param boolean deflate: Return directives for a run results deflation
                            job when :py:obj:`True`.

    :param str result_type: Run result type ('grid', 'ptrc', or 'dia') for
                            deflation job.

    :returns: SBATCH directives for run script.
    :rtype: Unicode str
    """
    run_id = get_run_desc_value(run_desc, ("run_id",))
    constraint = ("broadwell"
                  if SYSTEM == "cedar" and cedar_broadwell else "skylake")
    try:
        processors_per_node = {
            "beluga": 40,
            "cedar": 48,
            "graham": 32,
        }[SYSTEM]
    except KeyError:
        log.error("unknown system: {system}".format(system=SYSTEM))
        raise SystemExit(2)
    if SYSTEM == "cedar" and cedar_broadwell:
        processors_per_node = 32
    nodes = math.ceil(n_processors / processors_per_node)
    mem = {"beluga": "92G", "cedar": "0", "graham": "0"}.get(SYSTEM, mem)
    if deflate:
        run_id = "{result_type}_{run_id}_deflate".format(
            run_id=run_id, result_type=result_type)
    # walltime may be given as an int number of seconds or as a
    # HH:MM:SS string
    try:
        td = datetime.timedelta(
            seconds=get_run_desc_value(run_desc, ("walltime",)))
    except TypeError:
        t = datetime.datetime.strptime(
            get_run_desc_value(run_desc, ("walltime",)), "%H:%M:%S").time()
        td = datetime.timedelta(hours=t.hour, minutes=t.minute,
                                seconds=t.second)
    walltime = _td2hms(td)
    if SYSTEM == "cedar":
        sbatch_directives = ("#SBATCH --job-name={run_id}\n"
                             "#SBATCH --constraint={constraint}\n").format(
                                 run_id=run_id, constraint=constraint)
    else:
        sbatch_directives = "#SBATCH --job-name={run_id}\n".format(
            run_id=run_id)
    sbatch_directives += ("#SBATCH --nodes={nodes}\n"
                          "#SBATCH --ntasks-per-node={processors_per_node}\n"
                          "#SBATCH --mem={mem}\n"
                          "#SBATCH --time={walltime}\n"
                          "#SBATCH --mail-user={email}\n"
                          "#SBATCH --mail-type=ALL\n").format(
                              nodes=int(nodes),
                              processors_per_node=processors_per_node,
                              mem=mem,
                              walltime=walltime,
                              email=email,
                          )
    try:
        account = get_run_desc_value(run_desc, ("account",), fatal=False)
        sbatch_directives += "#SBATCH --account={account}\n".format(
            account=account)
    except KeyError:
        account = ("rrg-allen"
                   if SYSTEM in {"beluga", "cedar"} else "def-allen")
        sbatch_directives += "#SBATCH --account={account}\n".format(
            account=account)
        log.info(
            ("No account found in run description YAML file, "
             "so assuming {account}. If sbatch complains you can specify a "
             "different account with a YAML line like account: def-allen"
            ).format(account=account))
    stdout = ("stdout_deflate_{result_type}".format(result_type=result_type)
              if deflate else "stdout")
    stderr = ("stderr_deflate_{result_type}".format(result_type=result_type)
              if deflate else "stderr")
    sbatch_directives += ("# stdout and stderr file paths/names\n"
                          "#SBATCH --output={stdout}\n"
                          "#SBATCH --error={stderr}\n").format(
                              stdout=results_dir / stdout,
                              stderr=results_dir / stderr)
    return sbatch_directives
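
# Illustrative output (hypothetical run id, email, and results directory)
# for a 92 processor run on graham (32 processors/node -> 3 nodes):
#
#   #SBATCH --job-name=my_run
#   #SBATCH --nodes=3
#   #SBATCH --ntasks-per-node=32
#   #SBATCH --mem=0
#   #SBATCH --time=10:00:00
#   #SBATCH --mail-user=someone@eoas.ubc.ca
#   #SBATCH --mail-type=ALL
#   #SBATCH --account=def-allen
#   # stdout and stderr file paths/names
#   #SBATCH --output=/results/my_run/stdout
#   #SBATCH --error=/results/my_run/stderr
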
def _build_batch_script(
        run_desc,
        desc_file,
        nemo_processors,
        xios_processors,
        max_deflate_jobs,
        results_dir,
        run_dir,
        deflate,
        separate_deflate,
        cedar_broadwell,
):
    """Build the Bash script that will execute the run.

    :param dict run_desc: Run description dictionary.

    :param desc_file: File path/name of the YAML run description file.
    :type desc_file: :py:class:`pathlib.Path`

    :param int nemo_processors: Number of processors that NEMO will be
                                executed on.

    :param int xios_processors: Number of processors that XIOS will be
                                executed on.

    :param int max_deflate_jobs: Maximum number of concurrent sub-processes
                                 to use for netCDF deflating.

    :param results_dir: Path of the directory in which to store the run
                        results;
                        it will be created if it does not exist.
    :type results_dir: :py:class:`pathlib.Path`

    :param run_dir: Path of the temporary run directory.
    :type run_dir: :py:class:`pathlib.Path`

    :param boolean deflate: Include "salishsea deflate" command in the bash
                            script.

    :param boolean separate_deflate: Produce separate bash scripts to
                                     deflate the run results and qsub them
                                     to run as serial jobs after the NEMO
                                     run finishes.

    :param boolean cedar_broadwell: Use broadwell (32 cores/node) nodes on
                                    cedar.

    :returns: Bash script to execute the run.
    :rtype: str
    """
    script = "#!/bin/bash\n"
    try:
        email = get_run_desc_value(run_desc, ("email",), fatal=False)
    except KeyError:
        email = "{user}@eoas.ubc.ca".format(user=os.getenv("USER"))
    if SYSTEM in {"beluga", "cedar", "graham"}:
        script = "\n".join((
            script,
            "{sbatch_directives}\n".format(
                sbatch_directives=_sbatch_directives(
                    run_desc,
                    nemo_processors + xios_processors,
                    cedar_broadwell,
                    email,
                    results_dir,
                )),
        ))
    else:
        procs_per_node = {
            "delta": 20,
            "omega": 20,
            "sigma": 20,
            "sockeye": 40,
            "orcinus": 12,
            "seawolf1": 12,
            "seawolf2": 12,
            "seawolf3": 12,
        }.get(SYSTEM, 0)
        script = "\n".join((
            script,
            "{pbs_directives}\n".format(pbs_directives=_pbs_directives(
                run_desc,
                nemo_processors + xios_processors,
                email,
                results_dir,
                procs_per_node,
            )),
        ))
    script = "\n".join((
        script,
        "{defns}\n"
        "{modules}\n"
        "{execute}\n"
        "{fix_permissions}\n"
        "{cleanup}".format(
            defns=_definitions(run_desc, desc_file, run_dir, results_dir,
                               deflate),
            modules=_modules(),
            execute=_execute(
                nemo_processors,
                xios_processors,
                deflate,
                max_deflate_jobs,
                separate_deflate,
            ),
            fix_permissions=_fix_permissions(),
            cleanup=_cleanup(),
        ),
    ))
    return script
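
# The assembled batch script therefore has this overall shape (section
# contents come from the helper functions named in the comments):
#
#   #!/bin/bash
#   <scheduler directives>      # _sbatch_directives() or _pbs_directives()
#   <variable definitions>      # _definitions()
#   <module loads>              # _modules()
#   <NEMO/XIOS execution>       # _execute()
#   <permission fixes>          # _fix_permissions()
#   <run directory cleanup>     # _cleanup()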