Example #1
def _write_segment_desc_file(run_desc, desc_file, restart_dir, segment_namrun,
                             tmp_run_desc_dir):
    """
    :param dict run_desc: Run description dictionary.

    :param str desc_file: Name of the run description YAML file for the segment.

    :param restart_dir: Directory path in which to find the restart file(s)
                        for the segments.
                        Use :py:obj:`None` for segment 0 to avoid replacing the
                        restart directory path in the base run description YAML file.
    :type restart_dir: :py:class:`pathlib.Path` or None

    :param segment_namrun: File path and name of namelist section file containing
                           namrun for the segment.
    :type segment_namrun: :py:class:`pathlib.Path`

    :param tmp_run_desc_dir: Temporary directory where the namelists and run description
                             files for segments are stored.
    :type tmp_run_desc_dir: :py:class:`pathlib.Path`

    :return: 2-tuple containing the run description dict updated with the
             namrun namelist section and restart file path(s),
             and the file path and name of the temporary run description
             file for the segment.
    :rtype: 2-tuple
    """
    # namrun namelist for segment
    namelist_namrun = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namrun"))
    namelist_namrun_index = run_desc["namelists"]["namelist_cfg"].index(
        namelist_namrun)
    run_desc["namelists"]["namelist_cfg"][namelist_namrun_index] = os.fspath(
        segment_namrun)
    # restart file(s) for segment
    if restart_dir is not None:
        nml = f90nml.read(segment_namrun)
        restart_timestep = nml["namrun"]["nn_it000"] - 1
        for name, path in get_run_desc_value(run_desc, ("restart", )).items():
            path = Path(path)
            name_head = path.name.split("_")[0]
            name_tail = path.name.split("_", 2)[-1]
            restart_path = (
                restart_dir /
                "{name_head}_{restart_timestep:08d}_{name_tail}".format(
                    name_head=name_head,
                    restart_timestep=restart_timestep,
                    name_tail=name_tail,
                ))
            run_desc["restart"][name] = os.fspath(restart_path)
    # walltime for segment
    segment_walltime = get_run_desc_value(
        run_desc, ("segmented run", "segment walltime"))
    run_desc["walltime"] = segment_walltime
    # write temporary run description file for segment
    with (tmp_run_desc_dir / desc_file).open("wt") as f:
        yaml.safe_dump(run_desc, f, default_flow_style=False)
    return run_desc, tmp_run_desc_dir / desc_file
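The restart path rewriting above can be illustrated with a small sketch; the file name, directory, and time step below are all hypothetical:

from pathlib import Path

path = Path("results/SalishSea_00002160_restart.nc")
restart_dir = Path("restarts")
nn_it000 = 6481                           # from the segment's namrun namelist
restart_timestep = nn_it000 - 1           # 6480
name_head = path.name.split("_")[0]       # "SalishSea"
name_tail = path.name.split("_", 2)[-1]   # "restart.nc"
restart_path = restart_dir / "{name_head}_{restart_timestep:08d}_{name_tail}".format(
    name_head=name_head,
    restart_timestep=restart_timestep,
    name_tail=name_tail,
)
print(restart_path)  # restarts/SalishSea_00006480_restart.nc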
Example #2
def _calc_n_segments(run_desc):
    """Return the number of segments in the segmented run described by run_desc,
    calculated from its start date, end date, and days per segment values.
    """
    run_start_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "start date")))
    run_end_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "end date")))
    days_per_segment = get_run_desc_value(
        run_desc, ("segmented run", "days per segment"))

    n_segments_delta = (run_end_date.shift(days=+1) -
                        run_start_date) / days_per_segment
    n_segments = n_segments_delta.days + math.ceil(n_segments_delta.seconds /
                                                   (60 * 60 * 24))
    return n_segments
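For intuition, here is the segment count arithmetic on hypothetical dates: a 10 day run (the end date is inclusive, hence the +1 day shift) split into 3 day segments needs 4 segments:

import math

import arrow

run_start_date = arrow.get("2020-01-01")
run_end_date = arrow.get("2020-01-10")
days_per_segment = 3

n_segments_delta = (run_end_date.shift(days=+1) - run_start_date) / days_per_segment
# n_segments_delta == timedelta(days=3, seconds=28800): 3 full days plus 8 hours
n_segments = n_segments_delta.days + math.ceil(n_segments_delta.seconds / (60 * 60 * 24))
print(n_segments)  # 4 -> segments of 3, 3, 3, and 1 days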
Example #3
def _definitions(run_desc, run_desc_file, run_dir, results_dir, deflate):
    """Return the shell variable definitions section of the run script."""
    salishsea_cmd = {
        "beluga": Path("${HOME}", ".local", "bin", "salishsea"),
        "cedar": Path("${HOME}", ".local", "bin", "salishsea"),
        "delta": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "graham": Path("${HOME}", ".local", "bin", "salishsea"),
        "omega": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "orcinus": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "sigma": Path("${PBS_O_HOME}", "bin", "salishsea"),
        "salish": Path("${HOME}", ".local", "bin", "salishsea"),
        "seawolf1": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "seawolf2": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "seawolf3": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
        "sockeye": Path("${PBS_O_HOME}", ".local", "bin", "salishsea"),
    }.get(SYSTEM, Path("${HOME}", ".local", "bin", "salishsea"))
    defns = ('RUN_ID="{run_id}"\n'
             'RUN_DESC="{run_dir}/{run_desc_file}"\n'
             'WORK_DIR="{run_dir}"\n'
             'RESULTS_DIR="{results_dir}"\n'
             'COMBINE="{salishsea_cmd} combine"\n').format(
                 run_id=get_run_desc_value(run_desc, ("run_id", )),
                 run_desc_file=run_desc_file.name,
                 run_dir=run_dir,
                 results_dir=results_dir,
                 salishsea_cmd=salishsea_cmd,
             )
    if deflate:
        defns += 'DEFLATE="{salishsea_cmd} deflate"\n'.format(
            salishsea_cmd=salishsea_cmd)
    defns += 'GATHER="{salishsea_cmd} gather"\n'.format(
        salishsea_cmd=salishsea_cmd)
    return defns
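For reference, a minimal sketch of the definitions block that the function above returns; the run id and paths are made up, and the DEFLATE line appears only when deflate is True:

defns = ('RUN_ID="example_run"\n'
         'RUN_DESC="/tmp/example_run_dir/example.yaml"\n'
         'WORK_DIR="/tmp/example_run_dir"\n'
         'RESULTS_DIR="/results/example"\n'
         'COMBINE="${HOME}/.local/bin/salishsea combine"\n'
         'DEFLATE="${HOME}/.local/bin/salishsea deflate"\n'
         'GATHER="${HOME}/.local/bin/salishsea gather"\n')
print(defns)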
Example #4
def _build_tmp_run_dir(
    run_desc,
    desc_file,
    results_dir,
    cedar_broadwell,
    deflate,
    max_deflate_jobs,
    separate_deflate,
    nocheck_init,
    quiet,
):
    """Create the temporary run directory, write the batch script into it,
    and, for a separate-deflate run, write the deflate job scripts into it.
    """
    run_dir = api.prepare(desc_file, nocheck_init)
    if not quiet:
        log.info("Created run directory {}".format(run_dir))
    nemo_processors = get_n_processors(run_desc, run_dir)
    separate_xios_server = get_run_desc_value(
        run_desc, ("output", "separate XIOS server"))
    if separate_xios_server:
        xios_processors = get_run_desc_value(run_desc,
                                             ("output", "XIOS servers"))
    else:
        xios_processors = 0
    batch_script = _build_batch_script(
        run_desc,
        desc_file,
        nemo_processors,
        xios_processors,
        max_deflate_jobs,
        results_dir,
        run_dir,
        deflate,
        separate_deflate,
        cedar_broadwell,
    )
    batch_file = run_dir / "SalishSeaNEMO.sh"
    with batch_file.open("wt") as f:
        f.write(batch_script)
    if separate_deflate:
        for deflate_job, pattern in SEPARATE_DEFLATE_JOBS.items():
            deflate_script = _build_deflate_script(run_desc, pattern,
                                                   deflate_job, results_dir)
            script_file = run_dir / "deflate_{}.sh".format(deflate_job)
            with script_file.open("wt") as f:
                f.write(deflate_script)
    return run_dir, batch_file
Example #5
def _calc_run_segments(desc_file, results_dir):
    """Return the list of (run description, YAML file name, results directory,
    namelist patch) tuples for the segments of a segmented run,
    and the first segment number.
    """
    run_desc = load_run_desc(desc_file)
    if "segmented run" not in run_desc:
        first_seg_no = 0
        return [(run_desc, desc_file, results_dir, {})], first_seg_no
    base_run_id = get_run_desc_value(run_desc, ("run_id", ))
    start_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "start date")))
    start_timestep = get_run_desc_value(run_desc,
                                        ("segmented run", "start time step"))
    end_date = arrow.get(
        get_run_desc_value(run_desc, ("segmented run", "end date")))
    days_per_segment = get_run_desc_value(
        run_desc, ("segmented run", "days per segment"))
    namelist_namdom = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namdom"), expand_path=True)
    rn_rdt = f90nml.read(namelist_namdom)["namdom"]["rn_rdt"]
    timesteps_per_day = 24 * 60 * 60 / rn_rdt
    n_segments = _calc_n_segments(run_desc)
    run_segments = []
    first_seg_no = get_run_desc_value(
        run_desc, ("segmented run", "first segment number"))
    for i, seg_no in enumerate(range(first_seg_no, first_seg_no + n_segments)):
        segment_run_id = "{seg_no}_{base_run_id}".format(
            seg_no=seg_no, base_run_id=base_run_id)
        segment_run_desc = copy.deepcopy(run_desc)
        segment_run_desc["run_id"] = segment_run_id
        nn_it000 = int(start_timestep +
                       i * days_per_segment * timesteps_per_day)
        date0 = min(start_date.shift(days=+i * days_per_segment), end_date)
        segment_days = min(
            days_per_segment,
            (end_date - start_date.shift(days=+i * days_per_segment)).days + 1,
        )
        nn_itend = int(nn_it000 + segment_days * timesteps_per_day - 1)
        run_segments.append((
            # Run description dict for the segment
            segment_run_desc,
            # Run description YAML file name for the segment
            "{file_stem}_{seg_no}{suffix}".format(file_stem=desc_file.stem,
                                                  seg_no=seg_no,
                                                  suffix=desc_file.suffix),
            # Results directory for the segment
            results_dir.parent / "{dir_name}_{seg_no}".format(
                dir_name=results_dir.name, seg_no=seg_no),
            # f90nml namelist patch for the segment for the namelist containing namrun
            {
                "namrun": {
                    "nn_it000": nn_it000,
                    "nn_itend": nn_itend,
                    "nn_date0": int(date0.format("YYYYMMDD")),
                }
            },
        ))
    return run_segments, first_seg_no
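A worked example of the segment boundary arithmetic above, with hypothetical values (rn_rdt = 40 s gives 2160 time steps per day; 3 days per segment, starting at time step 1; the final-segment truncation via segment_days is ignored here):

start_timestep = 1
days_per_segment = 3
timesteps_per_day = 24 * 60 * 60 / 40  # 2160.0

for i in range(2):
    nn_it000 = int(start_timestep + i * days_per_segment * timesteps_per_day)
    nn_itend = int(nn_it000 + days_per_segment * timesteps_per_day - 1)
    print(i, nn_it000, nn_itend)
# 0 1 6480
# 1 6481 12960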
Example #6
def _write_segment_namrun_namelist(run_desc, namelist_namrun_patch,
                                   tmp_run_desc_dir):
    """
    :param dict run_desc: Run description dictionary.

    :param dict namelist_namrun_patch: f90nml patch for namrun namelist for the segment.

    :param tmp_run_desc_dir: Temporary directory where the namelists and run description
                             files for segments are stored.
    :type tmp_run_desc_dir: :py:class:`pathlib.Path`

    :return: File path and name of namelist section file containing namrun namelist
             for the segment.
    :rtype: :py:class:`pathlib.Path`
    """
    namelist_namrun = get_run_desc_value(
        run_desc, ("segmented run", "namelists", "namrun"), expand_path=True)
    f90nml.patch(namelist_namrun, namelist_namrun_patch,
                 tmp_run_desc_dir / namelist_namrun.name)
    return tmp_run_desc_dir / namelist_namrun.name
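f90nml.patch(nml_path, nml_patch, out_path) reads the namelist file at nml_path, applies the values in nml_patch, and writes the patched namelist to out_path, leaving the other values untouched. A minimal sketch with a made-up namelist file and patch values:

import f90nml

with open("namelist.time", "wt") as f:
    f.write("&namrun\n    nn_it000 = 1\n    nn_itend = 2160\n/\n")

namelist_namrun_patch = {"namrun": {"nn_it000": 6481, "nn_itend": 12960}}
f90nml.patch("namelist.time", namelist_namrun_patch, "patched_namelist.time")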
Example #7
def _build_deflate_script(run_desc, pattern, result_type, results_dir):
    """Return the bash script that deflates one type of run results files
    as a separate serial job.
    """
    script = "#!/bin/bash\n"
    try:
        email = get_run_desc_value(run_desc, ("email", ))
    except KeyError:
        email = "{user}@eos.ubc.ca".format(user=os.getenv("USER"))
    pmem = "2500mb" if result_type == "ptrc" else "2000mb"
    script = "\n".join((
        script,
        "{pbs_directives}\n".format(pbs_directives=_pbs_directives(
            run_desc,
            1,
            email,
            results_dir,
            pmem=pmem,
            deflate=True,
            result_type=result_type,
        )),
    ))
    script += ('RESULTS_DIR="{results_dir}"\n'
               'DEFLATE="${{PBS_O_HOME}}/.local/bin/salishsea deflate"\n'
               "\n"
               "{modules}\n"
               "cd ${{RESULTS_DIR}}\n"
               'echo "Results deflation started at $(date)"\n'
               "${{DEFLATE}} {pattern} --jobs 1 --debug\n"
               "DEFLATE_EXIT_CODE=$?\n"
               'echo "Results deflation ended at $(date)"\n'
               "\n"
               "chmod g+rw ${{RESULTS_DIR}}/*\n"
               "chmod o+r ${{RESULTS_DIR}}/*\n"
               "\n"
               "exit ${{DEFLATE_EXIT_CODE}}\n").format(results_dir=results_dir,
                                                       modules=_modules(),
                                                       pattern=pattern)
    return script
Example #8
def _pbs_directives(
    run_desc,
    n_processors,
    email,
    results_dir,
    procs_per_node=0,
    pmem="2000mb",
    deflate=False,
    result_type="",
    stderr_stdout=True,
):
    """Return the PBS directives used to run NEMO on a cluster that uses the
    TORQUE resource manager for job scheduling.

    The string that is returned is intended for inclusion in a bash script
    that will be submitted to the cluster queue manager via the
    :command:`qsub` command.

    :param dict run_desc: Run description dictionary.

    :param int n_processors: Number of processors that the run will be
                             executed on; i.e. the sum of NEMO and XIOS processors.

    :param str email: Email address to send job begin, end & abort
                      notifications to.

    :param results_dir: Directory to store results into.
    :type results_dir: :py:class:`pathlib.Path`

    :param int procs_per_node: Number of processors per node.
                               Defaults to 0 to produce
                               :kbd:`#PBS -l procs=n_processors` directive.
                               Otherwise produces
                               :kbd:`#PBS -l nodes=n:ppn=procs_per_node` directive.

    :param str pmem: Memory per processor.

    :param boolean deflate: Return directives for a run results deflation job
                            when :py:obj:`True`.

    :param str result_type: Run result type ('grid', 'ptrc', or 'dia') for
                            deflation job.

    :param boolean stderr_stdout: When :py:obj:`False`, don't include directives
                                  to put stderr and stdout in results directory.
                                  Added for use in run scripts generated by
                                  :kbd:`run_NEMO` worker that do per-command
                                  redirection to stderr and stdout.

    :returns: PBS directives for run script.
    :rtype: Unicode str
    """
    run_id = get_run_desc_value(run_desc, ("run_id", ))
    if not procs_per_node:
        procs_directive = "#PBS -l procs={procs}".format(procs=n_processors)
    else:
        nodes = math.ceil(n_processors / procs_per_node)
        if SYSTEM == "sockeye":
            procs_directive = "#PBS -l select={nodes}:ncpus={procs_per_node}:mpiprocs={procs_per_node}:mem=186gb".format(
                nodes=nodes, procs_per_node=procs_per_node)
        else:
            procs_directive = "#PBS -l nodes={nodes}:ppn={procs_per_node}".format(
                nodes=nodes, procs_per_node=procs_per_node)
    if deflate:
        run_id = "{result_type}_{run_id}_deflate".format(
            run_id=run_id, result_type=result_type)
    try:
        td = datetime.timedelta(
            seconds=get_run_desc_value(run_desc, ("walltime", )))
    except TypeError:
        t = datetime.datetime.strptime(
            get_run_desc_value(run_desc, ("walltime", )), "%H:%M:%S").time()
        td = datetime.timedelta(hours=t.hour,
                                minutes=t.minute,
                                seconds=t.second)
    walltime = _td2hms(td)
    pbs_directives = textwrap.dedent("""\
        #PBS -N {run_id}
        #PBS -S /bin/bash
        #PBS -l walltime={walltime}
        # email when the job [b]egins and [e]nds, or is [a]borted
        #PBS -m bea
        #PBS -M {email}
        """).format(run_id=run_id, walltime=walltime, email=email)
    if SYSTEM == "sockeye":
        pbs_directives += textwrap.dedent("""\
            #PBS -A st-sallen1-1
            {procs_directive}
            """).format(procs_directive=procs_directive)
    else:
        if SYSTEM == "orcinus" or SYSTEM.startswith("seawolf"):
            pbs_directives += "#PBS -l partition=QDR\n"
        if SYSTEM == "salish":
            pbs_directives += textwrap.dedent("""\
                {procs_directive}
                # total memory for job
                #PBS -l mem=64gb
                """).format(procs_directive=procs_directive, pmem=pmem)
        else:
            pbs_directives += textwrap.dedent("""\
                {procs_directive}
                # memory per processor
                #PBS -l pmem={pmem}
                """).format(procs_directive=procs_directive, pmem=pmem)
    if stderr_stdout:
        stdout = ("stdout_deflate_{result_type}".format(
            result_type=result_type) if deflate else "stdout")
        stderr = ("stderr_deflate_{result_type}".format(
            result_type=result_type) if deflate else "stderr")
        pbs_directives += ("# stdout and stderr file paths/names\n"
                           "#PBS -o {results_dir}/{stdout}\n"
                           "#PBS -e {results_dir}/{stderr}\n").format(
                               results_dir=results_dir,
                               stdout=stdout,
                               stderr=stderr)
    return pbs_directives
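The try/except around the walltime lookup accepts either an integer number of seconds or an "HH:MM:SS" string in the run description YAML file; a sketch of that normalization in isolation (the values are hypothetical):

import datetime

for walltime in (12600, "03:30:00"):
    try:
        td = datetime.timedelta(seconds=walltime)
    except TypeError:
        # timedelta() rejects a str seconds value, so parse it as a time
        t = datetime.datetime.strptime(walltime, "%H:%M:%S").time()
        td = datetime.timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)
    print(td)  # 3:30:00 for both spellings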
Example #9
def _sbatch_directives(
    run_desc,
    n_processors,
    cedar_broadwell,
    email,
    results_dir,
    mem="0",
    deflate=False,
    result_type="",
):
    """Return the SBATCH directives used to run NEMO on a cluster that uses the
    Slurm Workload Manager for job scheduling.

    The string that is returned is intended for inclusion in a bash script
    that will be submitted to the cluster queue manager via the
    :command:`sbatch` command.

    :param dict run_desc: Run description dictionary.

    :param int n_processors: Number of processors that the run will be
                             executed on; the sum of NEMO and XIOS processors.

    :param boolean cedar_broadwell: Use broadwell (32 cores/node) nodes on
                                    cedar.

    :param str email: Email address to send job begin, end & abort
                      notifications to.

    :param results_dir: Directory to store results into.
    :type results_dir: :py:class:`pathlib.Path`

    :param str mem: Memory per node.

    :param boolean deflate: Return directives for a run results deflation job
                            when :py:obj:`True`.

    :param str result_type: Run result type ('grid', 'ptrc', or 'dia') for
                            deflation job.

    :returns: SBATCH directives for run script.
    :rtype: Unicode str
    """
    run_id = get_run_desc_value(run_desc, ("run_id", ))
    constraint = "broadwell" if SYSTEM == "cedar" and cedar_broadwell else "skylake"
    try:
        processors_per_node = {"beluga": 40, "cedar": 48, "graham": 32}[SYSTEM]
    except KeyError:
        log.error("unknown system: {system}".format(system=SYSTEM))
        raise SystemExit(2)
    if SYSTEM == "cedar" and cedar_broadwell:
        processors_per_node = 32
    nodes = math.ceil(n_processors / processors_per_node)
    mem = {"beluga": "92G", "cedar": "0", "graham": "0"}.get(SYSTEM, mem)
    if deflate:
        run_id = "{result_type}_{run_id}_deflate".format(
            run_id=run_id, result_type=result_type)
    try:
        td = datetime.timedelta(
            seconds=get_run_desc_value(run_desc, ("walltime", )))
    except TypeError:
        t = datetime.datetime.strptime(
            get_run_desc_value(run_desc, ("walltime", )), "%H:%M:%S").time()
        td = datetime.timedelta(hours=t.hour,
                                minutes=t.minute,
                                seconds=t.second)
    walltime = _td2hms(td)
    if SYSTEM == "cedar":
        sbatch_directives = ("#SBATCH --job-name={run_id}\n"
                             "#SBATCH --constraint={constraint}\n").format(
                                 run_id=run_id, constraint=constraint)
    else:
        sbatch_directives = "#SBATCH --job-name={run_id}\n".format(
            run_id=run_id)
    sbatch_directives += ("#SBATCH --nodes={nodes}\n"
                          "#SBATCH --ntasks-per-node={processors_per_node}\n"
                          "#SBATCH --mem={mem}\n"
                          "#SBATCH --time={walltime}\n"
                          "#SBATCH --mail-user={email}\n"
                          "#SBATCH --mail-type=ALL\n").format(
                              nodes=int(nodes),
                              processors_per_node=processors_per_node,
                              mem=mem,
                              walltime=walltime,
                              email=email,
                          )
    try:
        account = get_run_desc_value(run_desc, ("account", ), fatal=False)
        sbatch_directives += "#SBATCH --account={account}\n".format(
            account=account)
    except KeyError:
        account = "rrg-allen" if SYSTEM in {"beluga", "cedar"} else "def-allen"
        sbatch_directives += "#SBATCH --account={account}\n".format(
            account=account)
        log.info(
            ("No account found in run description YAML file, "
             "so assuming {account}. If sbatch complains you can specify a "
             "different account with a YAML line like account: def-allen"
             ).format(account=account))
    stdout = ("stdout_deflate_{result_type}".format(
        result_type=result_type) if deflate else "stdout")
    stderr = ("stderr_deflate_{result_type}".format(
        result_type=result_type) if deflate else "stderr")
    sbatch_directives += ("# stdout and stderr file paths/names\n"
                          "#SBATCH --output={stdout}\n"
                          "#SBATCH --error={stderr}\n").format(
                              stdout=results_dir / stdout,
                              stderr=results_dir / stderr)
    return sbatch_directives
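The node count is the processor total rounded up to whole nodes; for example (the processor counts are hypothetical):

import math

n_processors = 398 + 1    # hypothetical: 398 NEMO + 1 XIOS processors
processors_per_node = 32  # graham, per the table above
nodes = math.ceil(n_processors / processors_per_node)
print(nodes)  # 13 nodes, requested with --ntasks-per-node=32 and --mem=0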
Example #10
def _build_batch_script(
    run_desc,
    desc_file,
    nemo_processors,
    xios_processors,
    max_deflate_jobs,
    results_dir,
    run_dir,
    deflate,
    separate_deflate,
    cedar_broadwell,
):
    """Build the Bash script that will execute the run.

    :param dict run_desc: Run description dictionary.

    :param desc_file: File path/name of the YAML run description file.
    :type desc_file: :py:class:`pathlib.Path`

    :param int nemo_processors: Number of processors that NEMO will be executed
                                on.

    :param int xios_processors: Number of processors that XIOS will be executed
                                on.

    :param int max_deflate_jobs: Maximum number of concurrent sub-processes to
                                 use for netCDF deflating.

    :param results_dir: Path of the directory in which to store the run
                        results;
                        it will be created if it does not exist.
    :type results_dir: :py:class:`pathlib.Path`

    :param run_dir: Path of the temporary run directory.
    :type run_dir: :py:class:`pathlib.Path`

    :param boolean deflate: Include "salishsea deflate" command in the bash
                            script.

    :param boolean separate_deflate: Produce separate bash scripts to deflate
                                     the run results and qsub them to run as
                                     serial jobs after the NEMO run finishes.

    :param boolean cedar_broadwell: Use broadwell (32 cores/node) nodes on
                                    cedar.

    :returns: Bash script to execute the run.
    :rtype: str
    """
    script = "#!/bin/bash\n"
    try:
        email = get_run_desc_value(run_desc, ("email", ), fatal=False)
    except KeyError:
        email = "{user}@eoas.ubc.ca".format(user=os.getenv("USER"))
    if SYSTEM in {"beluga", "cedar", "graham"}:
        script = "\n".join((
            script,
            "{sbatch_directives}\n".format(
                sbatch_directives=_sbatch_directives(
                    run_desc,
                    nemo_processors + xios_processors,
                    cedar_broadwell,
                    email,
                    results_dir,
                )),
        ))
    else:
        procs_per_node = {
            "delta": 20,
            "omega": 20,
            "sigma": 20,
            "sockeye": 40,
            "orcinus": 12,
            "seawolf1": 12,
            "seawolf2": 12,
            "seawolf3": 12,
        }.get(SYSTEM, 0)
        script = "\n".join((
            script,
            "{pbs_directives}\n".format(pbs_directives=_pbs_directives(
                run_desc,
                nemo_processors + xios_processors,
                email,
                results_dir,
                procs_per_node,
            )),
        ))
    script = "\n".join((
        script,
        "{defns}\n"
        "{modules}\n"
        "{execute}\n"
        "{fix_permissions}\n"
        "{cleanup}".format(
            defns=_definitions(run_desc, desc_file, run_dir, results_dir,
                               deflate),
            modules=_modules(),
            execute=_execute(
                nemo_processors,
                xios_processors,
                deflate,
                max_deflate_jobs,
                separate_deflate,
            ),
            fix_permissions=_fix_permissions(),
            cleanup=_cleanup(),
        ),
    ))
    return script