def make_batch_script(self, input_template, job, case, outfile=None):
    """
    Render the batch-script template for *job* and write it as an executable file.

    input_template: path to the template file; must exist (fatal error otherwise).
    job: job name (e.g. "case.run"); also selects the task_count subgroup.
    case: Case object used to resolve task counts, job id, directives and mpirun.
    outfile: optional explicit output path; defaults to the standard batch
             script name for *job*.
    """
    expect(os.path.exists(input_template), "input file '{}' does not exist".format(input_template))

    task_count = self.get_value("task_count", subgroup=job)
    overrides = {}
    if task_count is not None:
        # Explicit per-job task count: derive node count from tasks-per-node.
        overrides["total_tasks"] = int(task_count)
        overrides["num_nodes"] = int(math.ceil(float(task_count)/float(case.tasks_per_node)))
    else:
        task_count = case.get_value("TOTALPES")*int(case.thread_count)

    if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"):
        overrides["max_tasks_per_node"] = int(task_count)

    overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1]
    if "pleiades" in case.get_value("MACH"):
        # pleiades jobname needs to be limited to 15 chars
        overrides["job_id"] = overrides["job_id"][:15]

    overrides["batchdirectives"] = self.get_batch_directives(case, job, overrides=overrides)
    overrides["mpirun"] = case.get_mpirun_cmd(job=job)

    # Use a context manager so the template file handle is always closed
    # (the original open(...).read() leaked the handle).
    with open(input_template, "r") as template_fd:
        output_text = transform_vars(template_fd.read(), case=case, subgroup=job, overrides=overrides)

    output_name = get_batch_script_for_job(job) if outfile is None else outfile
    logger.info("Creating file {}".format(output_name))
    with open(output_name, "w") as fd:
        fd.write(output_text)

    # make sure batch script is executable
    os.chmod(output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"): overrides["max_tasks_per_node"] = int(task_count) overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1] if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives( case, job, overrides=overrides) overrides["mpirun"] = case.get_mpirun_cmd(job=job) output_text = transform_vars(open(input_template, "r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job( job) if outfile is None else outfile logger.info("Creating file {}".format(output_name)) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod( output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == "none":
def _submit_single_job(self, case, job, dep_jobs=None, allow_fail=False,
                       no_batch=False, skip_pnl=False, mail_user=None,
                       mail_type=None, batch_args=None, dry_run=False,
                       resubmit_immediate=False):
    """
    Build the batch submit command for *job* and either run it or return it.

    dep_jobs: job ids this job depends on; encoded with the batch system's
              depend_string (or depend_allow_string when allow_fail is True).
    no_batch: run the job's python function directly instead of submitting.
    dry_run:  return the assembled submit command string without executing it.
    Returns the submitted job id (or the command string when dry_run).
    """
    if not dry_run:
        logger.warning("Submit job {}".format(job))
    batch_system = self.get_value("BATCH_SYSTEM", subgroup=None)
    if batch_system is None or batch_system == "none" or no_batch:
        # No batch system: invoke the case method named after the job directly.
        logger.info("Starting job script {}".format(job))
        function_name = job.replace(".", "_")
        if not dry_run:
            args = self._build_run_args(job, True, skip_pnl=skip_pnl,
                                        set_continue_run=resubmit_immediate,
                                        submit_resubmits=not resubmit_immediate)
            try:
                getattr(case, function_name)(**{k: v for k, (v, _) in args.items()})
            except Exception as e:
                # We don't want exception from the run phases getting into submit phase
                logger.warning("Exception from {}: {}".format(function_name, str(e)))
        return

    submitargs = self.get_submit_args(case, job)
    # BATCH_COMMAND_FLAGS, when set, replaces (not augments) computed args.
    args_override = self.get_value("BATCH_COMMAND_FLAGS", subgroup=job)
    if args_override:
        submitargs = args_override

    if dep_jobs is not None and len(dep_jobs) > 0:
        logger.debug("dependencies: {}".format(dep_jobs))
        if allow_fail:
            dep_string = self.get_value("depend_allow_string", subgroup=None)
            if dep_string is None:
                logger.warning("'depend_allow_string' is not defined for this batch system, " +
                               "falling back to the 'depend_string'")
                dep_string = self.get_value("depend_string", subgroup=None)
        else:
            dep_string = self.get_value("depend_string", subgroup=None)
        expect(dep_string is not None, "'depend_string' is not defined for this batch system")

        separator_string = self.get_value("depend_separator", subgroup=None)
        expect(separator_string is not None, "depend_separator string not defined")

        # dep_string contains the literal token "jobid" which is replaced by
        # the separator-joined list of prerequisite job ids.
        expect("jobid" in dep_string, "depend_string is missing jobid for prerequisite jobs")
        dep_ids_str = str(dep_jobs[0])
        for dep_id in dep_jobs[1:]:
            dep_ids_str += separator_string + str(dep_id)
        dep_string = dep_string.replace("jobid", dep_ids_str.strip())  # pylint: disable=maybe-no-member
        submitargs += " " + dep_string

    if batch_args is not None:
        submitargs += " " + batch_args

    cime_config = get_cime_config()

    # Mail user: explicit argument wins; otherwise fall back to CIME config.
    if mail_user is None and cime_config.has_option("main", "MAIL_USER"):
        mail_user = cime_config.get("main", "MAIL_USER")

    if mail_user is not None:
        mail_user_flag = self.get_value('batch_mail_flag', subgroup=None)
        if mail_user_flag is not None:
            submitargs += " " + mail_user_flag + " " + mail_user

    # Mail type resolution order: argument, create_test config (tests only),
    # main config, then the batch system default.
    if mail_type is None:
        if job == "case.test" and cime_config.has_option("create_test", "MAIL_TYPE"):
            mail_type = cime_config.get("create_test", "MAIL_TYPE")
        elif cime_config.has_option("main", "MAIL_TYPE"):
            mail_type = cime_config.get("main", "MAIL_TYPE")
        else:
            mail_type = self.get_value("batch_mail_default")

        if mail_type:
            mail_type = mail_type.split(",")  # pylint: disable=no-member

    if mail_type:
        mail_type_flag = self.get_value("batch_mail_type_flag", subgroup=None)
        if mail_type_flag is not None:
            mail_type_args = []
            for indv_type in mail_type:
                mail_type_arg = self.get_batch_mail_type(indv_type)
                mail_type_args.append(mail_type_arg)

            if mail_type_flag == "-m":
                # hacky, PBS-type systems pass multiple mail-types differently
                submitargs += " {} {}".format(mail_type_flag, "".join(mail_type_args))
            else:
                submitargs += " {} {}".format(mail_type_flag,
                                              " {} ".format(mail_type_flag).join(mail_type_args))

    batchsubmit = self.get_value("batch_submit", subgroup=None)
    expect(batchsubmit is not None,
           "Unable to determine the correct command for batch submission.")
    batchredirect = self.get_value("batch_redirect", subgroup=None)
    batch_env_flag = self.get_value("batch_env", subgroup=None)
    run_args = self._build_run_args_str(job, False, skip_pnl=skip_pnl,
                                        set_continue_run=resubmit_immediate,
                                        submit_resubmits=not resubmit_immediate)
    # Run args go before the redirect when passed via the environment flag,
    # after the script name otherwise.
    if batch_env_flag:
        sequence = (batchsubmit, submitargs, run_args, batchredirect,
                    get_batch_script_for_job(job))
    else:
        sequence = (batchsubmit, submitargs, batchredirect,
                    get_batch_script_for_job(job), run_args)

    submitcmd = " ".join(s.strip() for s in sequence if s is not None)

    if dry_run:
        return submitcmd
    else:
        logger.info("Submitting job script {}".format(submitcmd))
        output = run_cmd_no_fail(submitcmd, combine_output=True)
        jobid = self.get_job_id(output)
        logger.info("Submitted job id is {}".format(jobid))
        return jobid
def _case_build_impl(caseroot, case, sharedlib_only, model_only, buildlist,
                     save_build_provenance):
###############################################################################
    """
    Drive the full case build: checks, environment setup, shared libraries,
    and the component model build.

    sharedlib_only / model_only: mutually exclusive build-phase selectors.
    buildlist: optional subset of components to build; when set, the build
               is never marked complete.
    Returns True on success (failures abort via expect()).
    """
    t1 = time.time()

    expect(not (sharedlib_only and model_only),
           "Contradiction: both sharedlib_only and model_only")
    logger.info("Building case in directory {}".format(caseroot))
    logger.info("sharedlib_only is {}".format(sharedlib_only))
    logger.info("model_only is {}".format(model_only))

    expect(os.path.isdir(caseroot), "'{}' is not a valid directory".format(caseroot))
    os.chdir(caseroot)

    # case.setup must have produced the primary job's batch script already.
    expect(os.path.exists(get_batch_script_for_job(case.get_primary_job())),
           "ERROR: must invoke case.setup script before calling build script ")

    cimeroot = case.get_value("CIMEROOT")

    comp_classes = case.get_values("COMP_CLASSES")

    case.check_lockedfiles(skip="env_batch")

    # Retrieve relevant case data
    # This environment variable gets set for cesm Make and
    # needs to be unset before building again.
    if "MODEL" in os.environ:
        del os.environ["MODEL"]
    build_threaded = case.get_build_threaded()
    casetools = case.get_value("CASETOOLS")
    exeroot = os.path.abspath(case.get_value("EXEROOT"))
    incroot = os.path.abspath(case.get_value("INCROOT"))
    libroot = os.path.abspath(case.get_value("LIBROOT"))
    sharedlibroot = os.path.abspath(case.get_value("SHAREDLIBROOT"))
    multi_driver = case.get_value("MULTI_DRIVER")
    complist = []
    ninst = 1
    # Assemble (class, component, threads, ninst, config_dir) per component.
    for comp_class in comp_classes:
        if comp_class == "CPL":
            config_dir = None
            if multi_driver:
                ninst = case.get_value("NINST_MAX")
        else:
            config_dir = os.path.dirname(case.get_value("CONFIG_{}_FILE".format(comp_class)))
            if multi_driver:
                ninst = 1
            else:
                ninst = case.get_value("NINST_{}".format(comp_class))

        comp = case.get_value("COMP_{}".format(comp_class))
        thrds = case.get_value("NTHRDS_{}".format(comp_class))
        expect(ninst is not None, "Failed to get ninst for comp_class {}".format(comp_class))
        complist.append((comp_class.lower(), comp, thrds, ninst, config_dir))
        os.environ["COMP_{}".format(comp_class)] = comp

    ocn_submodel = case.get_value("OCN_SUBMODEL")
    profile_papi_enable = case.get_value("PROFILE_PAPI_ENABLE")
    compiler = case.get_value("COMPILER")
    comp_interface = case.get_value("COMP_INTERFACE")
    mpilib = case.get_value("MPILIB")
    use_esmf_lib = case.get_value("USE_ESMF_LIB")
    debug = case.get_value("DEBUG")
    ninst_build = case.get_value("NINST_BUILD")
    smp_value = case.get_value("SMP_VALUE")
    clm_use_petsc = case.get_value("CLM_USE_PETSC")
    cism_use_trilinos = case.get_value("CISM_USE_TRILINOS")
    mali_use_albany = case.get_value("MALI_USE_ALBANY")
    use_moab = case.get_value("USE_MOAB")
    clm_config_opts = case.get_value("CLM_CONFIG_OPTS")
    cam_config_opts = case.get_value("CAM_CONFIG_OPTS")
    pio_config_opts = case.get_value("PIO_CONFIG_OPTS")
    ninst_value = case.get_value("NINST_VALUE")
    mach = case.get_value("MACH")
    os_ = case.get_value("OS")

    # Load some params into env (consumed by component build scripts/Makefiles)
    os.environ["CIMEROOT"] = cimeroot
    os.environ["CASETOOLS"] = casetools
    os.environ["EXEROOT"] = exeroot
    os.environ["INCROOT"] = incroot
    os.environ["LIBROOT"] = libroot
    os.environ["SHAREDLIBROOT"] = sharedlibroot
    os.environ["CASEROOT"] = caseroot
    os.environ["COMPILER"] = compiler
    os.environ["COMP_INTERFACE"] = comp_interface
    os.environ["NINST_VALUE"] = str(ninst_value)
    os.environ["BUILD_THREADED"] = stringify_bool(build_threaded)
    os.environ["MACH"] = mach
    os.environ["USE_ESMF_LIB"] = stringify_bool(use_esmf_lib)
    os.environ["MPILIB"] = mpilib
    os.environ["DEBUG"] = stringify_bool(debug)
    os.environ["OS"] = os_
    os.environ["CLM_CONFIG_OPTS"] = clm_config_opts if clm_config_opts is not None else ""
    os.environ["CAM_CONFIG_OPTS"] = cam_config_opts if cam_config_opts is not None else ""
    os.environ["PIO_CONFIG_OPTS"] = pio_config_opts if pio_config_opts is not None else ""
    os.environ["OCN_SUBMODEL"] = ocn_submodel if ocn_submodel is not None else ""
    os.environ["PROFILE_PAPI_ENABLE"] = stringify_bool(profile_papi_enable)
    os.environ["CLM_USE_PETSC"] = stringify_bool(clm_use_petsc)
    os.environ["CISM_USE_TRILINOS"] = stringify_bool(cism_use_trilinos)
    os.environ["MALI_USE_ALBANY"] = stringify_bool(mali_use_albany)
    os.environ["USE_MOAB"] = stringify_bool(use_moab)

    if get_model() == "e3sm" and mach == "titan" and compiler == "pgiacc":
        case.set_value("CAM_TARGET", "preqx_acc")

    # This is a timestamp for the build , not the same as the testid,
    # and this case may not be a test anyway. For a production
    # experiment there may be many builds of the same case.
    lid = get_timestamp("%y%m%d-%H%M%S")
    os.environ["LID"] = lid

    # Set the overall USE_PETSC variable to TRUE if any of the
    # *_USE_PETSC variables are TRUE.
    # For now, there is just the one CLM_USE_PETSC variable, but in
    # the future there may be others -- so USE_PETSC will be true if
    # ANY of those are true.
    use_petsc = clm_use_petsc
    case.set_value("USE_PETSC", use_petsc)
    os.environ["USE_PETSC"] = stringify_bool(use_petsc)

    # Set the overall USE_TRILINOS variable to TRUE if any of the
    # *_USE_TRILINOS variables are TRUE.
    # For now, there is just the one CISM_USE_TRILINOS variable, but in
    # the future there may be others -- so USE_TRILINOS will be true if
    # ANY of those are true.
    use_trilinos = False if cism_use_trilinos is None else cism_use_trilinos
    case.set_value("USE_TRILINOS", use_trilinos)
    os.environ["USE_TRILINOS"] = stringify_bool(use_trilinos)

    # Set the overall USE_ALBANY variable to TRUE if any of the
    # *_USE_ALBANY variables are TRUE.
    # For now, there is just the one MALI_USE_ALBANY variable, but in
    # the future there may be others -- so USE_ALBANY will be true if
    # ANY of those are true.
    use_albany = stringify_bool(mali_use_albany)
    case.set_value("USE_ALBANY", use_albany)
    os.environ["USE_ALBANY"] = use_albany

    # Load modules
    case.load_env()

    sharedpath = _build_checks(case, build_threaded, comp_interface,
                               use_esmf_lib, debug, compiler, mpilib,
                               complist, ninst_build, smp_value,
                               model_only, buildlist)

    t2 = time.time()
    logs = []

    if not model_only:
        logs = _build_libraries(case, exeroot, sharedpath, caseroot,
                                cimeroot, libroot, lid, compiler,
                                buildlist, comp_interface)

    if not sharedlib_only:
        os.environ["INSTALL_SHAREDPATH"] = os.path.join(exeroot, sharedpath)  # for MPAS makefile generators
        logs.extend(_build_model(build_threaded, exeroot, clm_config_opts,
                                 incroot, complist, lid, caseroot, cimeroot,
                                 compiler, buildlist, comp_interface))

        if not buildlist:
            # in case component build scripts updated the xml files, update the case object
            case.read_xml()

    # Note, doing buildlists will never result in the system thinking the build is complete
    post_build(case, logs, build_complete=not (buildlist or sharedlib_only),
               save_build_provenance=save_build_provenance)

    t3 = time.time()

    if not sharedlib_only:
        logger.info("Time spent not building: {:f} sec".format(t2 - t1))
        logger.info("Time spent building: {:f} sec".format(t3 - t2))
        logger.info("MODEL BUILD HAS FINISHED SUCCESSFULLY")

    return True
ext = os.path.splitext(job)[-1] if len(ext) == 0: ext = job if ext.startswith('.'): ext = ext[1:] overrides["job_id"] = ext + '.' + case.get_value("CASE") if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives( case, job, overrides=overrides) output_text = transform_vars(open(input_template, "r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job( job) if outfile is None else outfile logger.info("Creating file {}".format(output_name)) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod( output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == "none":
task_count = case.get_value("TOTALPES") * int(case.thread_count) if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"): overrides["max_tasks_per_node"] = int(task_count) overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1] if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives( case, job, overrides=overrides) output_text = transform_vars(open(input_template, "r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job(job) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod( output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == 'none':
if threaded and case.total_tasks * case.thread_count > cost_per_node: smt_factor = max(1.0,int(case.get_value("MAX_TASKS_PER_NODE") / cost_per_node)) case.set_value("TOTALPES", int(case.total_tasks * max(1.0,float(case.thread_count) / smt_factor))) else: case.set_value("TOTALPES", case.total_tasks*case.thread_count) # May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue) env_batch = case.get_env("batch") env_batch.set_job_defaults([(case.get_primary_job(), {})], case) # create batch files env_batch.make_all_batch_files(case) if get_model() == "e3sm" and not case.get_value("TEST"): input_batch_script = os.path.join(case.get_value("MACHDIR"), "template.case.run.sh") env_batch.make_batch_script(input_batch_script, "case.run", case, outfile=get_batch_script_for_job("case.run.sh")) # Make a copy of env_mach_pes.xml in order to be able # to check that it does not change once case.setup is invoked case.flush() logger.debug("at copy TOTALPES = {}".format(case.get_value("TOTALPES"))) lock_file("env_mach_pes.xml") lock_file("env_batch.xml") # Create user_nl files for the required number of instances if not os.path.exists("user_nl_cpl"): logger.info("Creating user_nl_xxx files for components and cpl") # loop over models for model in models: comp = case.get_value("COMP_{}".format(model))
# hacky, PBS-type systems pass multiple mail-types differently submitargs += " {} {}".format(mail_type_flag, "".join(mail_type_args)) else: submitargs += " {} {}".format( mail_type_flag, " {} ".format(mail_type_flag).join(mail_type_args)) batchsubmit = self.get_value("batch_submit", subgroup=None) expect( batchsubmit is not None, "Unable to determine the correct command for batch submission.") batchredirect = self.get_value("batch_redirect", subgroup=None) submitcmd = '' for string in (batchsubmit, submitargs, batchredirect, get_batch_script_for_job(job)): if string is not None: submitcmd += string + " " if job == 'case.run' and skip_pnl: batch_env_flag = self.get_value("batch_env", subgroup=None) if not batch_env_flag: submitcmd += " --skip-preview-namelist" else: submitcmd += " {} ARGS_FOR_SCRIPT='--skip-preview-namelist'".format( batch_env_flag) if dry_run: return submitcmd else: logger.info("Submitting job script {}".format(submitcmd))
class EnvBatch(EnvBase):
    """Interface to the case's env_batch.xml file (batch-system settings and jobs)."""

    def __init__(self, case_root=None, infile="env_batch.xml"):
        """
        initialize an object interface to file env_batch.xml in the case directory
        """
        self._batchtype = None
        # This arbitrary setting should always be overwritten
        self._default_walltime = "00:20:00"
        schema = os.path.join(get_cime_root(), "config", "xml_schemas", "env_batch.xsd")
        EnvBase.__init__(self, case_root, infile, schema=schema)

    # pylint: disable=arguments-differ
    def set_value(self, item, value, subgroup=None, ignore_type=False):
        """
        Override the entry_id set_value function with some special cases for this class
        """
        val = None
        if item == "JOB_WALLCLOCK_TIME":
            # Most systems use %H:%M:%S format for wallclock but LSF
            # uses %H:%M this code corrects the value passed in to be
            # the correct format - if we find we have more exceptions
            # than this we may need to generalize this further
            walltime_format = self.get_value("walltime_format", subgroup=None)
            if walltime_format is not None and walltime_format.count(":") != value.count(":"):  # pylint: disable=maybe-no-member
                if value.count(":") == 1:
                    t_spec = "%H:%M"
                elif value.count(":") == 2:
                    t_spec = "%H:%M:%S"
                else:
                    expect(False,
                           "could not interpret format for wallclock time {}".format(value))
                value = format_time(walltime_format, t_spec, value)

        # allow the user to set item for all jobs if subgroup is not provided
        if subgroup is None:
            nodes = self.get_children("entry", {"id": item})
            for node in nodes:
                self._set_value(node, value, vid=item, ignore_type=ignore_type)
                val = value
        else:
            group = self.get_optional_child("group", {"id": subgroup})
            if group is not None:
                node = self.get_optional_child("entry", {"id": item}, root=group)
                if node is not None:
                    val = self._set_value(node, value, vid=item, ignore_type=ignore_type)

        return val

    # pylint: disable=arguments-differ
    def get_value(self, item, attribute=None, resolved=True, subgroup="case.run"):
        """
        Must default subgroup to something in order to provide single return value
        """
        value = None
        if subgroup is None:
            # Look the item up as a direct child first; fall back to EnvBase.
            node = self.get_optional_child(item, attribute)
            if node is not None:
                value = self.text(node)
                if resolved:
                    value = self.get_resolved_value(value)
            else:
                value = EnvBase.get_value(self, item, attribute, resolved)
        else:
            value = EnvBase.get_value(self, item, attribute=attribute,
                                      resolved=resolved, subgroup=subgroup)

        return value

    def get_type_info(self, vid):
        """Return the (consistent) type of entry *vid* across all job groups."""
        nodes = self.get_children("entry", {"id": vid})
        type_info = None
        for node in nodes:
            new_type_info = self._get_type_info(node)
            if type_info is None:
                type_info = new_type_info
            else:
                expect(type_info == new_type_info,
                       "Inconsistent type_info for entry id={} {} {}".format(
                           vid, new_type_info, type_info))
        return type_info

    def get_jobs(self):
        """Return the ids of all job groups (excluding the bookkeeping groups)."""
        groups = self.get_children("group")
        results = []
        for group in groups:
            if self.get(group, "id") not in ["job_submission", "config_batch"]:
                results.append(self.get(group, "id"))
        return results

    def create_job_groups(self, batch_jobs):
        """Replace the job_submission group with one group per batch job."""
        # Subtle: in order to support dynamic batch jobs, we need to remove the
        # job_submission group and replace with job-based groups
        orig_group = self.get_child(
            "group", {"id": "job_submission"},
            err_msg="Looks like job groups have already been created")
        orig_group_children = EnvBase.get_children(self, root=orig_group, no_validate=True)
        childnodes = []
        for child in reversed(orig_group_children):
            childnodes.append(self.copy(child))

        self.remove_child(orig_group)

        for name, jdict in batch_jobs:
            new_job_group = self.make_child("group", {"id": name})
            for field in jdict.keys():
                val = jdict[field]
                node = self.make_child("entry", {"id": field, "value": val},
                                       root=new_job_group)
                self.make_child("type", root=node, text="char")
            # Each new job group gets a copy of the original group's children.
            for child in childnodes:
                self.add_child(child, root=new_job_group)

    def cleanupnode(self, node):
        """Strip batch_system nodes of file/group/values children; defer otherwise."""
        if self.get(node, "id") == "batch_system":
            fnode = self.get_child(name="file", root=node)
            self.remove_child(fnode, root=node)
            gnode = self.get_child(name="group", root=node)
            self.remove_child(gnode, root=node)
            vnode = self.get_optional_child(name="values", root=node)
            if vnode is not None:
                self.remove_child(vnode, root=node)
        else:
            node = EnvBase.cleanupnode(self, node)
        return node

    def set_batch_system(self, batchobj, batch_system_type=None):
        """Copy *batchobj*'s batch-system and machine nodes into this file."""
        if batch_system_type is not None:
            self.set_batch_system_type(batch_system_type)

        if batchobj.batch_system_node is not None and batchobj.machine_node is not None:
            # Machine-level settings override batch-system-level ones
            # (except "directives", which are preserved on both).
            for node in batchobj.get_children(root=batchobj.machine_node, no_validate=True):
                oldnode = batchobj.get_optional_child(self.name(node),
                                                      root=batchobj.batch_system_node)
                if oldnode is not None and self.name(oldnode) != "directives":
                    logger.debug("Replacing {}".format(self.name(oldnode)))
                    batchobj.remove_child(oldnode, root=batchobj.batch_system_node)

        if batchobj.batch_system_node is not None:
            self.add_child(self.copy(batchobj.batch_system_node))
        if batchobj.machine_node is not None:
            self.add_child(self.copy(batchobj.machine_node))

    def make_batch_script(self, input_template, job, case):
        """Render the batch-script template for *job* and write it executable."""
        expect(os.path.exists(input_template),
               "input file '{}' does not exist".format(input_template))

        task_count = self.get_value("task_count", subgroup=job)
        overrides = {}
        if task_count is not None:
            overrides["total_tasks"] = int(task_count)
            overrides["num_nodes"] = int(math.ceil(float(task_count) / float(case.tasks_per_node)))

        overrides["pedocumentation"] = ""  # TODO?
        overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1]
        if "pleiades" in case.get_value("MACH"):
            # pleiades jobname needs to be limited to 15 chars
            overrides["job_id"] = overrides["job_id"][:15]

        overrides["batchdirectives"] = self.get_batch_directives(case, job, overrides=overrides)

        output_text = transform_vars(open(input_template, "r").read(),
                                     case=case, subgroup=job, overrides=overrides)
        output_name = get_batch_script_for_job(job)
        with open(output_name, "w") as fd:
            fd.write(output_text)
        # make the generated batch script executable
        os.chmod(output_name,
                 os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
############################################################################### t1 = time.time() expect(not (sharedlib_only and model_only), "Contradiction: both sharedlib_only and model_only") logger.info("Building case in directory {}".format(caseroot)) logger.info("sharedlib_only is {}".format(sharedlib_only)) logger.info("model_only is {}".format(model_only)) expect(os.path.isdir(caseroot), "'{}' is not a valid directory".format(caseroot)) os.chdir(caseroot) expect( os.path.exists(get_batch_script_for_job("case.run")), "ERROR: must invoke case.setup script before calling build script ") cimeroot = case.get_value("CIMEROOT") comp_classes = case.get_values("COMP_CLASSES") check_lockedfiles(case) # Retrieve relevant case data # This environment variable gets set for cesm Make and # needs to be unset before building again. if "MODEL" in os.environ: del os.environ["MODEL"] build_threaded = case.get_build_threaded() casetools = case.get_value("CASETOOLS")
if task_count is not None: overrides["total_tasks"] = int(task_count) overrides["num_nodes"] = int(math.ceil(float(task_count)/float(case.tasks_per_node))) else: task_count = case.get_value("TOTALPES")*int(case.thread_count) if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"): overrides["max_tasks_per_node"] = int(task_count) overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1] if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives(case, job, overrides=overrides) output_text = transform_vars(open(input_template,"r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job(job) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod(output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == 'none': return for job, jsect in batch_jobs:
def _submit_single_job(self, case, job, dep_jobs=None, no_batch=False,
                       skip_pnl=False, mail_user=None, mail_type=None,
                       batch_args=None, dry_run=False):
    """
    Build the batch submit command for *job* and either run it or return it.

    dep_jobs: prerequisite job ids, encoded via the batch system's depend_string.
    no_batch: call the job's case method directly instead of submitting.
    skip_pnl: pass --skip-preview-namelist through to case.run / case.test.
    dry_run:  return the assembled submit command string without executing it.
    Returns the submitted job id (or the command string when dry_run).
    """
    if not dry_run:
        logger.warning("Submit job {}".format(job))
    batch_system = self.get_value("BATCH_SYSTEM", subgroup=None)
    if batch_system is None or batch_system == "none" or no_batch:
        # No batch system: invoke the case method named after the job directly.
        logger.info("Starting job script {}".format(job))
        function_name = job.replace(".", "_")
        if not dry_run:
            # archive jobs do not accept skip_pnl
            if "archive" not in function_name:
                getattr(case, function_name)(skip_pnl=skip_pnl)
            else:
                getattr(case, function_name)()
        return

    submitargs = self.get_submit_args(case, job)
    # BATCH_COMMAND_FLAGS, when set, replaces (not augments) computed args.
    args_override = self.get_value("BATCH_COMMAND_FLAGS", subgroup=job)
    if args_override:
        submitargs = args_override

    if dep_jobs is not None and len(dep_jobs) > 0:
        logger.info("dependencies: {}".format(dep_jobs))
        dep_string = self.get_value("depend_string", subgroup=None)
        separator_string = self.get_value("depend_separator", subgroup=None)
        expect(separator_string is not None, "depend_separator string not defined")
        # dep_string contains the literal token "jobid" which is replaced by
        # the separator-joined list of prerequisite job ids.
        expect("jobid" in dep_string, "depend_string is missing jobid for prerequisite jobs")
        dep_ids_str = str(dep_jobs[0])
        for dep_id in dep_jobs[1:]:
            dep_ids_str += separator_string + str(dep_id)
        dep_string = dep_string.replace("jobid", dep_ids_str.strip())  # pylint: disable=maybe-no-member
        submitargs += " " + dep_string

    if batch_args is not None:
        submitargs += " " + batch_args

    cime_config = get_cime_config()

    # Mail user: explicit argument wins; otherwise fall back to CIME config.
    if mail_user is None and cime_config.has_option("main", "MAIL_USER"):
        mail_user = cime_config.get("main", "MAIL_USER")

    if mail_user is not None:
        mail_user_flag = self.get_value('batch_mail_flag', subgroup=None)
        if mail_user_flag is not None:
            submitargs += " " + mail_user_flag + " " + mail_user

    # Mail type resolution order: argument, create_test config (tests only),
    # main config, then the batch system default.
    if mail_type is None:
        if job == "case.test" and cime_config.has_option("create_test", "MAIL_TYPE"):
            mail_type = cime_config.get("create_test", "MAIL_TYPE")
        elif cime_config.has_option("main", "MAIL_TYPE"):
            mail_type = cime_config.get("main", "MAIL_TYPE")
        else:
            mail_type = self.get_value("batch_mail_default")

        if mail_type:
            mail_type = mail_type.split(",")  # pylint: disable=no-member

    if mail_type:
        mail_type_flag = self.get_value("batch_mail_type_flag", subgroup=None)
        if mail_type_flag is not None:
            mail_type_args = []
            for indv_type in mail_type:
                mail_type_arg = self.get_batch_mail_type(indv_type)
                mail_type_args.append(mail_type_arg)

            if mail_type_flag == "-m":
                # hacky, PBS-type systems pass multiple mail-types differently
                submitargs += " {} {}".format(mail_type_flag, "".join(mail_type_args))
            else:
                submitargs += " {} {}".format(mail_type_flag,
                                              " {} ".format(mail_type_flag).join(mail_type_args))

    batchsubmit = self.get_value("batch_submit", subgroup=None)
    expect(batchsubmit is not None,
           "Unable to determine the correct command for batch submission.")
    batchredirect = self.get_value("batch_redirect", subgroup=None)
    submitcmd = ''
    batch_env_flag = self.get_value("batch_env", subgroup=None)
    # "skip_pnl" is a placeholder token replaced below with the
    # --skip-preview-namelist argument in the appropriate position.
    if batch_env_flag:
        sequence = (batchsubmit, submitargs, "skip_pnl", batchredirect,
                    get_batch_script_for_job(job))
    else:
        sequence = (batchsubmit, submitargs, batchredirect,
                    get_batch_script_for_job(job), "skip_pnl")

    for string in sequence:
        if string == "skip_pnl":
            if job in ['case.run', 'case.test'] and skip_pnl:
                batch_env_flag = self.get_value("batch_env", subgroup=None)
                if not batch_env_flag:
                    submitcmd += " --skip-preview-namelist "
                else:
                    submitcmd += " {} ARGS_FOR_SCRIPT='--skip-preview-namelist' ".format(batch_env_flag)
        elif string is not None:
            submitcmd += string + " "

    if dry_run:
        return submitcmd
    else:
        logger.info("Submitting job script {}".format(submitcmd))
        output = run_cmd_no_fail(submitcmd, combine_output=True)
        jobid = self.get_job_id(output)
        logger.info("Submitted job id is {}".format(jobid))
        return jobid
caseroot, clean=False, test_mode=False, reset=False): ############################################################################### os.chdir(caseroot) # Check that $DIN_LOC_ROOT exists - and abort if not a namelist compare tests din_loc_root = case.get_value("DIN_LOC_ROOT") testcase = case.get_value("TESTCASE") expect(not (not os.path.isdir(din_loc_root) and testcase != "SBN"), "inputdata root is not a directory: {}".format(din_loc_root)) # Remove batch scripts if reset or clean: case_run, case_test = get_batch_script_for_job( "case.run"), get_batch_script_for_job("case.test") if os.path.exists(case_run): os.remove(case_run) if not test_mode: # rebuild the models (even on restart) case.set_value("BUILD_COMPLETE", False) # backup and then clean test script if os.path.exists(case_test): os.remove(case_test) logger.info( "Successfully cleaned test script {}".format(case_test)) logger.info("Successfully cleaned batch script case.run")
overrides["total_tasks"] = int(task_count) overrides["num_nodes"] = int(math.ceil(float(task_count)/float(case.tasks_per_node))) else: task_count = case.get_value("TOTALPES")*int(case.thread_count) if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"): overrides["max_tasks_per_node"] = int(task_count) overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1] if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives(case, job, overrides=overrides) overrides["mpirun"] = case.get_mpirun_cmd(job=job) output_text = transform_vars(open(input_template,"r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job(job) if outfile is None else outfile logger.info("Creating file {}".format(output_name)) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod(output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == "none": return known_jobs = self.get_jobs()
def make_batch_script(self, input_template, job, case, outfile=None):
    """
    Render the batch-script template for *job* and write it as an executable file.

    input_template: path to the template file; must exist (fatal error otherwise).
    job: job name; its extension (or the whole name) prefixes the job id.
    outfile: optional explicit output path; defaults to the standard batch
             script name for *job*.
    """
    expect(os.path.exists(input_template),
           "input file '{}' does not exist".format(input_template))
    overrides = self.get_job_overrides(job, case)
    # Derive a short job-id prefix from the job's extension (or the job
    # name itself when there is no extension), without the leading dot.
    ext = os.path.splitext(job)[-1]
    if len(ext) == 0:
        ext = job
    if ext.startswith('.'):
        ext = ext[1:]

    overrides["job_id"] = ext + '.' + case.get_value("CASE")
    if "pleiades" in case.get_value("MACH"):
        # pleiades jobname needs to be limited to 15 chars
        overrides["job_id"] = overrides["job_id"][:15]

    overrides["batchdirectives"] = self.get_batch_directives(case, job, overrides=overrides)
    output_text = transform_vars(open(input_template, "r").read(),
                                 case=case, subgroup=job, overrides=overrides)
    output_name = get_batch_script_for_job(job) if outfile is None else outfile
    logger.info("Creating file {}".format(output_name))
    with open(output_name, "w") as fd:
        fd.write(output_text)

    # make sure batch script is executable
    os.chmod(output_name,
             os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)

def set_job_defaults(self, batch_jobs, case):
    # Lazily determine the batch system type; nothing to default when "none".
    if self._batchtype is None:
        self._batchtype = self.get_batch_system_type()

    if self._batchtype == "none":
        return
    env_workflow = case.get_env('workflow')
    known_jobs = env_workflow.get_jobs()
def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False):
###############################################################################
    """Implement case.setup: configure machine/PE layout and create case files.

    Removes batch scripts on *reset*/*clean*; otherwise validates the PE/NINST
    layout, creates batch scripts, user_nl_* files, case directories, and
    records the machine-specific environment.
    """
    os.chdir(caseroot)

    # Check that $DIN_LOC_ROOT exists - and abort if not a namelist compare tests
    din_loc_root = case.get_value("DIN_LOC_ROOT")
    testcase = case.get_value("TESTCASE")
    expect(not (not os.path.isdir(din_loc_root) and testcase != "SBN"),
           "inputdata root is not a directory: {}".format(din_loc_root))

    # Remove batch scripts
    if reset or clean:
        # clean batch script
        batch_script = get_batch_script_for_job(case.get_primary_job())
        if os.path.exists(batch_script):
            os.remove(batch_script)
            logger.info("Successfully cleaned batch script {}".format(batch_script))

        if not test_mode:
            # rebuild the models (even on restart)
            case.set_value("BUILD_COMPLETE", False)

    if not clean:
        case.load_env()

        models = case.get_values("COMP_CLASSES")
        mach = case.get_value("MACH")
        compiler = case.get_value("COMPILER")
        debug = case.get_value("DEBUG")
        mpilib = case.get_value("MPILIB")
        sysos = case.get_value("OS")
        expect(mach is not None, "xml variable MACH is not set")

        # creates the Macros.make, Depends.compiler, Depends.machine, Depends.machine.compiler
        # and env_mach_specific.xml if they don't already exist.
        if not os.path.isfile("Macros.make") or not os.path.isfile("env_mach_specific.xml"):
            configure(Machines(machine=mach), caseroot, ["Makefile"],
                      compiler, mpilib, debug, sysos)

        # Set tasks to 1 if mpi-serial library
        if mpilib == "mpi-serial":
            for vid, value in case:
                if vid.startswith("NTASKS") and value != 1:
                    case.set_value(vid, 1)

        # Check ninst.
        # In CIME there can be multiple instances of each component model (an
        # ensemble) NINST is the instance of that component.
        multi_driver = case.get_value("MULTI_DRIVER")
        nthrds = 1
        for comp in models:
            ntasks = case.get_value("NTASKS_{}".format(comp))
            nthrds = max(nthrds, case.get_value("NTHRDS_{}".format(comp)))
            if comp == "CPL":
                continue
            ninst = case.get_value("NINST_{}".format(comp))
            if multi_driver:
                expect(case.get_value("NINST_LAYOUT_{}".format(comp)) == "concurrent",
                       "If multi_driver is TRUE, NINST_LAYOUT_{} must be concurrent".format(comp))
                case.set_value("NTASKS_PER_INST_{}".format(comp), ntasks)
            else:
                if ninst > ntasks:
                    if ntasks == 1:
                        # A single task is silently scaled up to one per instance.
                        case.set_value("NTASKS_{}".format(comp), ninst)
                        ntasks = ninst
                    else:
                        expect(False,
                               "NINST_{} value {:d} greater than NTASKS_{} {:d}".format(comp, ninst, comp, ntasks))
                case.set_value("NTASKS_PER_INST_{}".format(comp), int(ntasks / ninst))

        if nthrds > 1:
            case.set_value("BUILD_THREADED", True)

        if os.path.exists(get_batch_script_for_job(case.get_primary_job())):
            logger.info("Machine/Decomp/Pes configuration has already been done ...skipping")
            case.initialize_derived_attributes()
            case.set_value("SMP_PRESENT", case.get_build_threaded())
        else:
            case.check_pelayouts_require_rebuild(models)
            unlock_file("env_build.xml")
            unlock_file("env_batch.xml")

            case.flush()
            case.check_lockedfiles()
            case.initialize_derived_attributes()

            # NOTE(review): yellowstone is special-cased to 16 cores per node.
            cost_per_node = 16 if case.get_value("MACH") == "yellowstone" else case.get_value("MAX_MPITASKS_PER_NODE")
            case.set_value("COST_PES", case.num_nodes * cost_per_node)
            case.set_value("TOTALPES", case.total_tasks)
            case.set_value("SMP_PRESENT", case.get_build_threaded())

            # create batch files
            env_batch = case.get_env("batch")
            env_batch.make_all_batch_files(case)
            if get_model() == "e3sm" and not case.get_value("TEST"):
                input_batch_script = os.path.join(case.get_value("MACHDIR"), "template.case.run.sh")
                env_batch.make_batch_script(input_batch_script, "case.run", case,
                                            outfile=get_batch_script_for_job("case.run.sh"))

            # May need to select new batch settings if pelayout changed
            # (e.g. problem is now too big for prev-selected queue)
            env_batch.set_job_defaults([(case.get_primary_job(), {})], case)
            case.schedule_rewrite(env_batch)

            # Make a copy of env_mach_pes.xml in order to be able
            # to check that it does not change once case.setup is invoked
            case.flush()
            logger.debug("at copy TOTALPES = {}".format(case.get_value("TOTALPES")))
            lock_file("env_mach_pes.xml")
            lock_file("env_batch.xml")

        # Create user_nl files for the required number of instances
        if not os.path.exists("user_nl_cpl"):
            logger.info("Creating user_nl_xxx files for components and cpl")

        # loop over models
        for model in models:
            comp = case.get_value("COMP_{}".format(model))
            logger.debug("Building {} usernl files".format(model))
            _build_usernl_files(case, model, comp)
            if comp == "cism":
                glcroot = case.get_value("COMP_ROOT_DIR_GLC")
                run_cmd_no_fail("{}/cime_config/cism.template {}".format(glcroot, caseroot))

        _build_usernl_files(case, "drv", "cpl")

        # Create needed directories for case
        case.create_dirs()

        logger.info("If an old case build already exists, might want to run \'case.build --clean\' before building")

        # Some tests need namelists created here (ERP) - so do this if we are in test mode
        if test_mode or get_model() == "e3sm":
            logger.info("Generating component namelists as part of setup")
            case.create_namelists()

        # Record env information
        env_module = case.get_env("mach_specific")
        env_module.make_env_mach_specific_file("sh", case)
        env_module.make_env_mach_specific_file("csh", case)
        env_module.save_all_env_info("software_environment.txt")
case.total_tasks * case.thread_count) # May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue) env_batch = case.get_env("batch") env_batch.set_job_defaults([(case.get_primary_job(), {})], case) # create batch files env_batch.make_all_batch_files(case) if get_model() == "e3sm" and not case.get_value("TEST"): input_batch_script = os.path.join(case.get_value("MACHDIR"), "template.case.run.sh") env_batch.make_batch_script( input_batch_script, "case.run", case, outfile=get_batch_script_for_job("case.run.sh")) # Make a copy of env_mach_pes.xml in order to be able # to check that it does not change once case.setup is invoked case.flush() logger.debug("at copy TOTALPES = {}".format( case.get_value("TOTALPES"))) lock_file("env_mach_pes.xml") lock_file("env_batch.xml") # Create user_nl files for the required number of instances if not os.path.exists("user_nl_cpl"): logger.info("Creating user_nl_xxx files for components and cpl") # loop over models for model in models:
if int(task_count) < case.get_value("MAX_TASKS_PER_NODE"): overrides["max_tasks_per_node"] = int(task_count) overrides["job_id"] = case.get_value("CASE") + os.path.splitext(job)[1] if "pleiades" in case.get_value("MACH"): # pleiades jobname needs to be limited to 15 chars overrides["job_id"] = overrides["job_id"][:15] overrides["batchdirectives"] = self.get_batch_directives( case, job, overrides=overrides) overrides["mpirun"] = case.get_mpirun_cmd(job=job) output_text = transform_vars(open(input_template, "r").read(), case=case, subgroup=job, overrides=overrides) output_name = get_batch_script_for_job( job) if outfile is None else outfile logger.info("Creating file {}".format(output_name)) with open(output_name, "w") as fd: fd.write(output_text) # make sure batch script is exectuble os.chmod( output_name, os.stat(output_name).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) def set_job_defaults(self, batch_jobs, case): if self._batchtype is None: self._batchtype = self.get_batch_system_type() if self._batchtype == 'none':