def _save_logs(case, lid):
###############################################################################
    """Compress, in place, every RUNDIR component log stamped with this lid."""
    run_dir = case.get_value("RUNDIR")
    pattern = os.path.join(run_dir, "*.log.{}".format(lid))
    for candidate in glob.glob(pattern):
        # glob can match stale directory entries; only gzip regular files.
        if os.path.isfile(candidate):
            gzip_existing_file(candidate)
def _archive_atm_costs(lid, rundir):
    """Stamp the atm chunk-cost report with the lid and gzip it in place."""
    src = os.path.join(rundir, "atm_chunk_costs.txt")
    if not os.path.exists(src):
        return
    dst = os.path.join(rundir, "atm_chunk_costs.{}".format(lid))
    shutil.move(src, dst)
    utils.gzip_existing_file(dst)
def _archive_memory_profile(lid, rundir):
    """Rename each memory-profile log to carry the lid, then gzip it."""
    # Matches e.g. memory.0.<something>.log for ranks 0-4 in RUNDIR.
    pattern = os.path.join(rundir, "memory.[0-4].*.log")
    for src in glob.glob(pattern):
        dst = os.path.join(
            os.path.dirname(src),
            (os.path.basename(src) + ".{}").format(lid),
        )
        shutil.move(src, dst)
        utils.gzip_existing_file(dst)
def _record_olcf_queue(job_id, lid, full_timing_dir):
    """Snapshot LSF (bjobs) queue state into lid-stamped, gzipped files."""
    probes = (
        ("bjobs -u all >", "bjobsu_all"),
        ("bjobs -r -u all -o 'jobid slots exec_host' >", "bjobsru_allo"),
        ("bjobs -l -UF %s >" % job_id, "bjobslUF_jobid"),
    )
    for redirect_cmd, base_name in probes:
        # Each command already ends in ">", so appending the target name
        # shell-redirects its output into <base_name>.<lid>.
        target = base_name + "." + lid
        utils.run_cmd_no_fail(redirect_cmd + " " + target, from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, target))
def _record_anl_theta_queue(job_id, lid, full_timing_dir):
    """Snapshot theta's Cobalt/Cray queue state into lid-stamped, gzipped files."""
    probes = (
        ("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
        ("qstat -lf %s" % job_id, "qstatf_jobid"),
        ("xtnodestat", "xtnodestat"),
        ("xtprocadmin", "xtprocadmin"),
    )
    for cmd, base_name in probes:
        stamped = "%s.%s" % (base_name, lid)
        utils.run_cmd_no_fail(cmd, arg_stdout=stamped, from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, stamped))
def _record_anl_queue(job_id, lid, full_timing_dir):
    """Snapshot the ANL Slurm queue into lid-stamped, gzipped files."""
    probes = (
        ("sinfo -l", "sinfol"),
        ("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
        ("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
        ("squeue -t R -o '%.10i %R'", "squeues"),
    )
    for cmd, base_name in probes:
        stamped = "%s.%s" % (base_name, lid)
        utils.run_cmd_no_fail(cmd, arg_stdout=stamped, from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, stamped))
def _record_nersc_queue(job_id, lid, full_timing_dir):
    """Snapshot the NERSC Slurm queue into lid-stamped, gzipped files."""
    probes = (
        ("sinfo -a -l", "sinfol"),
        ("scontrol show jobid %s" % job_id, "sqsf_jobid"),
        # ("sqs -f", "sqsf"),
        ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
        ("squeue -t R -o '%.10i %R'", "squeues"),
    )
    for cmd, base_name in probes:
        stamped = "%s.%s" % (base_name, lid)
        utils.run_cmd_no_fail(cmd, arg_stdout=stamped, from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, stamped))
def post_build(case, logs, build_complete=False, save_build_provenance=True):
###############################################################################
    """Gzip build logs and, if the build completed, record status in the case
    XML and lock env_build.xml."""
    for log_path in logs:
        gzip_existing_file(log_path)

    if not build_complete:
        return

    # must ensure there's an lid; fall back to a fresh timestamp
    lid = os.environ["LID"] if "LID" in os.environ else get_timestamp("%y%m%d-%H%M%S")
    if save_build_provenance:
        save_build_provenance_sub(case, lid=lid)

    # Set XML to indicate build complete
    case.set_value("BUILD_COMPLETE", True)
    case.set_value("BUILD_STATUS", 0)
    if "SMP_VALUE" in os.environ:
        case.set_value("SMP_BUILD", os.environ["SMP_VALUE"])
    case.flush()
    lock_file("env_build.xml")
def save_logs(case, lid):
###############################################################################
    """Gzip RUNDIR logs stamped with this lid and copy them into the case
    LOGDIR (resolved against CASEROOT). No-op when LOGDIR is unset/empty.

    BUGFIX: previously the directory was created with os.makedirs(logdir)
    (relative to the process CWD) while files were copied into
    os.path.join(caseroot, logdir, ...). When LOGDIR is relative and the
    CWD is not CASEROOT, those paths diverge and the copy fails. Resolve
    the destination once; os.path.join still honors an absolute LOGDIR.
    """
    logdir = case.get_value("LOGDIR")
    if logdir is None or len(logdir) == 0:
        return

    caseroot = case.get_value("CASEROOT")
    rundir = case.get_value("RUNDIR")

    full_logdir = os.path.join(caseroot, logdir)
    if not os.path.isdir(full_logdir):
        os.makedirs(full_logdir)

    logfiles = glob.glob(os.path.join(rundir, "*.log.%s" % (lid)))
    for logfile in logfiles:
        if os.path.isfile(logfile):
            logfile_gz = gzip_existing_file(logfile)
            shutil.copy(logfile_gz,
                        os.path.join(full_logdir, os.path.basename(logfile_gz)))
# NOTE(review): fragment of a prerun-provenance routine — the enclosing "def"
# is outside this view, so case/lid/full_timing_dir come from upstream code.
# The fragment is cut at a "for" header whose body continues elsewhere.
# Indentation below is reconstructed; confirm against the full file.
expect(not os.path.exists(full_timing_dir), "%s already exists" % full_timing_dir)
os.makedirs(full_timing_dir)
mach = case.get_value("MACH")
compiler = case.get_value("COMPILER")

# For some batch machines save queue info
job_id = _get_batch_job_id_for_syslog(case)
if mach == "mira":
    for cmd, filename in [("qstat -lf", "qstatf"),
                          ("qstat -lf %s" % job_id, "qstatf_jobid")]:
        # Stamp the capture file with the lid, then shell-redirect into it.
        filename = "%s.%s" % (filename, lid)
        run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
        gzip_existing_file(os.path.join(full_timing_dir, filename))
elif mach in ["corip1", "edison"]:
    for cmd, filename in [("sqs -f", "sqsf"),
                          ("sqs -w -a", "sqsw"),
                          ("sqs -f %s" % job_id, "sqsf_jobid"),
                          ("squeue", "squeuef")]:
        filename = "%s.%s" % (filename, lid)
        run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
        gzip_existing_file(os.path.join(full_timing_dir, filename))
elif mach == "titan":
    # NOTE(review): loop body is not visible in this fragment.
    for cmd, filename in [("xtdb2proc -f xtdb2proc", "xtdb2procf"),
                          ("qstat -f > qstat", "qstatf"),
                          ("qstat -f %s > qstatf_jobid" % job_id, "qstatf_jobid"),
                          ("xtnodestat > xtnodestat", "xtnodestatf"),
                          ("showq > showqf", "showqf")]:
# NOTE(review): fragment — opens inside a try block whose "try:" line is not
# visible here; the enclosing "def" is also upstream. Cut mid-loop at the end.
# Indentation below is reconstructed; confirm against the full file.
    os.makedirs(full_timing_dir)
except OSError:
    logger.warning("{} cannot be created. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
    return

mach = case.get_value("MACH")
compiler = case.get_value("COMPILER")

# For some batch machines save queue info
job_id = _get_batch_job_id_for_syslog(case)
if job_id is not None:
    if mach == "mira":
        for cmd, filename in [("qstat -f", "qstatf"),
                              ("qstat -lf %s" % job_id, "qstatf_jobid")]:
            # Stamp the capture file with the lid before writing to it.
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach == "theta":
        for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
                              ("qstat -lf %s" % job_id, "qstatf_jobid"),
                              ("xtnodestat", "xtnodestat"),
                              ("xtprocadmin", "xtprocadmin")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach in ["edison", "cori-haswell", "cori-knl"]:
        for cmd, filename in [("sinfo -a -l", "sinfol"),
                              ("sqs -f %s" % job_id, "sqsf_jobid"),
                              # ("sqs -f", "sqsf"),
                              ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                              ("squeue -t R -o '%.10i %R'", "squeues")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
# NOTE(review): fragment of a postrun-provenance routine; the enclosing "def"
# (which supplies case and lid) is upstream, and the body continues past the
# cut at the end. Indentation reconstructed; confirm against the full file.
with utils.SharedArea():
    # Fall back to the environment's LID when no lid was passed in.
    # NOTE(review): raises KeyError if lid is None and LID is unset — presumably
    # LID is always exported by the batch wrapper; verify against caller.
    lid = os.environ["LID"] if lid is None else lid
    if case.get_value("SAVE_TIMING"):
        caseroot = case.get_value("CASEROOT")
        rundir = case.get_value("RUNDIR")
        rundir_timing_dir = _archive_timings(lid, rundir)
        _archive_atm_costs(lid, rundir)
        _archive_memory_profile(lid, rundir)
        _archive_spio_stats(lid, rundir)
        utils.gzip_existing_file(os.path.join(caseroot, "timing", "e3sm_timing_stats.%s" % lid))

        # JGF: not sure why we do this
        timing_saved_file = "timing.%s.saved" % lid
        utils.touch(os.path.join(caseroot, "timing", timing_saved_file))

        project = case.get_value("PROJECT", subgroup=case.get_primary_job())
        if not case.is_save_timing_dir_project(project):
            return

        timing_dir = case.get_value("SAVE_TIMING_DIR")
        if timing_dir is None or not os.path.isdir(timing_dir):
            return

        mach = case.get_value("MACH")
# NOTE(review): fragment — the timing/return lines below are the tail of a
# build routine whose "def" is upstream; indentation reconstructed.
t3 = time.time()
if not sharedlib_only:
    logger.info("Time spent not building: {:f} sec".format(t2 - t1))
    logger.info("Time spent building: {:f} sec".format(t3 - t2))
    logger.info("MODEL BUILD HAS FINISHED SUCCESSFULLY")
return True

###############################################################################
def post_build(case, logs, build_complete=False, save_build_provenance=True):
###############################################################################
    """Gzip build logs and, when the build completed, record build status in
    the case XML. NOTE(review): cut at the end — a sibling version of this
    function also locks env_build.xml after flush; confirm the tail."""
    for log in logs:
        gzip_existing_file(log)
    if build_complete:
        # must ensure there's an lid
        lid = os.environ["LID"] if "LID" in os.environ else get_timestamp("%y%m%d-%H%M%S")
        if save_build_provenance:
            save_build_provenance_sub(case, lid=lid)
        # Set XML to indicate build complete
        case.set_value("BUILD_COMPLETE", True)
        case.set_value("BUILD_STATUS", 0)
        if "SMP_VALUE" in os.environ:
            case.set_value("SMP_BUILD", os.environ["SMP_VALUE"])
        case.flush()
def _save_postrun_timing_e3sm(case, lid):
    """Archive post-run timing data and job logs for an E3SM case.

    Tars RUNDIR/timing into timing.<lid>.tar.gz, gzips the timing-stats file,
    stops the mach_syslog monitor if one was spawned, and copies the tarball
    plus machine-specific batch output into the performance archive under
    SAVE_TIMING_DIR (if that directory was prepared by the prerun step).
    """
    caseroot = case.get_value("CASEROOT")
    rundir = case.get_value("RUNDIR")

    # tar timings: move RUNDIR/timing aside, tar it, then remove the copy.
    rundir_timing_dir = os.path.join(rundir, "timing." + lid)
    shutil.move(os.path.join(rundir, "timing"), rundir_timing_dir)
    with tarfile.open("%s.tar.gz" % rundir_timing_dir, "w:gz") as tfd:
        tfd.add(rundir_timing_dir, arcname=os.path.basename(rundir_timing_dir))

    shutil.rmtree(rundir_timing_dir)

    gzip_existing_file(os.path.join(caseroot, "timing", "e3sm_timing_stats.%s" % lid))

    # JGF: not sure why we do this
    timing_saved_file = "timing.%s.saved" % lid
    touch(os.path.join(caseroot, "timing", timing_saved_file))

    project = case.get_value("PROJECT", subgroup=case.get_primary_job())
    if not case.is_save_timing_dir_project(project):
        return

    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        return

    mach = case.get_value("MACH")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)

    # The prerun step creates this directory; bail if it never ran.
    if not os.path.isdir(full_timing_dir):
        return

    # Kill mach_syslog
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid.{}".format(job_id))
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: {}".format(e))
            finally:
                # Always remove the pid file so a stale entry cannot linger.
                os.remove(syslog_jobid_path)

    # copy timings
    copy_umask("%s.tar.gz" % rundir_timing_dir, full_timing_dir)

    #
    # save output files and logs
    #
    globs_to_copy = []
    if job_id is not None:
        # Batch stdout/stderr naming differs per machine/scheduler.
        if mach == "titan":
            globs_to_copy.append("%s*OU" % job_id)
        elif mach == "anvil":
            globs_to_copy.append("/home/%s/%s*OU" % (getpass.getuser(), job_id))
        elif mach in ["mira", "theta"]:
            globs_to_copy.append("%s*error" % job_id)
            globs_to_copy.append("%s*output" % job_id)
            globs_to_copy.append("%s*cobaltlog" % job_id)
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))

    globs_to_copy.append("logs/run_environment.txt.{}".format(lid))
    globs_to_copy.append("logs/e3sm.log.{}.gz".format(lid))
    globs_to_copy.append("logs/cpl.log.{}.gz".format(lid))
    globs_to_copy.append("timing/*.{}*".format(lid))
    globs_to_copy.append("CaseStatus")

    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            basename = os.path.basename(item)
            if basename != timing_saved_file:
                # Stamp unstamped, uncompressed files with the lid on copy.
                if lid not in basename and not basename.endswith(".gz"):
                    copy_umask(item, os.path.join(full_timing_dir, "{}.{}".format(basename, lid)))
                else:
                    copy_umask(item, full_timing_dir)

    # zip everything
    for root, _, files in os.walk(full_timing_dir):
        for filename in files:
            if not filename.endswith(".gz"):
                gzip_existing_file(os.path.join(root, filename))
# NOTE(review): fragment — the elif below is the tail of a log-scanning loop
# whose head (and fd/count_ok/cpl_ninst setup) is upstream; indentation of
# this first portion is reconstructed and should be confirmed.
        elif not fv3_standalone and 'SUCCESSFUL TERMINATION' in fd.read(
        ):
            count_ok += 1
    if count_ok != cpl_ninst:
        expect(False, "Model did not complete - see {} \n ".format(cpl_logfile))

###############################################################################
def _save_logs(case, lid):
###############################################################################
    """Compress every RUNDIR component log stamped with this lid, in place."""
    rundir = case.get_value("RUNDIR")
    logfiles = glob.glob(os.path.join(rundir, "*.log.{}".format(lid)))
    for logfile in logfiles:
        if os.path.isfile(logfile):
            gzip_existing_file(logfile)

######################################################################################
def _resubmit_check(case):
###############################################################################
    """
    check to see if we need to do resubmission from this particular job,
    Note that Mira requires special logic
    """
    # NOTE(review): function is cut here — the resubmission decision itself
    # continues past this view; only the value logging is visible.
    dout_s = case.get_value("DOUT_S")
    logger.warning("dout_s {} ".format(dout_s))
    mach = case.get_value("MACH")
    logger.warning("mach {} ".format(mach))
    resubmit_num = case.get_value("RESUBMIT")
    logger.warning("resubmit_num {}".format(resubmit_num))
def _save_prerun_timing_e3sm(case, lid):
    """Archive pre-run provenance for an E3SM case into SAVE_TIMING_DIR.

    Creates performance_archive/<user>/<case>/<lid>, snapshots batch-queue
    state (machine-specific), tars SourceMods, copies case configuration and
    build-provenance files, records the repo state, and (re)spawns the
    mach_syslog monitor when SYSLOG_N > 0.
    """
    project = case.get_value("PROJECT", subgroup=case.get_primary_job())
    if not case.is_save_timing_dir_project(project):
        return

    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        logger.warning("SAVE_TIMING_DIR {} is not valid. E3SM requires a valid SAVE_TIMING_DIR to archive timing data.".format(timing_dir))
        return

    logger.info("Archiving timing data and associated provenance in {}.".format(timing_dir))
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    if os.path.exists(full_timing_dir):
        logger.warning("{} already exists. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
        return

    try:
        os.makedirs(full_timing_dir)
    except OSError:
        logger.warning("{} cannot be created. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
        return

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "mira":
            for cmd, filename in [("qstat -f", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid")]:
                # Stamp the capture file with the lid before writing to it.
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "theta":
            for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid"),
                                  ("xtnodestat", "xtnodestat"),
                                  ("xtprocadmin", "xtprocadmin")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"),
                                  ("sqs -f %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "titan":
            # These commands end in ">", so appending "<filename>.<lid>"
            # shell-redirects the output into the lid-stamped file.
            for cmd, filename in [("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("xtnodestat >", "xtnodestat"),
                                  # ("qstat -f >", "qstatf"),
                                  # ("xtdb2proc -f", "xtdb2proc"),
                                  ("showq >", "showq")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

            # mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
            # run_cmd_no_fail("./mdiag_reduce.csh", arg_stdout=mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
            # gzip_existing_file(mdiag_reduce)
        elif mach == "anvil":
            for cmd, filename in [("qstat -f -1 acme >", "qstatf"),
                                  ("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("qstat -r acme >", "qstatr")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.{}.tar.gz".format(lid)), "w:gz") as tfd:
            tfd.add(source_mods_dir, arcname="SourceMods")

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.{}".format(lid))
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        ".*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros*",
        "README.case",
        "Depends.{}".format(mach),
        "Depends.{}".format(compiler),
        "Depends.{}.{}".format(mach, compiler),
        "software_environment.txt"
    ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            # lstrip(".") so hidden files (e.g. .case.run) get a visible name.
            copy_umask(item, os.path.join(case_docs, "{}.{}".format(os.path.basename(item).lstrip("."), lid)))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
    ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            copy_umask(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # Save state of repo
    from_repo = cimeroot if os.path.exists(os.path.join(cimeroot, ".git")) else os.path.dirname(cimeroot)
    desc = get_current_commit(tag=True, repo=from_repo)
    with open(os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), "w") as fd:
        fd.write(desc)

    # What this block does is mysterious to me (JGF)
    if job_id is not None:
        # Kill mach_syslog from previous run if one exists
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid.{}".format(job_id))
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: {}".format(e))
            finally:
                os.remove(syslog_jobid_path)

        # If requested, spawn a mach_syslog process to monitor job progress
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.{}".format(lid))
            os.mkdir(archive_checkpoints)
            # Pre-create the log so the monitor has something to tail.
            touch("{}/e3sm.log.{}".format(rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog {:d} {} {} {} {}/timing/checkpoints {} >& /dev/null & echo $!".format(sample_interval, job_id, lid, rundir, rundir, archive_checkpoints), from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.{}".format(job_id)), "w") as fd:
                fd.write("{}\n".format(syslog_jobid))
def save_postrun_provenance_acme(case, lid):
    """Archive post-run timing data and job logs for an ACME case.

    Stops the mach_syslog monitor (if running), tars RUNDIR/timing into the
    performance archive, copies machine-specific batch output and case logs
    there, and gzips everything that is not already compressed. No-op unless
    SAVE_TIMING is set.
    """
    save_timing = case.get_value("SAVE_TIMING")
    if not save_timing:
        return

    lid = os.environ["LID"] if lid is None else lid

    rundir = case.get_value("RUNDIR")
    timing_dir = case.get_value("SAVE_TIMING_DIR")
    caseroot = case.get_value("CASEROOT")
    mach = case.get_value("MACH")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)

    # Kill mach_syslog
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        # BUGFIX: the prerun step writes the pid to the single file
        # "syslog_jobid.<job_id>"; the old os.path.join(rundir,
        # "syslog_jobid", ".<job_id>") looked for a ".<job_id>" entry inside
        # a "syslog_jobid" directory, so the monitor was never found/killed.
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid.%s" % job_id)
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: %s" % e)
            finally:
                # Always clear the pid file so a stale entry cannot linger.
                os.remove(syslog_jobid_path)

    # copy/tar timings
    with tarfile.open(os.path.join(full_timing_dir, "timing.%s.tar.gz" % lid), "w:gz") as tfd:
        tfd.add(os.path.join(rundir, "timing"))

    #
    # save output files and logs
    #
    globs_to_copy = []
    # Batch stdout/stderr naming differs per machine/scheduler.
    if mach == "titan":
        globs_to_copy.append("%s*OU" % job_id)
    elif mach == "mira":
        globs_to_copy.append("%s*output" % job_id)
        globs_to_copy.append("%s*cobaltlog" % job_id)
    elif mach in ["edison", "corip1"]:
        globs_to_copy.append("%s" % case.get_value("CASE"))

    globs_to_copy.append("logs/acme.log.%s.gz" % lid)
    globs_to_copy.append("logs/cpl.log.%s.gz" % lid)
    globs_to_copy.append("timing/*.%s" % lid)
    globs_to_copy.append("CaseStatus")

    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            basename = os.path.basename(item)
            # Stamp unstamped, uncompressed files with the lid on copy.
            if lid not in basename and not basename.endswith(".gz"):
                shutil.copy(item, os.path.join(full_timing_dir, "%s.%s" % (basename, lid)))
            else:
                shutil.copy(item, full_timing_dir)

    # zip everything
    for root, _, files in os.walk(full_timing_dir):
        for filename in files:
            if not filename.endswith(".gz"):
                gzip_existing_file(os.path.join(root, filename))
def save_prerun_provenance_acme(case, lid=None):
    """Archive pre-run provenance for an ACME case into SAVE_TIMING_DIR.

    Creates performance_archive/<user>/<case>/<lid>, snapshots batch-queue
    state (machine-specific), tars SourceMods, copies case configuration and
    build-provenance files, optionally spawns a mach_syslog monitor, and
    records `git describe` for the repo. No-op unless SAVE_TIMING is set.
    """
    if not case.get_value("SAVE_TIMING"):
        return

    # Fall back to the environment's LID when no lid was passed in.
    lid = os.environ["LID"] if lid is None else lid
    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or timing_dir == 'UNSET':
        logger.warning("ACME requires SAVE_TIMING_DIR to be set in order to save timings. Skipping save timings")
        return

    logger.info("timing dir is %s" % timing_dir)
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    expect(not os.path.exists(full_timing_dir), "%s already exists" % full_timing_dir)

    os.makedirs(full_timing_dir)
    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if mach == "mira":
        for cmd, filename in [("qstat -lf", "qstatf"),
                              ("qstat -lf %s" % job_id, "qstatf_jobid")]:
            # Stamp the capture file with the lid, then shell-redirect into it.
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach in ["corip1", "edison"]:
        for cmd, filename in [("sqs -f", "sqsf"),
                              ("sqs -w -a", "sqsw"),
                              ("sqs -f %s" % job_id, "sqsf_jobid"),
                              ("squeue", "squeuef")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach == "titan":
        # NOTE(review): appending "." + lid to cmd stamps the redirect target
        # embedded in cmd (e.g. "qstat > qstat.<lid>"), while gzip targets
        # filename + "." + lid — for entries where the embedded target and
        # filename differ (e.g. "qstat" vs "qstatf") the gzip target does not
        # match the file actually written. Looks like a legacy quirk; confirm
        # before relying on these captures.
        for cmd, filename in [("xtdb2proc -f xtdb2proc", "xtdb2procf"),
                              ("qstat -f > qstat", "qstatf"),
                              ("qstat -f %s > qstatf_jobid" % job_id, "qstatf_jobid"),
                              ("xtnodestat > xtnodestat", "xtnodestatf"),
                              ("showq > showqf", "showqf")]:
            run_cmd_no_fail(cmd + "." + lid, from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

        mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
        run_cmd_no_fail("./mdiag_reduce.csh > %s" % mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
        gzip_existing_file(mdiag_reduce)

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.%s.tar.gz" % lid), "w:gz") as tfd:
            tfd.add(source_mods_dir)

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.%s" % lid)
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros",
        "README.case",
        "Depends.%s" % mach,
        "Depends.%s" % compiler,
        "Depends.%s.%s" % (mach, compiler),
        "software_environment.txt"
    ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            shutil.copy(item, os.path.join(case_docs, os.path.basename(item) + "." + lid))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
    ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            shutil.copy(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # What this block does is mysterious to me (JGF)
    if job_id is not None:
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.%s" % lid)
            os.mkdir(archive_checkpoints)
            # Pre-create the log so the monitor has something to tail.
            touch("%s/acme.log.%s" % (rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog %d %s %s %s %s/timing/checkpoints %s/checkpoints >& /dev/null & echo $!" % (sample_interval, job_id, lid, rundir, rundir, archive_checkpoints), from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.%s" % job_id), "w") as fd:
                fd.write("%s\n" % syslog_jobid)

    # Save state of repo
    run_cmd_no_fail("git describe > %s" % os.path.join(full_timing_dir, "GIT_DESCRIBE.%s" % lid), from_dir=cimeroot)
############################################################################### def save_logs(case, lid): ############################################################################### logdir = case.get_value("LOGDIR") if logdir is not None and len(logdir) > 0: if not os.path.isdir(logdir): os.makedirs(logdir) caseroot = case.get_value("CASEROOT") rundir = case.get_value("RUNDIR") logfiles = glob.glob(os.path.join(rundir, "*.log.%s" % (lid))) for logfile in logfiles: if os.path.isfile(logfile): logfile_gz = gzip_existing_file(logfile) shutil.copy( logfile_gz, os.path.join(caseroot, logdir, os.path.basename(logfile_gz))) ############################################################################### def resubmit_check(case): ############################################################################### # check to see if we need to do resubmission from this particular job, # Note that Mira requires special logic dout_s = case.get_value("DOUT_S") logger.warn("dout_s %s " % (dout_s))
# NOTE(review): fragment of a prerun-provenance routine — the leading "return"
# belongs to a guard whose condition is upstream, and the final loop is cut
# before its gzip step. Indentation reconstructed; confirm against full file.
        return

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "theta":
            for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid"),
                                  ("xtnodestat", "xtnodestat"),
                                  ("xtprocadmin", "xtprocadmin")]:
                # Stamp the capture file with the lid before writing to it.
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"),
                                  ("scontrol show jobid %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["anvil", "chrysalis", "compy"]:
            for cmd, filename in [("sinfo -l", "sinfol"),
                                  ("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
                                  ("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
def _save_prerun_timing_acme(case, lid):
    """Archive pre-run provenance for an ACME case into SAVE_TIMING_DIR.

    Creates performance_archive/<user>/<case>/<lid>, snapshots batch-queue
    state (machine-specific), tars SourceMods, copies case configuration and
    build-provenance files, optionally spawns a mach_syslog monitor, and
    records `git describe` for the repo (or its parent checkout).
    """
    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        logger.warning("SAVE_TIMING_DIR '%s' is not valid. ACME requires a valid SAVE_TIMING_DIR to be set in order to archive timings. Skipping archive timings" % timing_dir)
        return

    logger.info("timing dir is {}".format(timing_dir))
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    expect(not os.path.exists(full_timing_dir), "{} already exists".format(full_timing_dir))

    os.makedirs(full_timing_dir)
    expect(os.path.exists(full_timing_dir), "{} does not exists".format(full_timing_dir))

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "mira":
            for cmd, filename in [("qstat -f", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid")]:
                # Stamp the capture file with the lid before writing to it.
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"),
                                  ("sqs -f %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "titan":
            # These commands end in ">", so appending "<filename>.<lid>"
            # shell-redirects the output into the lid-stamped file.
            for cmd, filename in [("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("xtnodestat >", "xtnodestat"),
                                  # ("qstat -f >", "qstatf"),
                                  # ("xtdb2proc -f", "xtdb2proc"),
                                  ("showq >", "showq")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

            # mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
            # run_cmd_no_fail("./mdiag_reduce.csh", arg_stdout=mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
            # gzip_existing_file(mdiag_reduce)
        elif mach == "anvil":
            for cmd, filename in [("qstat -f -1 acme >", "qstatf"),
                                  ("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("qstat -r acme >", "qstatr")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.{}.tar.gz".format(lid)), "w:gz") as tfd:
            tfd.add(source_mods_dir, arcname="SourceMods")

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.{}".format(lid))
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros",
        "README.case",
        "Depends.{}".format(mach),
        "Depends.{}".format(compiler),
        "Depends.{}.{}".format(mach, compiler),
        "software_environment.txt"
    ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            copy_umask(item, os.path.join(case_docs, os.path.basename(item) + "." + lid))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
    ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            copy_umask(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # What this block does is mysterious to me (JGF)
    if job_id is not None:
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.{}".format(lid))
            os.mkdir(archive_checkpoints)
            # Pre-create the log so the monitor has something to tail.
            touch("{}/acme.log.{}".format(rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog {:d} {} {} {} {}/timing/checkpoints {} >& /dev/null & echo $!".format(sample_interval, job_id, lid, rundir, rundir, archive_checkpoints), from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.{}".format(job_id)), "w") as fd:
                fd.write("{}\n".format(syslog_jobid))

    # Save state of repo; prefer cimeroot's own .git, else the parent checkout.
    if os.path.exists(os.path.join(cimeroot, ".git")):
        run_cmd_no_fail("git describe", arg_stdout=os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), from_dir=cimeroot)
    else:
        run_cmd_no_fail("git describe", arg_stdout=os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), from_dir=os.path.dirname(cimeroot))