Beispiel #1
0
def _save_logs(case, lid):
###############################################################################
    rundir = case.get_value("RUNDIR")
    logfiles = glob.glob(os.path.join(rundir, "*.log.{}".format(lid)))
    for logfile in logfiles:
        if os.path.isfile(logfile):
            gzip_existing_file(logfile)
Beispiel #2
0
def _archive_atm_costs(lid, rundir):
    atm_chunk_costs_src_path = os.path.join(rundir, "atm_chunk_costs.txt")
    if os.path.exists(atm_chunk_costs_src_path):
        atm_chunk_costs_dst_path = os.path.join(
            rundir, "atm_chunk_costs.{}".format(lid))
        shutil.move(atm_chunk_costs_src_path, atm_chunk_costs_dst_path)
        utils.gzip_existing_file(atm_chunk_costs_dst_path)
Beispiel #3
0
def _archive_memory_profile(lid, rundir):
    # gzip memory profile log
    glob_to_copy = "memory.[0-4].*.log"
    for item in glob.glob(os.path.join(rundir, glob_to_copy)):
        mprof_dst_path = os.path.join(os.path.dirname(item),
                                      (os.path.basename(item) +
                                       ".{}").format(lid))
        shutil.move(item, mprof_dst_path)
        utils.gzip_existing_file(mprof_dst_path)
Beispiel #4
0
def _record_olcf_queue(job_id, lid, full_timing_dir):
    for cmd, filename in [
        ("bjobs -u all >", "bjobsu_all"),
        ("bjobs -r -u all -o 'jobid slots exec_host' >", "bjobsru_allo"),
        ("bjobs -l -UF %s >" % job_id, "bjobslUF_jobid"),
    ]:
        full_cmd = cmd + " " + filename
        utils.run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
        utils.gzip_existing_file(
            os.path.join(full_timing_dir, filename + "." + lid))
Beispiel #5
0
def _record_anl_theta_queue(job_id, lid, full_timing_dir):
    for cmd, filename in [
        (
            "qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry",
            "qstatf",
        ),
        ("qstat -lf %s" % job_id, "qstatf_jobid"),
        ("xtnodestat", "xtnodestat"),
        ("xtprocadmin", "xtprocadmin"),
    ]:
        filename = "%s.%s" % (filename, lid)
        utils.run_cmd_no_fail(cmd,
                              arg_stdout=filename,
                              from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, filename))
Beispiel #6
0
def _record_anl_queue(job_id, lid, full_timing_dir):
    for cmd, filename in [
        ("sinfo -l", "sinfol"),
        ("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
        (
            "squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'",
            "squeuef",
        ),
        ("squeue -t R -o '%.10i %R'", "squeues"),
    ]:
        filename = "%s.%s" % (filename, lid)
        utils.run_cmd_no_fail(cmd,
                              arg_stdout=filename,
                              from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, filename))
Beispiel #7
0
def _record_nersc_queue(job_id, lid, full_timing_dir):
    for cmd, filename in [
        ("sinfo -a -l", "sinfol"),
        ("scontrol show jobid %s" % job_id, "sqsf_jobid"),
            # ("sqs -f", "sqsf"),
        (
            "squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'",
            "squeuef",
        ),
        ("squeue -t R -o '%.10i %R'", "squeues"),
    ]:
        filename = "%s.%s" % (filename, lid)
        utils.run_cmd_no_fail(cmd,
                              arg_stdout=filename,
                              from_dir=full_timing_dir)
        utils.gzip_existing_file(os.path.join(full_timing_dir, filename))
Beispiel #8
0
def post_build(case, logs, build_complete=False, save_build_provenance=True):
###############################################################################
    for log in logs:
        gzip_existing_file(log)

    if build_complete:
        # must ensure there's an lid
        lid = os.environ["LID"] if "LID" in os.environ else get_timestamp("%y%m%d-%H%M%S")
        if save_build_provenance:
            save_build_provenance_sub(case, lid=lid)
        # Set XML to indicate build complete
        case.set_value("BUILD_COMPLETE", True)
        case.set_value("BUILD_STATUS", 0)
        if "SMP_VALUE" in os.environ:
            case.set_value("SMP_BUILD", os.environ["SMP_VALUE"])
            case.flush()

        lock_file("env_build.xml")
Beispiel #9
0
def save_logs(case, lid):
###############################################################################
    logdir = case.get_value("LOGDIR")
    if logdir is not None and len(logdir) > 0:
        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        caseroot = case.get_value("CASEROOT")
        rundir = case.get_value("RUNDIR")
        logfiles = glob.glob(os.path.join(rundir,"*.log.%s"%(lid)))
        for logfile in logfiles:
            if os.path.isfile(logfile):
                logfile_gz = gzip_existing_file(logfile)
                shutil.copy(logfile_gz,
                            os.path.join(caseroot, logdir, os.path.basename(logfile_gz)))
Beispiel #10
0
    expect(not os.path.exists(full_timing_dir),
           "%s already exists" % full_timing_dir)

    os.makedirs(full_timing_dir)
    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if mach == "mira":
        for cmd, filename in [("qstat -lf", "qstatf"),
                              ("qstat -lf %s" % job_id, "qstatf_jobid")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename),
                            from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach in ["corip1", "edison"]:
        for cmd, filename in [("sqs -f", "sqsf"), ("sqs -w -a", "sqsw"),
                              ("sqs -f %s" % job_id, "sqsf_jobid"),
                              ("squeue", "squeuef")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename),
                            from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach == "titan":
        for cmd, filename in [("xtdb2proc -f xtdb2proc", "xtdb2procf"),
                              ("qstat -f > qstat", "qstatf"),
                              ("qstat -f %s > qstatf_jobid" % job_id,
                               "qstatf_jobid"),
                              ("xtnodestat > xtnodestat", "xtnodestatf"),
                              ("showq > showqf", "showqf")]:
Beispiel #11
0
        os.makedirs(full_timing_dir)
    except OSError:
        logger.warning("{} cannot be created. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
        return

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "mira":
            for cmd, filename in [("qstat -f", "qstatf"), ("qstat -lf %s" % job_id, "qstatf_jobid")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "theta":
            for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"), 
                                  ("qstat -lf %s" % job_id, "qstatf_jobid"),
                                  ("xtnodestat", "xtnodestat"),
                                  ("xtprocadmin", "xtprocadmin")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
Beispiel #12
0
    with utils.SharedArea():
        lid = os.environ["LID"] if lid is None else lid

        if case.get_value("SAVE_TIMING"):
            caseroot = case.get_value("CASEROOT")
            rundir = case.get_value("RUNDIR")

            rundir_timing_dir = _archive_timings(lid, rundir)

            _archive_atm_costs(lid, rundir)

            _archive_memory_profile(lid, rundir)

            _archive_spio_stats(lid, rundir)

            utils.gzip_existing_file(
                os.path.join(caseroot, "timing", "e3sm_timing_stats.%s" % lid))

            # JGF: not sure why we do this
            timing_saved_file = "timing.%s.saved" % lid
            utils.touch(os.path.join(caseroot, "timing", timing_saved_file))

            project = case.get_value("PROJECT",
                                     subgroup=case.get_primary_job())
            if not case.is_save_timing_dir_project(project):
                return

            timing_dir = case.get_value("SAVE_TIMING_DIR")
            if timing_dir is None or not os.path.isdir(timing_dir):
                return

            mach = case.get_value("MACH")
Beispiel #13
0
    t3 = time.time()

    if not sharedlib_only:
        logger.info("Time spent not building: {:f} sec".format(t2 - t1))
        logger.info("Time spent building: {:f} sec".format(t3 - t2))
        logger.info("MODEL BUILD HAS FINISHED SUCCESSFULLY")

    return True


###############################################################################
def post_build(case, logs, build_complete=False, save_build_provenance=True):
    ###############################################################################
    for log in logs:
        gzip_existing_file(log)

    if build_complete:
        # must ensure there's an lid
        lid = os.environ["LID"] if "LID" in os.environ else get_timestamp(
            "%y%m%d-%H%M%S")
        if save_build_provenance:
            save_build_provenance_sub(case, lid=lid)
        # Set XML to indicate build complete
        case.set_value("BUILD_COMPLETE", True)
        case.set_value("BUILD_STATUS", 0)
        if "SMP_VALUE" in os.environ:
            case.set_value("SMP_BUILD", os.environ["SMP_VALUE"])

        case.flush()
Beispiel #14
0
def _save_postrun_timing_e3sm(case, lid):
    caseroot = case.get_value("CASEROOT")
    rundir = case.get_value("RUNDIR")

    # tar timings
    rundir_timing_dir = os.path.join(rundir, "timing." + lid)
    shutil.move(os.path.join(rundir, "timing"), rundir_timing_dir)
    with tarfile.open("%s.tar.gz" % rundir_timing_dir, "w:gz") as tfd:
        tfd.add(rundir_timing_dir, arcname=os.path.basename(rundir_timing_dir))

    shutil.rmtree(rundir_timing_dir)

    gzip_existing_file(os.path.join(caseroot, "timing", "e3sm_timing_stats.%s" % lid))

    # JGF: not sure why we do this
    timing_saved_file = "timing.%s.saved" % lid
    touch(os.path.join(caseroot, "timing", timing_saved_file))

    project = case.get_value("PROJECT", subgroup=case.get_primary_job())
    if not case.is_save_timing_dir_project(project):
        return

    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        return

    mach = case.get_value("MACH")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)

    if not os.path.isdir(full_timing_dir):
        return

    # Kill mach_syslog
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid.{}".format(job_id))
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: {}".format(e))
            finally:
                os.remove(syslog_jobid_path)

    # copy timings
    copy_umask("%s.tar.gz" % rundir_timing_dir, full_timing_dir)

    #
    # save output files and logs
    #
    globs_to_copy = []
    if job_id is not None:
        if mach == "titan":
            globs_to_copy.append("%s*OU" % job_id)
        elif mach == "anvil":
            globs_to_copy.append("/home/%s/%s*OU" % (getpass.getuser(), job_id))
        elif mach in ["mira", "theta"]:
            globs_to_copy.append("%s*error" % job_id)
            globs_to_copy.append("%s*output" % job_id)
            globs_to_copy.append("%s*cobaltlog" % job_id)
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            globs_to_copy.append("%s*run*%s" % (case.get_value("CASE"), job_id))

    globs_to_copy.append("logs/run_environment.txt.{}".format(lid))
    globs_to_copy.append("logs/e3sm.log.{}.gz".format(lid))
    globs_to_copy.append("logs/cpl.log.{}.gz".format(lid))
    globs_to_copy.append("timing/*.{}*".format(lid))
    globs_to_copy.append("CaseStatus")

    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            basename = os.path.basename(item)
            if basename != timing_saved_file:
                if lid not in basename and not basename.endswith(".gz"):
                    copy_umask(item, os.path.join(full_timing_dir, "{}.{}".format(basename, lid)))
                else:
                    copy_umask(item, full_timing_dir)

    # zip everything
    for root, _, files in os.walk(full_timing_dir):
        for filename in files:
            if not filename.endswith(".gz"):
                gzip_existing_file(os.path.join(root, filename))
Beispiel #15
0
                elif not fv3_standalone and 'SUCCESSFUL TERMINATION' in fd.read(
                ):
                    count_ok += 1
        if count_ok != cpl_ninst:
            expect(False,
                   "Model did not complete - see {} \n ".format(cpl_logfile))


###############################################################################
def _save_logs(case, lid):
    ###############################################################################
    rundir = case.get_value("RUNDIR")
    logfiles = glob.glob(os.path.join(rundir, "*.log.{}".format(lid)))
    for logfile in logfiles:
        if os.path.isfile(logfile):
            gzip_existing_file(logfile)


######################################################################################
def _resubmit_check(case):
    ###############################################################################
    """
    check to see if we need to do resubmission from this particular job,
    Note that Mira requires special logic
    """
    dout_s = case.get_value("DOUT_S")
    logger.warning("dout_s {} ".format(dout_s))
    mach = case.get_value("MACH")
    logger.warning("mach {} ".format(mach))
    resubmit_num = case.get_value("RESUBMIT")
    logger.warning("resubmit_num {}".format(resubmit_num))
Beispiel #16
0
def _save_prerun_timing_e3sm(case, lid):
    project = case.get_value("PROJECT", subgroup=case.get_primary_job())
    if not case.is_save_timing_dir_project(project):
        return

    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        logger.warning("SAVE_TIMING_DIR {} is not valid. E3SM requires a valid SAVE_TIMING_DIR to archive timing data.".format(timing_dir))
        return

    logger.info("Archiving timing data and associated provenance in {}.".format(timing_dir))
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    if os.path.exists(full_timing_dir):
        logger.warning("{} already exists. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
        return

    try:
        os.makedirs(full_timing_dir)
    except OSError:
        logger.warning("{} cannot be created. Skipping archive of timing data and associated provenance.".format(full_timing_dir))
        return

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "mira":
            for cmd, filename in [("qstat -f", "qstatf"), ("qstat -lf %s" % job_id, "qstatf_jobid")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "theta":
            for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid"),
                                  ("xtnodestat", "xtnodestat"),
                                  ("xtprocadmin", "xtprocadmin")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "titan":
            for cmd, filename in [("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("xtnodestat >", "xtnodestat"),
                                  # ("qstat -f >", "qstatf"),
                                  # ("xtdb2proc -f", "xtdb2proc"),
                                  ("showq >", "showq")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

            # mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
            # run_cmd_no_fail("./mdiag_reduce.csh", arg_stdout=mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
            # gzip_existing_file(mdiag_reduce)
        elif mach == "anvil":
            for cmd, filename in [("qstat -f -1 acme >", "qstatf"),
                                  ("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("qstat -r acme >", "qstatr")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.{}.tar.gz".format(lid)), "w:gz") as tfd:
            tfd.add(source_mods_dir, arcname="SourceMods")

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.{}".format(lid))
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        ".*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros*",
        "README.case",
        "Depends.{}".format(mach),
        "Depends.{}".format(compiler),
        "Depends.{}.{}".format(mach, compiler),
        "software_environment.txt"
        ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            copy_umask(item, os.path.join(case_docs, "{}.{}".format(os.path.basename(item).lstrip("."), lid)))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
        ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            copy_umask(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # Save state of repo
    from_repo = cimeroot if os.path.exists(os.path.join(cimeroot, ".git")) else os.path.dirname(cimeroot)
    desc = get_current_commit(tag=True, repo=from_repo)
    with open(os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), "w") as fd:
        fd.write(desc)

    # What this block does is mysterious to me (JGF)
    if job_id is not None:

        # Kill mach_syslog from previous run if one exists
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid.{}".format(job_id))
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: {}".format(e))
            finally:
                os.remove(syslog_jobid_path)

        # If requested, spawn a mach_syslog process to monitor job progress
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.{}".format(lid))
            os.mkdir(archive_checkpoints)
            touch("{}/e3sm.log.{}".format(rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog {:d} {} {} {} {}/timing/checkpoints {} >& /dev/null & echo $!".format(sample_interval, job_id, lid, rundir, rundir, archive_checkpoints),
                                           from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.{}".format(job_id)), "w") as fd:
                fd.write("{}\n".format(syslog_jobid))
Beispiel #17
0
def save_postrun_provenance_acme(case, lid):
    save_timing = case.get_value("SAVE_TIMING")
    if not save_timing:
        return

    lid = os.environ["LID"] if lid is None else lid

    rundir = case.get_value("RUNDIR")
    timing_dir = case.get_value("SAVE_TIMING_DIR")
    caseroot = case.get_value("CASEROOT")
    mach = case.get_value("MACH")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)

    # Kill mach_syslog
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        syslog_jobid_path = os.path.join(rundir, "syslog_jobid", ".%s" % job_id)
        if os.path.exists(syslog_jobid_path):
            try:
                with open(syslog_jobid_path, "r") as fd:
                    syslog_jobid = int(fd.read().strip())
                os.kill(syslog_jobid, signal.SIGTERM)
            except (ValueError, OSError) as e:
                logger.warning("Failed to kill syslog: %s" % e)
            finally:
                os.remove(syslog_jobid_path)

    # copy/tar timings
    with tarfile.open(os.path.join(full_timing_dir, "timing.%s.tar.gz" % lid), "w:gz") as tfd:
        tfd.add(os.path.join(rundir, "timing"))

    #
    # save output files and logs
    #
    globs_to_copy = []
    if mach == "titan":
        globs_to_copy.append("%s*OU" % job_id)
    elif mach == "mira":
        globs_to_copy.append("%s*output" % job_id)
        globs_to_copy.append("%s*cobaltlog" % job_id)
    elif mach in ["edison", "corip1"]:
        globs_to_copy.append("%s" % case.get_value("CASE"))

    globs_to_copy.append("logs/acme.log.%s.gz" % lid)
    globs_to_copy.append("logs/cpl.log.%s.gz" % lid)
    globs_to_copy.append("timing/*.%s" % lid)
    globs_to_copy.append("CaseStatus")

    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            basename = os.path.basename(item)
            if lid not in basename and not basename.endswith(".gz"):
                shutil.copy(item, os.path.join(full_timing_dir, "%s.%s" % (basename, lid)))
            else:
                shutil.copy(item, full_timing_dir)

    # zip everything
    for root, _, files in os.walk(full_timing_dir):
        for filename in files:
            if not filename.endswith(".gz"):
                gzip_existing_file(os.path.join(root, filename))
Beispiel #18
0
def save_prerun_provenance_acme(case, lid=None):
    if not case.get_value("SAVE_TIMING"):
        return

    lid = os.environ["LID"] if lid is None else lid

    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or timing_dir == 'UNSET':
        logger.warning("ACME requires SAVE_TIMING_DIR to be set in order to save timings. Skipping save timings")
        return

    logger.info("timing dir is %s" % timing_dir)
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    expect(not os.path.exists(full_timing_dir), "%s already exists" % full_timing_dir)

    os.makedirs(full_timing_dir)
    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if mach == "mira":
        for cmd, filename in [("qstat -lf", "qstatf"), ("qstat -lf %s" % job_id, "qstatf_jobid")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach in ["corip1", "edison"]:
        for cmd, filename in [("sqs -f", "sqsf"), ("sqs -w -a", "sqsw"), ("sqs -f %s" % job_id, "sqsf_jobid"), ("squeue", "squeuef")]:
            filename = "%s.%s" % (filename, lid)
            run_cmd_no_fail("%s > %s" % (cmd, filename), from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename))
    elif mach == "titan":
        for cmd, filename in [("xtdb2proc -f xtdb2proc", "xtdb2procf"),
                              ("qstat -f > qstat", "qstatf"),
                              ("qstat -f %s > qstatf_jobid" % job_id, "qstatf_jobid"),
                              ("xtnodestat > xtnodestat", "xtnodestatf"),
                              ("showq > showqf", "showqf")]:
            run_cmd_no_fail(cmd + "." + lid, from_dir=full_timing_dir)
            gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

        mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
        run_cmd_no_fail("./mdiag_reduce.csh > %s" % mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
        gzip_existing_file(mdiag_reduce)

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.%s.tar.gz" % lid), "w:gz") as tfd:
            tfd.add(source_mods_dir)

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.%s" % lid)
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros",
        "README.case",
        "Depends.%s" % mach,
        "Depends.%s" % compiler,
        "Depends.%s.%s" % (mach, compiler),
        "software_environment.txt"
        ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            shutil.copy(item, os.path.join(case_docs, os.path.basename(item) + "." + lid))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
        ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            shutil.copy(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # What this block does is mysterious to me (JGF)
    if job_id is not None:
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.%s" % lid)
            os.mkdir(archive_checkpoints)
            touch("%s/acme.log.%s" % (rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog %d %s %s %s %s/timing/checkpoints %s/checkpoints >& /dev/null & echo $!" %
                                           (sample_interval, job_id, lid, rundir, rundir, archive_checkpoints),
                                           from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.%s" % job_id), "w") as fd:
                fd.write("%s\n" % syslog_jobid)

    # Save state of repo
    run_cmd_no_fail("git describe > %s" % os.path.join(full_timing_dir, "GIT_DESCRIBE.%s" % lid), from_dir=cimeroot)
Beispiel #19
0

###############################################################################
def save_logs(case, lid):
    ###############################################################################
    logdir = case.get_value("LOGDIR")
    if logdir is not None and len(logdir) > 0:
        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        caseroot = case.get_value("CASEROOT")
        rundir = case.get_value("RUNDIR")
        logfiles = glob.glob(os.path.join(rundir, "*.log.%s" % (lid)))
        for logfile in logfiles:
            if os.path.isfile(logfile):
                logfile_gz = gzip_existing_file(logfile)
                shutil.copy(
                    logfile_gz,
                    os.path.join(caseroot, logdir,
                                 os.path.basename(logfile_gz)))


###############################################################################
def resubmit_check(case):
    ###############################################################################

    # check to see if we need to do resubmission from this particular job,
    # Note that Mira requires special logic

    dout_s = case.get_value("DOUT_S")
    logger.warn("dout_s %s " % (dout_s))
Beispiel #20
0
        return

    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "theta":
            for cmd, filename in [("qstat -l --header JobID:JobName:User:Project:WallTime:QueuedTime:Score:RunTime:TimeRemaining:Nodes:State:Location:Mode:Command:Args:Procs:Queue:StartTime:attrs:Geometry", "qstatf"),
                                  ("qstat -lf %s" % job_id, "qstatf_jobid"),
                                  ("xtnodestat", "xtnodestat"),
                                  ("xtprocadmin", "xtprocadmin")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"), ("scontrol show jobid %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["anvil", "chrysalis", "compy"]:
            for cmd, filename in [("sinfo -l", "sinfol"), 
                                  ("squeue -o '%all' --job {}".format(job_id), "squeueall_jobid"),
                                  ("squeue -o '%.10i %.10P %.15u %.20a %.2t %.6D %.8C %.12M %.12l %.20S %.20V %j'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
Beispiel #21
0
def _save_prerun_timing_acme(case, lid):
    timing_dir = case.get_value("SAVE_TIMING_DIR")
    if timing_dir is None or not os.path.isdir(timing_dir):
        logger.warning("SAVE_TIMING_DIR '%s' is not valid. ACME requires a valid SAVE_TIMING_DIR to be set in order to archive timings. Skipping archive timings" % timing_dir)
        return

    logger.info("timing dir is {}".format(timing_dir))
    rundir = case.get_value("RUNDIR")
    blddir = case.get_value("EXEROOT")
    caseroot = case.get_value("CASEROOT")
    cimeroot = case.get_value("CIMEROOT")
    base_case = case.get_value("CASE")
    full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
    expect(not os.path.exists(full_timing_dir), "{} already exists".format(full_timing_dir))

    os.makedirs(full_timing_dir)
    expect(os.path.exists(full_timing_dir), "{} does not exists".format(full_timing_dir))
    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")

    # For some batch machines save queue info
    job_id = _get_batch_job_id_for_syslog(case)
    if job_id is not None:
        if mach == "mira":
            for cmd, filename in [("qstat -f", "qstatf"), ("qstat -lf %s" % job_id, "qstatf_jobid")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach in ["edison", "cori-haswell", "cori-knl"]:
            for cmd, filename in [("sinfo -a -l", "sinfol"), ("sqs -f %s" % job_id, "sqsf_jobid"),
                                  # ("sqs -f", "sqsf"),
                                  ("squeue -o '%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l %.20S %.20V'", "squeuef"),
                                  ("squeue -t R -o '%.10i %R'", "squeues")]:
                filename = "%s.%s" % (filename, lid)
                run_cmd_no_fail(cmd, arg_stdout=filename, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename))
        elif mach == "titan":
            for cmd, filename in [("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("xtnodestat >", "xtnodestat"),
                                  # ("qstat -f >", "qstatf"),
                                  # ("xtdb2proc -f", "xtdb2proc"),
                                  ("showq >", "showq")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

            # mdiag_reduce = os.path.join(full_timing_dir, "mdiag_reduce." + lid)
            # run_cmd_no_fail("./mdiag_reduce.csh", arg_stdout=mdiag_reduce, from_dir=os.path.join(caseroot, "Tools"))
            # gzip_existing_file(mdiag_reduce)
        elif mach == "anvil":
            for cmd, filename in [("qstat -f -1 acme >", "qstatf"),
                                  ("qstat -f %s >" % job_id, "qstatf_jobid"),
                                  ("qstat -r acme >", "qstatr")]:
                full_cmd = cmd + " " + filename
                run_cmd_no_fail(full_cmd + "." + lid, from_dir=full_timing_dir)
                gzip_existing_file(os.path.join(full_timing_dir, filename + "." + lid))

    # copy/tar SourceModes
    source_mods_dir = os.path.join(caseroot, "SourceMods")
    if os.path.isdir(source_mods_dir):
        with tarfile.open(os.path.join(full_timing_dir, "SourceMods.{}.tar.gz".format(lid)), "w:gz") as tfd:
            tfd.add(source_mods_dir, arcname="SourceMods")

    # Save various case configuration items
    case_docs = os.path.join(full_timing_dir, "CaseDocs.{}".format(lid))
    os.mkdir(case_docs)
    globs_to_copy = [
        "CaseDocs/*",
        "*.run",
        "*.xml",
        "user_nl_*",
        "*env_mach_specific*",
        "Macros",
        "README.case",
        "Depends.{}".format(mach),
        "Depends.{}".format(compiler),
        "Depends.{}.{}".format(mach, compiler),
        "software_environment.txt"
        ]
    for glob_to_copy in globs_to_copy:
        for item in glob.glob(os.path.join(caseroot, glob_to_copy)):
            copy_umask(item, os.path.join(case_docs, os.path.basename(item) + "." + lid))

    # Copy some items from build provenance
    blddir_globs_to_copy = [
        "GIT_LOGS_HEAD",
        "build_environment.txt"
        ]
    for blddir_glob_to_copy in blddir_globs_to_copy:
        for item in glob.glob(os.path.join(blddir, blddir_glob_to_copy)):
            copy_umask(item, os.path.join(full_timing_dir, os.path.basename(item) + "." + lid))

    # What this block does is mysterious to me (JGF)
    if job_id is not None:
        sample_interval = case.get_value("SYSLOG_N")
        if sample_interval > 0:
            archive_checkpoints = os.path.join(full_timing_dir, "checkpoints.{}".format(lid))
            os.mkdir(archive_checkpoints)
            touch("{}/acme.log.{}".format(rundir, lid))
            syslog_jobid = run_cmd_no_fail("./mach_syslog {:d} {} {} {} {}/timing/checkpoints {} >& /dev/null & echo $!".format(sample_interval, job_id, lid, rundir, rundir, archive_checkpoints),
                                           from_dir=os.path.join(caseroot, "Tools"))
            with open(os.path.join(rundir, "syslog_jobid.{}".format(job_id)), "w") as fd:
                fd.write("{}\n".format(syslog_jobid))

    # Save state of repo
    if os.path.exists(os.path.join(cimeroot, ".git")):
        run_cmd_no_fail("git describe", arg_stdout=os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), from_dir=cimeroot)
    else:
        run_cmd_no_fail("git describe", arg_stdout=os.path.join(full_timing_dir, "GIT_DESCRIBE.{}".format(lid)), from_dir=os.path.dirname(cimeroot))