Example #1
0
def case_run(case):
    ###############################################################################
    """Run the model once per data-assimilation cycle, then do the post-run steps.

    ``expect`` is given the RUN_WITH_SUBMIT value as its condition, together
    with a message telling the user to launch the run through ./case.submit.
    Pre-run provenance is saved once; every cycle then runs the pre-run check,
    the model, the post-run check, copies the logs and (when CHECK_TIMING or
    SAVE_TIMING is set) collects timing information.

    Args:
        case: case object exposing ``get_value`` and ``set_value``.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    expect(
        run_with_submit,
        "You are not calling the run script via the submit script. "
        "As a result, short-term archiving will not be called automatically. "
        "Please submit your run using the submit script like so:"
        " ./case.submit")

    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs and get their own LID
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)  # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script
Example #2
0
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Drive a complete run: optional pre-run hook, DA cycles, optional post-run hook.

    Args:
        skip_pnl: forwarded unchanged to ``_run_model``.
        set_continue_run: when true, CONTINUE_RUN is set from
            RESUBMIT_SETS_CONTINUE_RUN after all cycles have finished.
        submit_resubmits: when true, ``_resubmit_check`` is called at the end.

    Returns:
        Always ``True``.
    """
    pre_hook = self.get_value("PRERUN_SCRIPT")
    post_hook = self.get_value("POSTRUN_SCRIPT")

    num_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # Data assimilation only happens when a script is configured and is a file
    run_da = False
    if num_cycles > 0 and len(da_script) > 0:
        run_da = os.path.isfile(da_script)

    # set up the LID
    lid = new_lid()

    if pre_hook:
        # Flush before the external script runs and re-read the XML afterwards
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(pre_hook, caseroot, rundir, lid, prefix="prerun")
        self.read_xml()

    for cycle in range(num_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
            self.set_value("CONTINUE_RUN", continue_setting)

        # _run_model hands back the LID to use from here on
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)

        wants_timing = self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING")
        if wants_timing:
            get_timing(self, lid)     # Run the getTiming script

        if run_da:
            self.flush()
            caseroot = self.get_value("CASEROOT")
            rundir = self.get_value("RUNDIR")
            _do_data_assimilation(da_script, caseroot, cycle, lid, rundir)
            self.read_xml()

        _save_logs(self, lid)
        save_postrun_provenance(self)

    if post_hook:
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(post_hook, caseroot, rundir, lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)

    if set_continue_run:
        continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
        self.set_value("CONTINUE_RUN", continue_setting)

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)

    return True
Example #3
0
def case_run(case, skip_pnl=False):
###############################################################################
    """Drive a complete run: optional pre-run hook, DA cycles, optional post-run hook.

    Logs are copied once per cycle and once more after the (optional) post-run
    script; ``resubmit_check`` is always called before returning.

    Args:
        case: case object exposing ``get_value``, ``set_value``, ``flush``
            and ``read_xml``.
        skip_pnl: forwarded unchanged to ``run_model``.

    Returns:
        Always ``True``.
    """
    pre_hook = case.get_value("PRERUN_SCRIPT")
    post_hook = case.get_value("POSTRUN_SCRIPT")

    num_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = case.get_value("DATA_ASSIMILATION_SCRIPT")
    # Data assimilation only happens when a script is configured and is a file
    run_da = False
    if num_cycles > 0 and len(da_script) > 0:
        run_da = os.path.isfile(da_script)

    # set up the LID
    lid = new_lid()

    if pre_hook:
        # Flush before the external script runs and re-read the XML afterwards
        case.flush()
        caseroot = case.get_value("CASEROOT")
        rundir = case.get_value("RUNDIR")
        do_external(pre_hook, caseroot, rundir, lid, prefix="prerun")
        case.read_xml()

    for cycle in range(num_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            continue_setting = case.get_value("RESUBMIT_SETS_CONTINUE_RUN")
            case.set_value("CONTINUE_RUN", continue_setting)

        # run_model hands back the LID to use from here on
        lid = run_model(case, lid, skip_pnl, da_cycle=cycle)

        wants_timing = case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING")
        if wants_timing:
            get_timing(case, lid)     # Run the getTiming script

        if run_da:
            case.flush()
            caseroot = case.get_value("CASEROOT")
            rundir = case.get_value("RUNDIR")
            do_data_assimilation(da_script, caseroot, cycle, lid, rundir)
            case.read_xml()

        save_logs(case, lid)       # Copy log files back to caseroot
        save_postrun_provenance(case)

    if post_hook:
        case.flush()
        caseroot = case.get_value("CASEROOT")
        rundir = case.get_value("RUNDIR")
        do_external(post_hook, caseroot, rundir, lid, prefix="postrun")
        case.read_xml()

    # Copied unconditionally, whether or not a post-run script ran
    save_logs(case, lid)

    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
Example #4
0
    def run_phase(self):
        """Run the mpirun command for this test's case, then link the output files.

        This mimics a bit of what's done in the typical case.run. Note that
        case.get_mpirun_cmd creates a command that runs the executable given by
        case.run_exe, so it's important that (elsewhere in this test script) a
        link is created pointing from that to the atm_driver.exe executable.
        """
        # The returned LID is not used here; the call itself is retained.
        # NOTE(review): new_lid may have side effects - confirm before removing.
        new_lid()

        test_case = self._case
        os.environ["OMP_NUM_THREADS"] = str(test_case.thread_count)

        mpirun_cmd = test_case.get_mpirun_cmd(allow_unresolved_envvars=False)
        driver_rundir = self._atm_driver_rundir()
        run_cmd_no_fail(mpirun_cmd, from_dir=driver_rundir)

        self._link_to_output_files()
Example #5
0
def case_run(case):
###############################################################################
    """Run the model once per data-assimilation cycle, then check for resubmit.

    ``expect`` is given the RUN_WITH_SUBMIT value as its condition, together
    with a message telling the user to launch the run through ./case.submit.
    Pre-run provenance is saved once; each cycle runs the model, copies logs,
    optionally collects timing, optionally runs the data-assimilation script
    and saves post-run provenance.

    Args:
        case: case object exposing ``get_value`` and ``set_value``.

    Returns:
        Always ``True``.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    expect(run_with_submit,
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically. "
           "Please submit your run using the submit script like so:"
           " ./case.submit")

    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs and get their own LID
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)       # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)     # Run the getTiming script

        if data_assimilation:
            do_data_assimilation(data_assimilation_script, cycle, data_assimilation_cycles, lid)

        save_postrun_provenance(case)

    # Logger.warn is a deprecated alias; use Logger.warning
    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
Example #6
0
def case_run(self,
             skip_pnl=False,
             set_continue_run=False,
             submit_resubmits=False):
    ###############################################################################
    """Drive a complete run, emitting a timestamped model_log entry around each phase.

    Phases, in order: optional PRERUN_SCRIPT, one model run per
    data-assimilation cycle (with optional timing and data-assimilation
    steps, log copy and post-run provenance), optional POSTRUN_SCRIPT, and
    finally the resubmit check unless EXTERNAL_WORKFLOW is set.

    Args:
        skip_pnl: forwarded unchanged to ``_run_model``.
        set_continue_run: when true, CONTINUE_RUN is set from
            RESUBMIT_SETS_CONTINUE_RUN after all cycles have finished.
        submit_resubmits: when true (and EXTERNAL_WORKFLOW is falsy),
            ``_resubmit_check`` is called.

    Returns:
        Always ``True``.
    """
    def _now():
        # Timestamp prefix shared by every progress message below
        return time.strftime("%Y-%m-%d %H:%M:%S")

    model_log("e3sm", logger, "{} CASE.RUN BEGINS HERE".format(_now()))
    # Set up the run, run the model, do the postrun steps

    # set up the LID
    lid = new_lid()

    pre_hook = self.get_value("PRERUN_SCRIPT")
    if pre_hook:
        model_log("e3sm", logger, "{} PRERUN_SCRIPT BEGINS HERE".format(_now()))
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(pre_hook, caseroot, rundir, lid, prefix="prerun")
        self.read_xml()
        model_log("e3sm", logger, "{} PRERUN_SCRIPT HAS FINISHED".format(_now()))

    # We might need to tweak these if we want to allow the user to change them
    num_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    run_da = False
    if num_cycles > 0 and len(da_script) > 0:
        run_da = os.path.isfile(da_script)

    for cycle in range(num_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
            self.set_value("CONTINUE_RUN", continue_setting)

        # WARNING: All case variables are reloaded during run_model to get
        # new values of any variables that may have been changed by
        # the user during model execution. Thus, any local variables
        # set from case variables before this point may be
        # inconsistent with their latest values in the xml files, so
        # should generally be reloaded (via case.get_value(XXX)) if they are still needed.
        model_log("e3sm", logger, "{} RUN_MODEL BEGINS HERE".format(_now()))
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        model_log("e3sm", logger, "{} RUN_MODEL HAS FINISHED".format(_now()))

        wants_timing = self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING")
        if wants_timing:
            model_log("e3sm", logger, "{} GET_TIMING BEGINS HERE".format(_now()))
            get_timing(self, lid)  # Run the getTiming script
            model_log("e3sm", logger, "{} GET_TIMING HAS FINISHED".format(_now()))

        if run_da:
            model_log("e3sm", logger,
                      "{} DO_DATA_ASSIMILATION BEGINS HERE".format(_now()))
            self.flush()
            caseroot = self.get_value("CASEROOT")
            rundir = self.get_value("RUNDIR")
            _do_data_assimilation(da_script, caseroot, cycle, lid, rundir)
            self.read_xml()
            model_log("e3sm", logger,
                      "{} DO_DATA_ASSIMILATION HAS FINISHED".format(_now()))

        _save_logs(self, lid)  # Copy log files back to caseroot

        model_log("e3sm", logger,
                  "{} SAVE_POSTRUN_PROVENANCE BEGINS HERE".format(_now()))
        save_postrun_provenance(self)
        model_log("e3sm", logger,
                  "{} SAVE_POSTRUN_PROVENANCE HAS FINISHED".format(_now()))

    post_hook = self.get_value("POSTRUN_SCRIPT")
    if post_hook:
        model_log("e3sm", logger, "{} POSTRUN_SCRIPT BEGINS HERE".format(_now()))
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(post_hook, caseroot, rundir, lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        model_log("e3sm", logger, "{} POSTRUN_SCRIPT HAS FINISHED".format(_now()))

    if set_continue_run:
        continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
        self.set_value("CONTINUE_RUN", continue_setting)

    # The resubmit check is skipped entirely when EXTERNAL_WORKFLOW is set
    if not self.get_value("EXTERNAL_WORKFLOW"):
        logger.warning("check for resubmit")

        logger.debug("submit_resubmits is {}".format(submit_resubmits))
        if submit_resubmits:
            _resubmit_check(self)

    model_log("e3sm", logger, "{} CASE.RUN HAS FINISHED".format(_now()))
    return True
Example #7
0
                    loop = True
                    case.set_value(
                        "CONTINUE_RUN",
                        case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                    logger.warning("Detected model run failed, restarting")
                    retry_count -= 1
                    loop = True

                if loop:
                    # Archive the last consistent set of restart files and restore them
                    if case.get_value("DOUT_S"):
                        case.case_st_archive(resubmit=False)
                        case.restore_from_archive()

                    lid = new_lid()
                    case.create_namelists()

        if not cmd_success and not loop:
            # We failed and we're not restarting
            expect(
                False,
                "RUN FAIL: Command '{}' failed\nSee log file for details: {}".
                format(cmd, model_logfile))

    model_log(
        "e3sm", logger, "{} POST_RUN_CHECK BEGINS HERE".format(
            time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    model_log(
        "e3sm", logger, "{} POST_RUN_CHECK HAS FINISHED".format(
Example #8
0
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Drive a complete run, emitting a timestamped model_log entry around each phase.

    Phases, in order: optional PRERUN_SCRIPT, one model run per
    data-assimilation cycle (with optional timing and data-assimilation
    steps, log copy and post-run provenance), optional POSTRUN_SCRIPT, and
    the resubmit check.

    Args:
        skip_pnl: forwarded unchanged to ``_run_model``.
        set_continue_run: when true, CONTINUE_RUN is set from
            RESUBMIT_SETS_CONTINUE_RUN after all cycles have finished.
        submit_resubmits: when true, ``_resubmit_check`` is called at the end.

    Returns:
        Always ``True``.
    """
    def _announce(template):
        # Fill the template's placeholder with the current timestamp and log it
        model_log("e3sm", logger, template.format(time.strftime("%Y-%m-%d %H:%M:%S")))

    _announce("{} CASE.RUN BEGINS HERE")
    # Set up the run, run the model, do the postrun steps
    pre_hook = self.get_value("PRERUN_SCRIPT")
    post_hook = self.get_value("POSTRUN_SCRIPT")

    num_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # Data assimilation only happens when a script is configured and is a file
    run_da = False
    if num_cycles > 0 and len(da_script) > 0:
        run_da = os.path.isfile(da_script)

    # set up the LID
    lid = new_lid()

    if pre_hook:
        _announce("{} PRERUN_SCRIPT BEGINS HERE")
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(pre_hook, caseroot, rundir, lid, prefix="prerun")
        self.read_xml()
        _announce("{} PRERUN_SCRIPT HAS FINISHED")

    for cycle in range(num_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
            self.set_value("CONTINUE_RUN", continue_setting)

        _announce("{} RUN_MODEL BEGINS HERE")
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        _announce("{} RUN_MODEL HAS FINISHED")

        wants_timing = self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING")
        if wants_timing:
            _announce("{} GET_TIMING BEGINS HERE")
            get_timing(self, lid)     # Run the getTiming script
            _announce("{} GET_TIMING HAS FINISHED")

        if run_da:
            _announce("{} DO_DATA_ASSIMILATION BEGINS HERE")
            self.flush()
            caseroot = self.get_value("CASEROOT")
            rundir = self.get_value("RUNDIR")
            _do_data_assimilation(da_script, caseroot, cycle, lid, rundir)
            self.read_xml()
            _announce("{} DO_DATA_ASSIMILATION HAS FINISHED")

        _save_logs(self, lid)       # Copy log files back to caseroot

        _announce("{} SAVE_POSTRUN_PROVENANCE BEGINS HERE")
        save_postrun_provenance(self)
        _announce("{} SAVE_POSTRUN_PROVENANCE HAS FINISHED")

    if post_hook:
        _announce("{} POSTRUN_SCRIPT BEGINS HERE")
        self.flush()
        caseroot = self.get_value("CASEROOT")
        rundir = self.get_value("RUNDIR")
        _do_external(post_hook, caseroot, rundir, lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        _announce("{} POSTRUN_SCRIPT HAS FINISHED")

    if set_continue_run:
        continue_setting = self.get_value("RESUBMIT_SETS_CONTINUE_RUN")
        self.set_value("CONTINUE_RUN", continue_setting)

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)

    _announce("{} CASE.RUN HAS FINISHED")

    return True
Example #9
0
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Run the model command, re-attempting it when the model log matches a retry pattern.

    After each attempt the model log ``<MODEL>.log.<lid>`` in RUNDIR is
    scanned (when it exists) for NODE_FAIL_REGEX and MPIRUN_RETRY_REGEX. A
    node-failure match that can be covered by ``case.spare_nodes``, or a retry
    match that is within the remaining retry count, triggers another attempt
    with a fresh LID. Note that the log is scanned regardless of the command's
    exit status. A non-zero status with no restart scheduled is reported via
    ``expect(False, ...)``.

    Args:
        case: case object for the run.
        lid: log identifier used to locate the model log of the first attempt.
        skip_pnl: forwarded to ``_pre_run_check``.
        da_cycle: forwarded to ``_pre_run_check``.

    Returns:
        The LID of the last attempt (differs from ``lid`` after a restart).
    """
    logger.debug("{} PRE_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    logger.debug("{} PRE_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")

    # MPIRUN_RETRY_REGEX allows the mpi command to be reattempted if the
    # failure described by that regular expression is matched in the model log
    # case.spare_nodes is overloaded and may also represent the number of
    # retries to attempt if ALLOCATE_SPARE_NODES is False
    retry_run_re = case.get_value("MPIRUN_RETRY_REGEX")
    node_fail_re = case.get_value("NODE_FAIL_REGEX")
    retry_run_regex = None
    node_fail_regex = None
    retry_count = 0
    if retry_run_re:
        retry_run_regex = re.compile(re.escape(retry_run_re))
        retry_count = case.get_value("MPIRUN_RETRY_COUNT")
    if node_fail_re:
        node_fail_regex = re.compile(re.escape(node_fail_re))

    def run_func():
        # cmd and rundir never change, so one callable serves every attempt
        return run_cmd(cmd, from_dir=rundir)[0]

    loop = True
    while loop:
        loop = False

        logger.debug("{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        logger.debug("{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        logger.debug("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        stat = run_and_log_case_status(run_func, "model execution", caseroot=case.get_value("CASEROOT"))
        logger.debug("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if (retry_run_re or node_fail_re) and os.path.exists(model_logfile):
            # Read the log once and close the handle deterministically
            with open(model_logfile, "r") as log_fd:
                log_text = log_fd.read()

            num_node_fails = 0
            num_retry_fails = 0
            if node_fail_re:
                num_node_fails = len(node_fail_regex.findall(log_text))
            if retry_run_re:
                num_retry_fails = len(retry_run_regex.findall(log_text))
            logger.debug("RETRY: num_retry_fails {} spare_nodes {} retry_count {}".
                         format(num_retry_fails, case.spare_nodes, retry_count))

            if num_node_fails > 0 and case.spare_nodes >= num_node_fails:
                # We failed due to node failure!
                logger.warning("Detected model run failed due to node failure, restarting")
                case.spare_nodes -= num_node_fails
                loop = True
                case.set_value("CONTINUE_RUN",
                               case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
            elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                logger.warning("Detected model run failed, restarting")
                retry_count -= 1
                loop = True

            if loop:
                # Archive the last consistent set of restart files and restore them
                if case.get_value("DOUT_S"):
                    case.case_st_archive(resubmit=False)
                    case.restore_from_archive()

                # The next attempt gets its own LID
                lid = new_lid()
                case.create_namelists()

        if stat != 0 and not loop:
            # We failed and we're not restarting
            expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.debug("{} POST_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    logger.debug("{} POST_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return lid
Example #10
0
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Drive a complete run: optional pre-run hook, DA cycles, optional post-run hook.

    Timing collection is skipped when COMP_INTERFACE is 'nuopc'; in that case
    CHECK_TIMING is set to False instead.

    Args:
        skip_pnl: forwarded unchanged to ``_run_model``.
        set_continue_run: when true, CONTINUE_RUN is set from
            RESUBMIT_SETS_CONTINUE_RUN after all cycles have finished.
        submit_resubmits: when true, ``_resubmit_check`` is called at the end.

    Returns:
        Always ``True``.
    """
    logger.debug("{} CASE.RUN BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    # Set up the run, run the model, do the postrun steps
    prerun_script = self.get_value("PRERUN_SCRIPT")
    postrun_script = self.get_value("POSTRUN_SCRIPT")
    # Looked up once; the original fetched this value twice back to back
    driver = self.get_value("COMP_INTERFACE")

    data_assimilation_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    data_assimilation = (data_assimilation_cycles > 0 and
                         len(data_assimilation_script) > 0 and
                         os.path.isfile(data_assimilation_script))

    # set up the LID
    lid = new_lid()

    if prerun_script:
        logger.debug("{} PRERUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # Flush before the external script runs and re-read the XML afterwards
        self.flush()
        _do_external(prerun_script, self.get_value("CASEROOT"), self.get_value("RUNDIR"),
                    lid, prefix="prerun")
        self.read_xml()
        logger.debug("{} PRERUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            self.set_value("CONTINUE_RUN",
                           self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        logger.debug("{} RUN_MODEL BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # _run_model hands back the LID to use from here on
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        logger.debug("{} RUN_MODEL HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        # TODO mvertens: remove the hard-wiring for nuopc below
        if driver != 'nuopc':
            if self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING"):
                logger.debug("{} GET_TIMING BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
                get_timing(self, lid)     # Run the getTiming script
                logger.debug("{} GET_TIMING HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        else:
            self.set_value("CHECK_TIMING",False)

        if data_assimilation:
            logger.debug("{} DO_DATA_ASSIMILATION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            self.flush()
            _do_data_assimilation(data_assimilation_script, self.get_value("CASEROOT"), cycle, lid,
                                 self.get_value("RUNDIR"))
            self.read_xml()
            logger.debug("{} DO_DATA_ASSIMILATION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        _save_logs(self, lid)       # Copy log files back to caseroot

        logger.debug("{} SAVE_POSTRUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_postrun_provenance(self)
        logger.debug("{} SAVE_POSTRUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if postrun_script:
        logger.debug("{} POSTRUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        self.flush()
        _do_external(postrun_script, self.get_value("CASEROOT"), self.get_value("RUNDIR"),
                    lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        logger.debug("{} POSTRUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if set_continue_run:
        self.set_value("CONTINUE_RUN",
                       self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)

    logger.debug("{} CASE.RUN HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return True
Example #11
0
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Run the model command, restarting it when the log shows a recoverable node failure.

    When the command exits non-zero and NODE_FAIL_REGEX is set, the model log
    ``<MODEL>.log.<lid>`` in RUNDIR is scanned (when it exists) for that
    pattern. If matches are found and ``case.spare_nodes`` covers them, the
    restart files are archived and restored, CONTINUE_RUN is forced on, and
    the command is attempted again with a fresh LID. Any other non-zero exit
    is reported via ``expect(False, ...)``.

    Args:
        case: case object for the run.
        lid: log identifier used to locate the model log of the first attempt.
        skip_pnl: forwarded to ``pre_run_check``.
        da_cycle: forwarded to ``pre_run_check``.

    Returns:
        The LID of the last attempt (differs from ``lid`` after a restart).
    """
    pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    env_mach_pes = case.get_env("mach_pes")
    comp_classes = case.get_values("COMP_CLASSES")
    thread_count = env_mach_pes.get_max_thread_count(comp_classes)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    cmd = case.get_mpirun_cmd(job="case.run")
    cmd = case.get_resolved_value(cmd)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    while loop:
        loop = False
        stat = run_cmd(cmd, from_dir=rundir)[0]
        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                if os.path.exists(model_logfile):
                    # Close the log deterministically instead of leaking the handle
                    with open(model_logfile, "r") as log_fd:
                        num_fails = len(node_fail_regex.findall(log_fd.read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case_st_archive(case, no_resubmit=True)
                        restore_from_archive(case)

                        # Namelists are only regenerated when CONTINUE_RUN had to be switched on
                        orig_cont = case.get_value("CONTINUE_RUN")
                        if not orig_cont:
                            case.set_value("CONTINUE_RUN", True)
                            create_namelists(case)

                        # The next attempt gets its own LID
                        lid = new_lid()
                        loop = True

                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid
Example #12
0
def case_run(case, skip_pnl=False):
###############################################################################
    """Run the model once per data-assimilation cycle, with per-cycle pre/post scripts.

    ``expect`` is given the RUN_WITH_SUBMIT value as its condition, together
    with a message telling the user to launch the run through ./case.submit.
    When TESTCASE is None, RUN_WITH_SUBMIT is then reset to False. Each cycle
    runs the optional PRERUN_SCRIPT, the model, log copy, optional timing,
    optional data assimilation, the optional POSTRUN_SCRIPT and post-run
    provenance.

    Args:
        case: case object exposing ``get_value``, ``set_value``, ``flush``
            and ``read_xml``.
        skip_pnl: forwarded unchanged to ``run_model``.

    Returns:
        Always ``True``.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    expect(run_with_submit,
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically. "
           "Please submit your run using the submit script like so:"
           " ./case.submit Time: {}".format(get_timestamp()))

    # Forces user to use case.submit if they re-submit
    if case.get_value("TESTCASE") is None:
        case.set_value("RUN_WITH_SUBMIT", False)

    prerun_script = case.get_value("PRERUN_SCRIPT")
    postrun_script = case.get_value("POSTRUN_SCRIPT")

    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs and get their own LID
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        if prerun_script:
            # Flush before the external script runs and re-read the XML afterwards
            case.flush()
            do_external(prerun_script, case.get_value("CASEROOT"), case.get_value("RUNDIR"),
                        lid, prefix="prerun")
            case.read_xml()

        # run_model hands back the LID to use from here on
        lid = run_model(case, lid, skip_pnl, da_cycle=cycle)
        save_logs(case, lid)       # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)     # Run the getTiming script

        if data_assimilation:
            case.flush()
            do_data_assimilation(data_assimilation_script, case.get_value("CASEROOT"), cycle, lid,
                                 case.get_value("RUNDIR"))
            case.read_xml()

        if postrun_script:
            case.flush()
            do_external(postrun_script, case.get_value("CASEROOT"), case.get_value("RUNDIR"),
                        lid, prefix="postrun")
            case.read_xml()

        save_postrun_provenance(case)

    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
Example #13
0
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Run the model command, restarting it when the log shows a recoverable node failure.

    Pre-run provenance is saved before every attempt. When the command exits
    non-zero and NODE_FAIL_REGEX is set, the model log ``<MODEL>.log.<lid>``
    in RUNDIR is scanned (when it exists) for that pattern. If matches are
    found and ``case.spare_nodes`` covers them, the restart files are archived
    and restored, CONTINUE_RUN is set from RESUBMIT_SETS_CONTINUE_RUN, the
    namelists are recreated and the command is attempted again with a fresh
    LID. Any other non-zero exit is reported via ``expect(False, ...)``.

    Args:
        case: case object for the run.
        lid: log identifier used to locate the model log of the first attempt.
        skip_pnl: forwarded to ``_pre_run_check``.
        da_cycle: forwarded to ``_pre_run_check``.

    Returns:
        The LID of the last attempt (differs from ``lid`` after a restart).
    """
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")

    def run_func():
        # cmd and rundir never change, so one callable serves every attempt
        return run_cmd(cmd, from_dir=rundir)[0]

    loop = True
    while loop:
        loop = False

        save_prerun_provenance(case)
        stat = run_and_log_case_status(run_func, "model execution", caseroot=case.get_value("CASEROOT"))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                if os.path.exists(model_logfile):
                    # Close the log deterministically instead of leaking the handle
                    with open(model_logfile, "r") as log_fd:
                        num_fails = len(node_fail_regex.findall(log_fd.read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case.case_st_archive(no_resubmit=True)
                        case.restore_from_archive()

                        case.set_value("CONTINUE_RUN",
                                       case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

                        # The next attempt gets its own LID
                        lid = new_lid()
                        loop = True

                        case.create_namelists()

                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    _post_run_check(case, lid)

    return lid