def case_run(case):
###############################################################################
    """Run the model for DATA_ASSIMILATION_CYCLES cycles.

    NOTE(review): this variant appears truncated -- sibling variants in this
    file perform a data-assimilation step, postrun provenance, and a resubmit
    check after get_timing, all of which are missing here.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    # Refuse a direct invocation: only the case.submit path wires up
    # short-term archiving.
    # NOTE(review): the 2nd and 3rd literals concatenate without a space
    # ("...automatically.Please submit...") -- message formatting looks off.
    expect(
        run_with_submit,
        "You are not calling the run script via the submit script. "
        "As a result, short-term archiving will not be called automatically."
        "Please submit your run using the submit script like so:"
        " ./case.submit")
    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)
    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)       # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)     # Run the getTiming script
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Top-level run sequence: prerun hook, model run(s), postrun steps.

    Performs DATA_ASSIMILATION_CYCLES model runs (cycles after the first are
    restart runs), optionally invoking an external data-assimilation script
    between cycles, then saves provenance, runs the postrun hook, and
    optionally checks for resubmission.  Returns True on success.
    """
    pre_hook = self.get_value("PRERUN_SCRIPT")
    post_hook = self.get_value("POSTRUN_SCRIPT")
    da_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # DA is active only when cycles were requested and the script file exists
    do_da = (da_cycles > 0
             and len(da_script) > 0
             and os.path.isfile(da_script))

    # Fresh log id for this run
    run_lid = new_lid()

    if pre_hook:
        # Flush state so the external script sees current XML, then re-read
        # anything it may have changed.
        self.flush()
        _do_external(pre_hook, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), run_lid, prefix="prerun")
        self.read_xml()

    for da_index in range(da_cycles):
        if da_index > 0:
            # Every cycle after the first is a restart run with its own lid
            run_lid = new_lid()
            self.set_value("CONTINUE_RUN",
                           self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        run_lid = _run_model(self, run_lid, skip_pnl, da_cycle=da_index)

        if self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING"):
            get_timing(self, run_lid)  # Run the getTiming script

        if do_da:
            self.flush()
            _do_data_assimilation(da_script, self.get_value("CASEROOT"),
                                  da_index, run_lid,
                                  self.get_value("RUNDIR"))
            self.read_xml()

        _save_logs(self, run_lid)

    save_postrun_provenance(self)

    if post_hook:
        self.flush()
        _do_external(post_hook, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), run_lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, run_lid)

    if set_continue_run:
        self.set_value("CONTINUE_RUN",
                       self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)
    return True
def case_run(case, skip_pnl=False):
###############################################################################
    """Run the model, looping over data-assimilation cycles.

    Runs the optional PRERUN_SCRIPT, then DATA_ASSIMILATION_CYCLES model runs
    (cycles after the first are restart runs) with an optional DA script
    between cycles, then provenance, the optional POSTRUN_SCRIPT, and a
    resubmit check.  Returns True on success.
    """
    hook_pre = case.get_value("PRERUN_SCRIPT")
    hook_post = case.get_value("POSTRUN_SCRIPT")
    n_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    da_script = case.get_value("DATA_ASSIMILATION_SCRIPT")
    # DA requires both a positive cycle count and an existing script file
    da_active = (n_cycles > 0
                 and len(da_script) > 0
                 and os.path.isfile(da_script))

    def _run_hook(script, run_lid, prefix):
        # Flush XML state before handing control to the external script,
        # then re-read whatever it may have modified.
        case.flush()
        do_external(script, case.get_value("CASEROOT"),
                    case.get_value("RUNDIR"), run_lid, prefix=prefix)
        case.read_xml()

    # set up the LID
    lid = new_lid()

    if hook_pre:
        _run_hook(hook_pre, lid, "prerun")

    for idx in range(n_cycles):
        if idx > 0:
            # After the first DA cycle, runs are restart runs
            lid = new_lid()
            case.set_value("CONTINUE_RUN",
                           case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        lid = run_model(case, lid, skip_pnl, da_cycle=idx)

        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script

        if da_active:
            case.flush()
            do_data_assimilation(da_script, case.get_value("CASEROOT"),
                                 idx, lid, case.get_value("RUNDIR"))
            case.read_xml()

        save_logs(case, lid)  # Copy log files back to caseroot

    save_postrun_provenance(case)

    if hook_post:
        _run_hook(hook_post, lid, "postrun")
        save_logs(case, lid)  # Copy log files back to caseroot

    logger.warning("check for resubmit")
    resubmit_check(case)
    return True
def run_phase(self):
    """Launch the standalone atm driver through the case's mpirun command."""
    # Mimics a bit of what's done in the typical case.run.  Note that
    # case.get_mpirun_cmd creates a command that runs the executable given by
    # case.run_exe, so (elsewhere in this test script) a link must point from
    # that name to the atm_driver.exe executable.
    lid = new_lid()  # NOTE(review): lid unused locally; new_lid presumably
                     # records the LID elsewhere (e.g. env) -- confirm
    thread_count = self._case.thread_count
    os.environ["OMP_NUM_THREADS"] = str(thread_count)
    mpi_cmd = self._case.get_mpirun_cmd(allow_unresolved_envvars=False)
    run_cmd_no_fail(mpi_cmd, from_dir=self._atm_driver_rundir())
    self._link_to_output_files()
def case_run(case):
###############################################################################
    """Run the model for DATA_ASSIMILATION_CYCLES cycles, then check resubmit.

    Refuses to run unless invoked through case.submit (RUN_WITH_SUBMIT),
    since only that path wires up short-term archiving.  Cycles after the
    first are restart runs.  Returns True on success.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    # Fix: the original concatenated "...automatically." directly onto
    # "Please..." with no separating space.
    expect(
        run_with_submit,
        "You are not calling the run script via the submit script. "
        "As a result, short-term archiving will not be called automatically. "
        "Please submit your run using the submit script like so:"
        " ./case.submit")
    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)
    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)  # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script

        if data_assimilation:
            do_data_assimilation(data_assimilation_script, cycle,
                                 data_assimilation_cycles, lid)

    save_postrun_provenance(case)

    # Fix: logger.warn is a deprecated alias; use logger.warning.
    logger.warning("check for resubmit")
    resubmit_check(case)
    return True
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Top-level run sequence: prerun hook, model run(s) with optional data
    assimilation, postrun hook, and (unless an external workflow owns it) a
    resubmit check.  Each phase is bracketed by model_log timestamps.
    Returns True on success.
    """
    model_log(
        "e3sm", logger,
        "{} CASE.RUN BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    # Set up the run, run the model, do the postrun steps

    # set up the LID
    lid = new_lid()

    prerun_script = self.get_value("PRERUN_SCRIPT")
    if prerun_script:
        model_log(
            "e3sm", logger,
            "{} PRERUN_SCRIPT BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))
        # Flush XML so the external script sees current state; re-read after
        # in case the script modified it.
        self.flush()
        _do_external(prerun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid,
                     prefix="prerun")
        self.read_xml()
        model_log(
            "e3sm", logger,
            "{} PRERUN_SCRIPT HAS FINISHED".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))

    # We might need to tweak these if we want to allow the user to change them
    data_assimilation_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # DA is active only when cycles were requested and the script file exists
    data_assimilation = (data_assimilation_cycles > 0 and
                         len(data_assimilation_script) > 0 and
                         os.path.isfile(data_assimilation_script))

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            self.set_value("CONTINUE_RUN",
                           self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        # WARNING: All case variables are reloaded during run_model to get
        # new values of any variables that may have been changed by
        # the user during model execution. Thus, any local variables
        # set from case variables before this point may be
        # inconsistent with their latest values in the xml files, so
        # should generally be reloaded (via case.get_value(XXX)) if they are
        # still needed.
        model_log(
            "e3sm", logger,
            "{} RUN_MODEL BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        model_log(
            "e3sm", logger,
            "{} RUN_MODEL HAS FINISHED".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))

        if self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING"):
            model_log(
                "e3sm", logger,
                "{} GET_TIMING BEGINS HERE".format(
                    time.strftime("%Y-%m-%d %H:%M:%S")))
            get_timing(self, lid)  # Run the getTiming script
            model_log(
                "e3sm", logger,
                "{} GET_TIMING HAS FINISHED".format(
                    time.strftime("%Y-%m-%d %H:%M:%S")))

        if data_assimilation:
            model_log(
                "e3sm", logger,
                "{} DO_DATA_ASSIMILATION BEGINS HERE".format(
                    time.strftime("%Y-%m-%d %H:%M:%S")))
            # Flush/re-read around the external DA script, same as prerun
            self.flush()
            _do_data_assimilation(data_assimilation_script,
                                  self.get_value("CASEROOT"), cycle, lid,
                                  self.get_value("RUNDIR"))
            self.read_xml()
            model_log(
                "e3sm", logger,
                "{} DO_DATA_ASSIMILATION HAS FINISHED".format(
                    time.strftime("%Y-%m-%d %H:%M:%S")))

        _save_logs(self, lid)  # Copy log files back to caseroot

    model_log(
        "e3sm", logger,
        "{} SAVE_POSTRUN_PROVENANCE BEGINS HERE".format(
            time.strftime("%Y-%m-%d %H:%M:%S")))
    save_postrun_provenance(self)
    model_log(
        "e3sm", logger,
        "{} SAVE_POSTRUN_PROVENANCE HAS FINISHED".format(
            time.strftime("%Y-%m-%d %H:%M:%S")))

    postrun_script = self.get_value("POSTRUN_SCRIPT")
    if postrun_script:
        model_log(
            "e3sm", logger,
            "{} POSTRUN_SCRIPT BEGINS HERE".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))
        self.flush()
        _do_external(postrun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid,
                     prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        model_log(
            "e3sm", logger,
            "{} POSTRUN_SCRIPT HAS FINISHED".format(
                time.strftime("%Y-%m-%d %H:%M:%S")))

    if set_continue_run:
        self.set_value("CONTINUE_RUN",
                       self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

    # Resubmit handling is skipped when an external workflow manager owns it
    external_workflow = self.get_value("EXTERNAL_WORKFLOW")
    if not external_workflow:
        logger.warning("check for resubmit")
        logger.debug("submit_resubmits is {}".format(submit_resubmits))
        if submit_resubmits:
            _resubmit_check(self)

    model_log(
        "e3sm", logger,
        "{} CASE.RUN HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    return True
loop = True case.set_value( "CONTINUE_RUN", case.get_value("RESUBMIT_SETS_CONTINUE_RUN")) elif num_retry_fails > 0 and retry_count >= num_retry_fails: logger.warning("Detected model run failed, restarting") retry_count -= 1 loop = True if loop: # Archive the last consistent set of restart files and restore them if case.get_value("DOUT_S"): case.case_st_archive(resubmit=False) case.restore_from_archive() lid = new_lid() case.create_namelists() if not cmd_success and not loop: # We failed and we're not restarting expect( False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}". format(cmd, model_logfile)) model_log( "e3sm", logger, "{} POST_RUN_CHECK BEGINS HERE".format( time.strftime("%Y-%m-%d %H:%M:%S"))) _post_run_check(case, lid) model_log( "e3sm", logger, "{} POST_RUN_CHECK HAS FINISHED".format(
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Top-level run sequence: prerun hook, model run(s) with optional data
    assimilation, postrun hook, and an optional resubmit check.  Each phase is
    bracketed by model_log timestamps.  Returns True on success.
    """
    model_log("e3sm", logger, "{} CASE.RUN BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    # Set up the run, run the model, do the postrun steps
    prerun_script = self.get_value("PRERUN_SCRIPT")
    postrun_script = self.get_value("POSTRUN_SCRIPT")
    data_assimilation_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # DA is active only when cycles were requested and the script file exists
    data_assimilation = (data_assimilation_cycles > 0 and
                         len(data_assimilation_script) > 0 and
                         os.path.isfile(data_assimilation_script))

    # set up the LID
    lid = new_lid()

    if prerun_script:
        model_log("e3sm", logger, "{} PRERUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # Flush XML so the external script sees current state; re-read after
        # in case the script modified it.
        self.flush()
        _do_external(prerun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid, prefix="prerun")
        self.read_xml()
        model_log("e3sm", logger, "{} PRERUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            self.set_value("CONTINUE_RUN",
                           self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        model_log("e3sm", logger, "{} RUN_MODEL BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        model_log("e3sm", logger, "{} RUN_MODEL HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        if self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING"):
            model_log("e3sm", logger, "{} GET_TIMING BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            get_timing(self, lid)  # Run the getTiming script
            model_log("e3sm", logger, "{} GET_TIMING HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        if data_assimilation:
            model_log("e3sm", logger, "{} DO_DATA_ASSIMILATION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            # Flush/re-read around the external DA script, same as prerun
            self.flush()
            _do_data_assimilation(data_assimilation_script,
                                  self.get_value("CASEROOT"), cycle, lid,
                                  self.get_value("RUNDIR"))
            self.read_xml()
            model_log("e3sm", logger, "{} DO_DATA_ASSIMILATION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        _save_logs(self, lid)  # Copy log files back to caseroot

    model_log("e3sm", logger, "{} SAVE_POSTRUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    save_postrun_provenance(self)
    model_log("e3sm", logger, "{} SAVE_POSTRUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if postrun_script:
        model_log("e3sm", logger, "{} POSTRUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        self.flush()
        _do_external(postrun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        model_log("e3sm", logger, "{} POSTRUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if set_continue_run:
        self.set_value("CONTINUE_RUN",
                       self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)

    model_log("e3sm", logger, "{} CASE.RUN HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    return True
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Execute the model once, retrying on node failures or matched errors.

    MPIRUN_RETRY_REGEX / NODE_FAIL_REGEX are scanned in the model log after a
    run; on a match (within the spare-node / retry budget) the last consistent
    restart set is restored and the run is retried with a fresh lid.  Returns
    the (possibly refreshed) lid; fails via expect() when the run fails and no
    retry is possible.
    """
    logger.debug("{} PRE_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    logger.debug("{} PRE_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    # MPIRUN_RETRY_REGEX allows the mpi command to be reattempted if the
    # failure described by that regular expression is matched in the model log
    # case.spare_nodes is overloaded and may also represent the number of
    # retries to attempt if ALLOCATE_SPARE_NODES is False
    retry_run_re = case.get_value("MPIRUN_RETRY_REGEX")
    node_fail_re = case.get_value("NODE_FAIL_REGEX")
    retry_count = 0
    if retry_run_re:
        retry_run_regex = re.compile(re.escape(retry_run_re))
        retry_count = case.get_value("MPIRUN_RETRY_COUNT")
    if node_fail_re:
        node_fail_regex = re.compile(re.escape(node_fail_re))

    while loop:
        loop = False

        logger.debug("{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        logger.debug("{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        logger.debug("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        logger.debug("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if (retry_run_re or node_fail_re) and os.path.exists(model_logfile):
            # Fix: read the log once via a context manager; the original
            # opened the file twice with open(...).read() and leaked both
            # handles.
            with open(model_logfile, 'r') as logf:
                log_text = logf.read()
            num_node_fails = 0
            num_retry_fails = 0
            if node_fail_re:
                num_node_fails = len(node_fail_regex.findall(log_text))
            if retry_run_re:
                num_retry_fails = len(retry_run_regex.findall(log_text))
            logger.debug("RETRY: num_retry_fails {} spare_nodes {} retry_count {}".
                         format(num_retry_fails, case.spare_nodes, retry_count))
            if num_node_fails > 0 and case.spare_nodes >= num_node_fails:
                # We failed due to node failure!
                logger.warning("Detected model run failed due to node failure, restarting")
                case.spare_nodes -= num_node_fails
                loop = True
                case.set_value("CONTINUE_RUN",
                               case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
            elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                logger.warning("Detected model run failed, restarting")
                retry_count -= 1
                loop = True

            if loop:
                # Archive the last consistent set of restart files and restore them
                if case.get_value("DOUT_S"):
                    case.case_st_archive(resubmit=False)
                    case.restore_from_archive()

                lid = new_lid()
                case.create_namelists()

        if stat != 0 and not loop:
            # We failed and we're not restarting
            expect(False,
                   "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.debug("{} POST_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    logger.debug("{} POST_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return lid
def case_run(self, skip_pnl=False, set_continue_run=False, submit_resubmits=False):
###############################################################################
    """Top-level run sequence: prerun hook, model run(s) with optional data
    assimilation, postrun hook, and an optional resubmit check.

    Timing collection is skipped (and CHECK_TIMING forced off) for the nuopc
    driver.  Returns True on success.
    """
    logger.debug("{} CASE.RUN BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    # Set up the run, run the model, do the postrun steps
    prerun_script = self.get_value("PRERUN_SCRIPT")
    postrun_script = self.get_value("POSTRUN_SCRIPT")
    # Fix: the original fetched COMP_INTERFACE twice; once is enough.
    driver = self.get_value("COMP_INTERFACE")
    data_assimilation_cycles = self.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = self.get_value("DATA_ASSIMILATION_SCRIPT")
    # DA is active only when cycles were requested and the script file exists
    data_assimilation = (data_assimilation_cycles > 0 and
                         len(data_assimilation_script) > 0 and
                         os.path.isfile(data_assimilation_script))

    # set up the LID
    lid = new_lid()

    if prerun_script:
        logger.debug("{} PRERUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        # Flush XML so the external script sees current state; re-read after
        # in case the script modified it.
        self.flush()
        _do_external(prerun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid, prefix="prerun")
        self.read_xml()
        logger.debug("{} PRERUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            lid = new_lid()
            self.set_value("CONTINUE_RUN",
                           self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

        logger.debug("{} RUN_MODEL BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        lid = _run_model(self, lid, skip_pnl, da_cycle=cycle)
        logger.debug("{} RUN_MODEL HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        # TODO mvertens: remove the hard-wiring for nuopc below
        if driver != 'nuopc':
            if self.get_value("CHECK_TIMING") or self.get_value("SAVE_TIMING"):
                logger.debug("{} GET_TIMING BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
                get_timing(self, lid)  # Run the getTiming script
                logger.debug("{} GET_TIMING HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        else:
            self.set_value("CHECK_TIMING", False)

        if data_assimilation:
            logger.debug("{} DO_DATA_ASSIMILATION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            # Flush/re-read around the external DA script, same as prerun
            self.flush()
            _do_data_assimilation(data_assimilation_script,
                                  self.get_value("CASEROOT"), cycle, lid,
                                  self.get_value("RUNDIR"))
            self.read_xml()
            logger.debug("{} DO_DATA_ASSIMILATION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        _save_logs(self, lid)  # Copy log files back to caseroot

    logger.debug("{} SAVE_POSTRUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    save_postrun_provenance(self)
    logger.debug("{} SAVE_POSTRUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if postrun_script:
        logger.debug("{} POSTRUN_SCRIPT BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        self.flush()
        _do_external(postrun_script, self.get_value("CASEROOT"),
                     self.get_value("RUNDIR"), lid, prefix="postrun")
        self.read_xml()
        _save_logs(self, lid)
        logger.debug("{} POSTRUN_SCRIPT HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    if set_continue_run:
        self.set_value("CONTINUE_RUN",
                       self.get_value("RESUBMIT_SETS_CONTINUE_RUN"))

    logger.warning("check for resubmit")
    if submit_resubmits:
        _resubmit_check(self)

    logger.debug("{} CASE.RUN HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    return True
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Execute the model once, restarting when a node failure is detected.

    After a failed run, NODE_FAIL_REGEX is scanned in the model log; if the
    failure count fits within case.spare_nodes, restarts are restored from
    the archive and the run is retried with a fresh lid.  Returns the
    (possibly refreshed) lid; fails via expect() otherwise.
    """
    pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    env_mach_pes = case.get_env("mach_pes")
    comp_classes = case.get_values("COMP_CLASSES")
    thread_count = env_mach_pes.get_max_thread_count(comp_classes)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    cmd = case.get_mpirun_cmd(job="case.run")
    cmd = case.get_resolved_value(cmd)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True
    while loop:
        loop = False
        stat = run_cmd(cmd, from_dir=rundir)[0]
        # Fix: compute the logfile path once (the original recomputed the
        # same path inside the node-fail branch).
        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                if os.path.exists(model_logfile):
                    # Fix: read the log via a context manager; the original
                    # leaked the handle from open(...).read().
                    with open(model_logfile, 'r') as logf:
                        num_fails = len(node_fail_regex.findall(logf.read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case_st_archive(case, no_resubmit=True)
                        restore_from_archive(case)

                        orig_cont = case.get_value("CONTINUE_RUN")
                        if not orig_cont:
                            case.set_value("CONTINUE_RUN", True)
                            create_namelists(case)

                        lid = new_lid()
                        loop = True
                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False,
                       "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid
def case_run(case, skip_pnl=False):
###############################################################################
    """Run the model, looping over data-assimilation cycles.

    Refuses to run unless invoked through case.submit (RUN_WITH_SUBMIT) and,
    outside of test cases, clears the flag so a re-run must also go through
    case.submit.  The prerun/DA/postrun hooks execute inside the cycle loop.
    Returns True on success.
    """
    # Set up the run, run the model, do the postrun steps
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    # Fix: the original concatenated "...automatically." directly onto
    # "Please..." with no separating space.
    expect(run_with_submit,
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically. "
           "Please submit your run using the submit script like so:"
           " ./case.submit Time: {}".format(get_timestamp()))
    # Forces user to use case.submit if they re-submit
    if case.get_value("TESTCASE") is None:
        case.set_value("RUN_WITH_SUBMIT", False)

    prerun_script = case.get_value("PRERUN_SCRIPT")
    postrun_script = case.get_value("POSTRUN_SCRIPT")
    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)
    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        if prerun_script:
            # Flush XML so the external script sees current state; re-read
            # after in case the script modified it.
            case.flush()
            do_external(prerun_script, case.get_value("CASEROOT"),
                        case.get_value("RUNDIR"), lid, prefix="prerun")
            case.read_xml()

        lid = run_model(case, lid, skip_pnl, da_cycle=cycle)
        save_logs(case, lid)  # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script

        if data_assimilation:
            case.flush()
            do_data_assimilation(data_assimilation_script,
                                 case.get_value("CASEROOT"), cycle, lid,
                                 case.get_value("RUNDIR"))
            case.read_xml()

        if postrun_script:
            case.flush()
            do_external(postrun_script, case.get_value("CASEROOT"),
                        case.get_value("RUNDIR"), lid, prefix="postrun")
            case.read_xml()

    save_postrun_provenance(case)

    logger.warning("check for resubmit")
    resubmit_check(case)
    return True
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Execute the model once, restarting when a node failure is detected.

    After a failed run, NODE_FAIL_REGEX is scanned in the model log; if the
    failure count fits within case.spare_nodes, the last consistent restart
    set is restored from the archive and the run is retried with a fresh lid.
    Returns the (possibly refreshed) lid; fails via expect() otherwise.
    """
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True
    while loop:
        loop = False
        save_prerun_provenance(case)
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        # Fix: compute the logfile path once (the original recomputed the
        # same path inside the node-fail branch).
        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                if os.path.exists(model_logfile):
                    # Fix: read the log via a context manager; the original
                    # leaked the handle from open(...).read().
                    with open(model_logfile, 'r') as logf:
                        num_fails = len(node_fail_regex.findall(logf.read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case.case_st_archive(no_resubmit=True)
                        case.restore_from_archive()

                        case.set_value("CONTINUE_RUN",
                                       case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                        lid = new_lid()
                        loop = True
                        case.create_namelists()
                        case.spare_nodes -= num_fails

            if not loop:
                # We failed and we're not restarting
                expect(False,
                       "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    _post_run_check(case, lid)

    return lid