def case_run(case):
###############################################################################
    """Run the model for every data-assimilation cycle and do the postrun steps.

    Expects to be invoked via ./case.submit (aborts otherwise). For each DA
    cycle after the first, the run is continued as a restart with a fresh LID.
    Returns True on success; a failed run raises via expect()/run_model().
    """
    # Refuse to run unless launched through the submit script, since only
    # that path wires up short-term archiving.
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    expect(run_with_submit,
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically."
           "Please submit your run using the submit script like so:"
           " ./case.submit")

    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID
    lid = new_lid()

    save_prerun_provenance(case)

    for cycle in range(data_assimilation_cycles):
        # After the first DA cycle, runs are restart runs
        if cycle > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)       # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script

        if data_assimilation:
            do_data_assimilation(data_assimilation_script, cycle,
                                 data_assimilation_cycles, lid)

    save_postrun_provenance(case)

    # FIX: logger.warn is deprecated in favor of logger.warning
    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
def case_run(case):
###############################################################################
    """Run the model (optionally for several data-assimilation cycles) and do
    the postrun steps.

    Expects to be invoked via ./case.submit (aborts otherwise). A single LID,
    derived from the wall-clock time, is used for every cycle and exported via
    the LID environment variable. Returns True on success.
    """
    # Refuse to run unless launched through the submit script, since only
    # that path wires up short-term archiving.
    run_with_submit = case.get_value("RUN_WITH_SUBMIT")
    expect(run_with_submit,
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically."
           "Please submit your run using the submit script like so:"
           " ./case.submit")

    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    # set up the LID (timestamp-based run identifier, also exported for
    # child processes via the environment)
    lid = time.strftime("%y%m%d-%H%M%S")
    os.environ["LID"] = lid

    save_prerun_provenance(case)

    for _ in range(data_assimilation_cycles):
        pre_run_check(case)
        run_model(case)
        post_run_check(case, lid)
        save_logs(case, lid)       # Copy log files back to caseroot
        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            get_timing(case, lid)  # Run the getTiming script

        if data_assimilation:
            do_data_assimilation(data_assimilation_script, lid)

    save_postrun_provenance(case)

    # FIX: logger.warn is deprecated in favor of logger.warning
    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Launch the model via mpirun, retrying on recognized transient failures.

    Runs pre-run checks, executes the mpirun command in RUNDIR, and scans the
    model log for failure signatures. A node failure (NODE_FAIL_REGEX) is
    retried while spare nodes remain; a generic transient failure
    (MPIRUN_RETRY_REGEX) is retried up to MPIRUN_RETRY_COUNT times. Each retry
    restores the last consistent restart set (when DOUT_S is on), regenerates
    namelists, and gets a fresh LID. Returns the LID of the final attempt;
    aborts via expect() when the run fails with no retry available.
    """
    logger.debug("{} PRE_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)
    logger.debug("{} PRE_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS
    os.environ["OMP_NUM_THREADS"] = str(case.thread_count)

    # Run the model
    cmd = case.get_mpirun_cmd(allow_unresolved_envvars=False)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    # MPIRUN_RETRY_REGEX allows the mpi command to be reattempted if the
    # failure described by that regular expression is matched in the model log
    # case.spare_nodes is overloaded and may also represent the number of
    # retries to attempt if ALLOCATE_SPARE_NODES is False
    retry_run_re = case.get_value("MPIRUN_RETRY_REGEX")
    node_fail_re = case.get_value("NODE_FAIL_REGEX")
    retry_count = 0
    if retry_run_re:
        # NOTE(review): re.escape turns the configured pattern into a literal
        # string match; this mirrors upstream behavior — confirm intended.
        retry_run_regex = re.compile(re.escape(retry_run_re))
        retry_count = case.get_value("MPIRUN_RETRY_COUNT")
    if node_fail_re:
        node_fail_regex = re.compile(re.escape(node_fail_re))

    while loop:
        loop = False

        logger.debug("{} SAVE_PRERUN_PROVENANCE BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        save_prerun_provenance(case)
        logger.debug("{} SAVE_PRERUN_PROVENANCE HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        logger.debug("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        logger.debug("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if retry_run_re or node_fail_re:
            if os.path.exists(model_logfile):
                num_node_fails = 0
                num_retry_fails = 0
                # FIX: read the log exactly once and close the file handle
                # (original opened it twice and leaked both handles)
                with open(model_logfile, 'r') as log_fd:
                    log_text = log_fd.read()
                if node_fail_re:
                    num_node_fails = len(node_fail_regex.findall(log_text))
                if retry_run_re:
                    num_retry_fails = len(retry_run_regex.findall(log_text))
                logger.debug("RETRY: num_retry_fails {} spare_nodes {} retry_count {}".
                             format(num_retry_fails, case.spare_nodes, retry_count))
                if num_node_fails > 0 and case.spare_nodes >= num_node_fails:
                    # We failed due to node failure!
                    logger.warning("Detected model run failed due to node failure, restarting")
                    case.spare_nodes -= num_node_fails
                    loop = True
                    case.set_value("CONTINUE_RUN",
                                   case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                elif num_retry_fails > 0 and retry_count >= num_retry_fails:
                    logger.warning("Detected model run failed, restarting")
                    retry_count -= 1
                    loop = True

                if loop:
                    # Archive the last consistent set of restart files and restore them
                    if case.get_value("DOUT_S"):
                        case.case_st_archive(resubmit=False)
                        case.restore_from_archive()

                    lid = new_lid()
                    case.create_namelists()

        if stat != 0 and not loop:
            # We failed and we're not restarting
            expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.debug("{} POST_RUN_CHECK BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    _post_run_check(case, lid)
    logger.debug("{} POST_RUN_CHECK HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    return lid
def case_run(case, skip_pnl=False):
###############################################################################
    """Drive the full run sequence: prerun hook, model execution per
    data-assimilation cycle, DA script, postrun hook, and resubmit check.

    Must be invoked through ./case.submit; aborts otherwise. Returns True on
    success.
    """
    # Guard: only the submit script sets RUN_WITH_SUBMIT, and only that path
    # enables automatic short-term archiving.
    expect(case.get_value("RUN_WITH_SUBMIT"),
           "You are not calling the run script via the submit script. "
           "As a result, short-term archiving will not be called automatically."
           "Please submit your run using the submit script like so:"
           " ./case.submit Time: {}".format(get_timestamp()))

    # Forces user to use case.submit if they re-submit
    if case.get_value("TESTCASE") is None:
        case.set_value("RUN_WITH_SUBMIT", False)

    prerun_script = case.get_value("PRERUN_SCRIPT")
    postrun_script = case.get_value("POSTRUN_SCRIPT")

    data_assimilation = case.get_value("DATA_ASSIMILATION")
    data_assimilation_cycles = case.get_value("DATA_ASSIMILATION_CYCLES")
    data_assimilation_script = case.get_value("DATA_ASSIMILATION_SCRIPT")

    caseroot = case.get_value("CASEROOT")
    rundir = case.get_value("RUNDIR")

    # Fresh run identifier for the first cycle
    lid = new_lid()

    save_prerun_provenance(case)

    for cycle_idx in range(data_assimilation_cycles):
        # Every cycle past the first continues from restart files under a
        # new LID.
        if cycle_idx > 0:
            case.set_value("CONTINUE_RUN", "TRUE")
            lid = new_lid()

        if prerun_script:
            # External hooks may edit the case XML, so flush before and
            # re-read after.
            case.flush()
            do_external(prerun_script, caseroot, rundir, lid,
                        prefix="prerun")
            case.read_xml()

        lid = run_model(case, lid, skip_pnl, da_cycle=cycle_idx)

        # Copy log files back to caseroot
        save_logs(case, lid)

        if case.get_value("CHECK_TIMING") or case.get_value("SAVE_TIMING"):
            # Run the getTiming script
            get_timing(case, lid)

        if data_assimilation:
            case.flush()
            do_data_assimilation(data_assimilation_script, caseroot,
                                 cycle_idx, lid, rundir)
            case.read_xml()

    if postrun_script:
        case.flush()
        do_external(postrun_script, caseroot, rundir, lid,
                    prefix="postrun")
        case.read_xml()

    save_postrun_provenance(case)

    logger.warning("check for resubmit")
    resubmit_check(case)

    return True
def _run_model_impl(case, lid, skip_pnl=False, da_cycle=0):
###############################################################################
    """Launch the model via mpirun, restarting once per detected node failure.

    Runs pre-run checks, sets OMP_NUM_THREADS from the maximum component
    thread count, then executes the mpirun command in RUNDIR. On a nonzero
    exit, the model log is scanned for NODE_FAIL_REGEX; if enough spare nodes
    remain, restart files are restored from the archive, namelists are
    regenerated, and the run is retried under a fresh LID. Returns the LID of
    the final attempt; aborts via expect() when the run fails with no retry
    available.
    """
    pre_run_check(case, lid, skip_pnl=skip_pnl, da_cycle=da_cycle)

    model = case.get_value("MODEL")

    # Set OMP_NUM_THREADS to the largest thread count over all components
    env_mach_pes = case.get_env("mach_pes")
    comp_classes = case.get_values("COMP_CLASSES")
    thread_count = env_mach_pes.get_max_thread_count(comp_classes)
    os.environ["OMP_NUM_THREADS"] = str(thread_count)

    # Run the model
    logger.info("{} MODEL EXECUTION BEGINS HERE".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    cmd = case.get_mpirun_cmd(job="case.run")
    cmd = case.get_resolved_value(cmd)
    logger.info("run command is {} ".format(cmd))

    rundir = case.get_value("RUNDIR")
    loop = True

    while loop:
        loop = False
        save_prerun_provenance(case)
        run_func = lambda: run_cmd(cmd, from_dir=rundir)[0]
        stat = run_and_log_case_status(run_func, "model execution",
                                       caseroot=case.get_value("CASEROOT"))
        model_logfile = os.path.join(rundir, model + ".log." + lid)
        # Determine if failure was due to a failed node, if so, try to restart
        if stat != 0:
            node_fail_re = case.get_value("NODE_FAIL_REGEX")
            if node_fail_re:
                node_fail_regex = re.compile(node_fail_re)
                if os.path.exists(model_logfile):
                    # FIX: close the log file handle (original leaked it)
                    with open(model_logfile, 'r') as log_fd:
                        num_fails = len(node_fail_regex.findall(log_fd.read()))
                    if num_fails > 0 and case.spare_nodes >= num_fails:
                        # We failed due to node failure!
                        logger.warning("Detected model run failed due to node failure, restarting")

                        # Archive the last consistent set of restart files and restore them
                        case_st_archive(case, no_resubmit=True)
                        restore_from_archive(case)

                        case.set_value("CONTINUE_RUN",
                                       case.get_value("RESUBMIT_SETS_CONTINUE_RUN"))
                        create_namelists(case)

                        lid = new_lid()
                        loop = True
                        case.spare_nodes -= num_fails
            if not loop:
                # We failed and we're not restarting
                expect(False, "RUN FAIL: Command '{}' failed\nSee log file for details: {}".format(cmd, model_logfile))

    logger.info("{} MODEL EXECUTION HAS FINISHED".format(time.strftime("%Y-%m-%d %H:%M:%S")))

    post_run_check(case, lid)

    return lid