def get_test_suite(suite, machine=None, compiler=None):
###############################################################################
    """
    Return a list of FULL test names for a suite.
    """
    expect(suite in _TEST_SUITES, "Unknown test suite: '%s'" % suite)
    machobj = Machines(machine=machine)
    machine = machobj.get_machine_name()

    if compiler is None:
        compiler = machobj.get_default_compiler()
    expect(machobj.is_valid_compiler(compiler),
           "Compiler %s not valid for machine %s" % (compiler, machine))

    inherits_from, tests_raw = _TEST_SUITES[suite]
    tests = []
    for item in tests_raw:
        test_mod = None
        if isinstance(item, str):
            test_name = item
        else:
            expect(isinstance(item, tuple), "Bad item type for item '%s'" % str(item))
            expect(len(item) in [2, 3], "Expected two or three items in item '%s'" % str(item))
            expect(isinstance(item[0], str), "Expected string in first field of item '%s'" % str(item))
            expect(isinstance(item[1], str), "Expected string in second field of item '%s'" % str(item))

            test_name = item[0]
            if len(item) == 2:
                test_mod = item[1]
            else:
                expect(type(item[2]) in [str, tuple],
                       "Expected string or tuple for third field of item '%s'" % str(item))
                test_mod_machines = [item[2]] if isinstance(item[2], str) else item[2]
                if machine in test_mod_machines:
                    test_mod = item[1]

        tests.append(CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler, testmod=test_mod))

    if inherits_from is not None:
        inherited_tests = get_test_suite(inherits_from, machine, compiler)

        expect(len(set(tests) & set(inherited_tests)) == 0,
               "Tests %s defined in multiple suites" % ", ".join(set(tests) & set(inherited_tests)))

        tests.extend(inherited_tests)

    return tests
def _main_func(options, work_dir):
###############################################################################
    """Construct machines html from an XML file."""
    # Initialize variables for the html template
    mach_dict = dict()
    model_version = options.version[0]

    # get the machine config file
    files = Files()
    config_file = files.get_value("MACHINES_SPEC_FILE")
    expect(os.path.isfile(config_file),
           "Cannot find config_file {} on disk".format(config_file))

    # instantiate a machines object and read XML values into a dictionary
    machines = Machines(config_file, machine="Query")
    mach_list = machines.list_available_machines()

    # get all the machine values loaded into the mach_dict
    mach_dict = machines.return_values()

    # initialize the support keys
    for machine in mach_list:
        mach_dict[(machine, 'support')] = "Unsupported"

    # loop through the list of supported machines and flag in the dictionary
    supported = options.supported[0].split(',')
    for machine in supported:
        mach_dict[(machine, 'support')] = "Scientific"

    # loop through the list of tested machines and flag in the dictionary
    tested = options.tested[0].split(',')
    for machine in tested:
        mach_dict[(machine, 'support')] = "Tested"

    # load up jinja template
    templateLoader = jinja2.FileSystemLoader(searchpath='{0}/templates'.format(work_dir))
    templateEnv = jinja2.Environment(loader=templateLoader)

    # TODO - get the cesm_version for the CIME root
    tmplFile = 'machdef2html.tmpl'
    template = templateEnv.get_template(tmplFile)
    templateVars = {'mach_list': mach_list,
                    'mach_dict': mach_dict,
                    'today': _now,
                    'model_version': model_version}

    # render the template
    mach_tmpl = template.render(templateVars)

    # write the output file
    with open(options.htmlfile[0], 'w') as html:
        html.write(mach_tmpl)

    return 0
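# A minimal usage sketch for _main_func above. The SimpleNamespace stands in
# for the argparse result it expects; all machine names, the model version,
# and the output path are placeholders, and a templates/machdef2html.tmpl
# must exist under work_dir for the jinja2 lookup to succeed.
from types import SimpleNamespace

options = SimpleNamespace(
    version=["cesm2.x"],                 # placeholder model version
    supported=["machineA,machineB"],     # comma-separated, as _main_func expects
    tested=["machineC"],                 # placeholder machine name
    htmlfile=["machines.html"],          # placeholder output path
)
_main_func(options, work_dir=os.getcwd())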
def get_test_suite(suite, machine=None, compiler=None, skip_inherit=False, skip_tests=None):
###############################################################################
    """
    Return a list of FULL test names for a suite.
    """
    expect(suite in get_test_suites(), "Unknown test suite: '{}'".format(suite))
    machobj = Machines(machine=machine)
    machine = machobj.get_machine_name()

    if compiler is None:
        compiler = machobj.get_default_compiler()
    expect(machobj.is_valid_compiler(compiler),
           "Compiler {} not valid for machine {}".format(compiler, machine))

    inherits_from, _, _, tests_raw = get_test_data(suite)
    tests = []
    for item in tests_raw:
        expect(isinstance(item, str), "Bad type of test {}, expected string".format(item))

        test_mods = None
        test_components = item.split(".")
        expect(len(test_components) in [3, 4], "Bad test name {}".format(item))

        if len(test_components) == 4:
            test_name = ".".join(test_components[:-1])
            test_mods = test_components[-1]
        else:
            test_name = item

        if not skip_tests or test_name not in skip_tests:
            tests.append(CIME.utils.get_full_test_name(test_name,
                                                       machine=machine,
                                                       compiler=compiler,
                                                       testmods_string=test_mods))

    if not skip_inherit:
        for inherits in inherits_from:
            inherited_tests = get_test_suite(inherits, machine, compiler)
            for inherited_test in inherited_tests:
                if inherited_test not in tests:
                    tests.append(inherited_test)

    return tests
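# A short, hedged usage sketch for the variant above; the suite name is a
# placeholder, and this assumes a CIME checkout on sys.path so that
# get_test_suite and its helpers (get_test_suites, get_test_data) resolve.
tests = get_test_suite("e3sm_developer",      # placeholder suite name
                       machine=None,          # probe the current host
                       compiler=None,         # machine's default compiler
                       skip_inherit=False,
                       skip_tests=None)
for full_name in tests:
    print(full_name)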
def _case_two_setup(self):
    mach_name = self._case.get_value("MACH")
    mach_obj = Machines(machine=mach_name)
    if mach_obj.is_valid_MPIlib("mpi-serial"):
        self._case.set_value("MPILIB", "mpi-serial")
    else:
        logger.warning("mpi-serial is not supported on machine '{}', "
                       "so we have to fall back to default MPI and "
                       "therefore very little is being tested".format(mach_name))

    if os.path.isfile("Macros"):
        os.remove("Macros")
def _case_two_setup(self):
    mach_name = self._case.get_value("MACH")
    mach_obj = Machines(machine=mach_name)
    if mach_obj.is_valid_MPIlib("mpi-serial"):
        self._case.set_value("MPILIB", "mpi-serial")
    else:
        logger.warning("mpi-serial is not supported on machine '{}', "
                       "so we have to fall back to default MPI and "
                       "therefore very little is being tested".format(mach_name))

    if os.path.isfile("Macros"):
        os.remove("Macros")
    self._case.case_setup(test_mode=True, reset=True)
def _create_caseroot_tools(self):
    machines_dir = os.path.abspath(self.get_value("MACHDIR"))
    toolsdir = os.path.join(self.get_value("CIMEROOT"), "scripts", "Tools")
    # setup executable files in caseroot/
    exefiles = (os.path.join(toolsdir, "case.setup"),
                os.path.join(toolsdir, "case.build"),
                os.path.join(toolsdir, "case.submit"),
                os.path.join(toolsdir, "preview_namelists"),
                os.path.join(toolsdir, "check_input_data"),
                os.path.join(toolsdir, "check_case"),
                os.path.join(toolsdir, "archive_metadata.sh"),
                os.path.join(toolsdir, "xmlchange"),
                os.path.join(toolsdir, "xmlquery"))
    try:
        for exefile in exefiles:
            destfile = os.path.join(self._caseroot, os.path.basename(exefile))
            os.symlink(exefile, destfile)
    except Exception as e:
        logger.warning("FAILED to set up exefiles: %s" % str(e))

    # set up utility files in caseroot/Tools/
    toolfiles = (os.path.join(toolsdir, "check_lockedfiles"),
                 os.path.join(toolsdir, "lt_archive.sh"),
                 os.path.join(toolsdir, "getTiming"),
                 os.path.join(toolsdir, "save_provenance"),
                 os.path.join(machines_dir, "Makefile"),
                 os.path.join(machines_dir, "mkSrcfiles"),
                 os.path.join(machines_dir, "mkDepends"))

    for toolfile in toolfiles:
        destfile = os.path.join(self._caseroot, "Tools", os.path.basename(toolfile))
        expect(os.path.isfile(toolfile), "File %s does not exist" % toolfile)
        try:
            os.symlink(toolfile, destfile)
        except Exception as e:
            logger.warning("FAILED to set up toolfiles: %s %s %s" % (str(e), toolfile, destfile))

    # Create Macros file.
    machine = self.get_value("MACH")
    files = Files()
    # Use config_build if the environment variable is set, or if there is no
    # config_compilers file.
    if os.getenv("CIME_USE_CONFIG_BUILD") == "TRUE" or \
       files.get_value("COMPILERS_SPEC_FILE") is None:
        build_file = files.get_value("BUILD_SPEC_FILE")
        machobj = Machines(machine=machine, files=files)
        macro_maker = Build(machobj)
        macros_path = os.path.join(self._caseroot, "Macros")
        with open(macros_path, "w") as macros_file:
            macro_maker.write_macros('Makefile', build_file, macros_file)

    # Copy any system or compiler Depends files to the case.
    compiler = self.get_value("COMPILER")
    for dep in (machine, compiler):
        dfile = "Depends.%s" % dep
        if os.path.isfile(os.path.join(machines_dir, dfile)):
            shutil.copyfile(os.path.join(machines_dir, dfile), os.path.join(self._caseroot, dfile))
    dfile = "Depends.%s.%s" % (machine, compiler)
    if os.path.isfile(os.path.join(machines_dir, dfile)):
        shutil.copyfile(os.path.join(machines_dir, dfile), os.path.join(self._caseroot, dfile))
def list_machines(self):
    self.MachineList.Machobj = Machines()
    mach_list = self.MachineList.Machobj.list_available_machines()
    self.MachineList.addItems(mach_list)
    name = self.MachineList.Machobj.get_machine_name()
    if name is not None:
        self.MachineList.setCurrentIndex(mach_list.index(name))
        self.MachineSelect(name)
def get_test_suite(suite, machine=None, compiler=None):
###############################################################################
    """
    Return a list of FULL test names for a suite.
    """
    expect(suite in _ALL_TESTS, "Unknown test suite: '{}'".format(suite))
    machobj = Machines(machine=machine)
    machine = machobj.get_machine_name()

    if compiler is None:
        compiler = machobj.get_default_compiler()
    expect(machobj.is_valid_compiler(compiler),
           "Compiler {} not valid for machine {}".format(compiler, machine))

    inherits_from, _, tests_raw = _ALL_TESTS[suite]
    tests = []
    for item in tests_raw:
        test_mod = None
        if isinstance(item, six.string_types):
            test_name = item
        else:
            expect(isinstance(item, tuple), "Bad item type for item '{}'".format(str(item)))
            expect(len(item) in [2, 3], "Expected two or three items in item '{}'".format(str(item)))
            expect(isinstance(item[0], six.string_types), "Expected string in first field of item '{}'".format(str(item)))
            expect(isinstance(item[1], six.string_types), "Expected string in second field of item '{}'".format(str(item)))

            test_name = item[0]
            if len(item) == 2:
                test_mod = item[1]
            else:
                # six.string_types is itself a tuple of types, so build one
                # isinstance check that accepts either a string or a tuple
                # (the original "type(...) in [six.string_types, tuple]" could
                # never match a string).
                expect(isinstance(item[2], (tuple,) + six.string_types),
                       "Expected string or tuple for third field of item '{}'".format(str(item)))
                test_mod_machines = [item[2]] if isinstance(item[2], six.string_types) else item[2]
                if machine in test_mod_machines:
                    test_mod = item[1]

        tests.append(CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler, testmod=test_mod))

    if inherits_from is not None:
        inherits_from = [inherits_from] if isinstance(inherits_from, six.string_types) else inherits_from
        for inherits in inherits_from:
            inherited_tests = get_test_suite(inherits, machine, compiler)

            expect(len(set(tests) & set(inherited_tests)) == 0,
                   "Tests {} defined in multiple suites".format(", ".join(set(tests) & set(inherited_tests))))

            tests.extend(inherited_tests)

    return tests
def __init__(self, machine, compiler, cimeroot, caseroot, mpilib, debug=False):
    self._machine = Machines(machine=machine)
    self._compiler = compiler
    self._cimeroot = cimeroot
    self._caseroot = caseroot
    self._mpilib = mpilib
    self._debug = debug
    self._module_system = self._machine.get_module_system_type()
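# For context, a standalone sketch of the Machines queries this constructor
# relies on; it assumes CIME.XML.machines is importable and that the current
# host is (or probes to) a machine defined in config_machines.xml.
from CIME.XML.machines import Machines

mach = Machines(machine=None)            # None lets CIME probe the current host
print(mach.get_machine_name())
print(mach.get_default_compiler())
print(mach.get_module_system_type())     # e.g. "module" or "none"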
def run_model(case):
###############################################################################
    # Set OMP_NUM_THREADS
    tm = TaskMaker(case)
    num_threads = tm.thread_count
    os.environ["OMP_NUM_THREADS"] = str(num_threads)

    # Run the model
    logger.info("%s MODEL EXECUTION BEGINS HERE" % (time.strftime("%Y-%m-%d %H:%M:%S")))
    machine = Machines(machine=case.get_value("MACH"))
    cmd = machine.get_full_mpirun(tm, case, "case.run")
    cmd = case.get_resolved_value(cmd)

    logger.info("run command is %s " % cmd)
    rundir = case.get_value("RUNDIR")
    run_cmd_no_fail(cmd, from_dir=rundir)
    logger.info("%s MODEL EXECUTION HAS FINISHED" % (time.strftime("%Y-%m-%d %H:%M:%S")))
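# A hedged caller sketch: run_model expects an open Case object, so a driver
# might look like the following. The caseroot path is a placeholder and the
# Case import matches its use elsewhere in these snippets.
from CIME.case import Case

caseroot = "/path/to/caseroot"           # placeholder
with Case(caseroot, read_only=False) as case:
    run_model(case)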
def find_all_supported_platforms():
###############################################################################
    """
    Returns a set of all ACME supported platforms as defined in the
    XML configuration file config_machines.xml in the ACME source
    tree. A platform is defined by a triple (machine name, compiler,
    mpi library).
    """
    machines = CIME.utils.get_machines()
    # Instantiate once, then point the object at each machine in turn; the
    # original instantiated with machine=machine before "machine" was defined.
    machobj = Machines()
    platform_set = set()

    for machine in machines:
        machobj.set_machine(machine)
        compilers, mpilibs = machobj.get_value("COMPILERS"), machobj.get_value("MPILIBS")
        for compiler in compilers:
            for mpilib in mpilibs:
                platform_set.add((machine, compiler, mpilib))

    return list(platform_set)
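# Quick usage sketch; the printed triples depend entirely on the local
# config_machines.xml.
for machine, compiler, mpilib in find_all_supported_platforms():
    print("{:20s} {:12s} {}".format(machine, compiler, mpilib))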
def get_test_suite(suite, machine=None, compiler=None, skip_inherit=False):
###############################################################################
    """
    Return a list of FULL test names for a suite.
    """
    expect(suite in get_test_suites(), "Unknown test suite: '{}'".format(suite))
    machobj = Machines(machine=machine)
    machine = machobj.get_machine_name()

    if compiler is None:
        compiler = machobj.get_default_compiler()
    expect(machobj.is_valid_compiler(compiler),
           "Compiler {} not valid for machine {}".format(compiler, machine))

    inherits_from, _, _, tests_raw = get_test_data(suite)
    tests = []
    for item in tests_raw:
        expect(isinstance(item, six.string_types), "Bad type of test {}, expected string".format(item))

        test_mod = None
        test_components = item.split(".")
        expect(len(test_components) in [3, 4], "Bad test name {}".format(item))

        if len(test_components) == 4:
            test_name = ".".join(test_components[:-1])
            test_mod = test_components[-1]
        else:
            test_name = item

        tests.append(CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler, testmod=test_mod))

    if not skip_inherit:
        for inherits in inherits_from:
            inherited_tests = get_test_suite(inherits, machine, compiler)
            for inherited_test in inherited_tests:
                if inherited_test not in tests:
                    tests.append(inherited_test)

    return tests
class TestScheduler(object):
###############################################################################

    ###########################################################################
    def __init__(self, test_names, test_data=None,
                 no_run=False, no_build=False, no_setup=False, no_batch=None,
                 test_root=None, test_id=None,
                 machine_name=None, compiler=None,
                 baseline_root=None, baseline_cmp_name=None, baseline_gen_name=None,
                 clean=False, namelists_only=False,
                 project=None, parallel_jobs=None,
                 walltime=None, proc_pool=None,
                 use_existing=False, save_timing=False, queue=None,
                 allow_baseline_overwrite=False, output_root=None,
                 force_procs=None, force_threads=None, mpilib=None,
                 input_dir=None, pesfile=None, mail_user=None, mail_type=None):
    ###########################################################################
        self._cime_root       = CIME.utils.get_cime_root()
        self._cime_model      = get_model()
        self._save_timing     = save_timing
        self._queue           = queue
        self._test_data       = {} if test_data is None else test_data # Format: {test_name -> {data_name -> data}}
        self._mpilib          = mpilib  # allow override of default mpilib
        self._completed_tests = 0
        self._input_dir       = input_dir
        self._pesfile         = pesfile
        self._allow_baseline_overwrite = allow_baseline_overwrite
        self._mail_user       = mail_user
        self._mail_type       = mail_type

        self._machobj = Machines(machine=machine_name)

        self._model_build_cost = 4

        # If user is forcing procs or threads, re-write test names to reflect this.
        if force_procs or force_threads:
            test_names = _translate_test_names_for_new_pecount(test_names, force_procs, force_threads)

        self._no_setup = no_setup
        self._no_build = no_build or no_setup or namelists_only
        self._no_run   = no_run or self._no_build
        self._output_root = output_root

        # Figure out what project to use
        if project is None:
            self._project = CIME.utils.get_project()
            if self._project is None:
                self._project = self._machobj.get_value("PROJECT")
        else:
            self._project = project

        # We will not use batch system if user asked for no_batch or if current
        # machine is not a batch machine
        self._no_batch = no_batch or not self._machobj.has_batch_system()
        expect(not (self._no_batch and self._queue is not None),
               "Does not make sense to request a queue without batch system")

        # Determine and resolve test_root
        if test_root is not None:
            self._test_root = test_root
        elif self._output_root is not None:
            self._test_root = self._output_root
        else:
            self._test_root = self._machobj.get_value("CIME_OUTPUT_ROOT")

        if self._project is not None:
            self._test_root = self._test_root.replace("$PROJECT", self._project)

        self._test_root = os.path.abspath(self._test_root)
        self._test_id   = test_id if test_id is not None else CIME.utils.get_timestamp()

        self._compiler = self._machobj.get_default_compiler() if compiler is None else compiler

        self._clean          = clean
        self._namelists_only = namelists_only

        self._walltime = walltime

        if parallel_jobs is None:
            self._parallel_jobs = min(len(test_names), self._machobj.get_value("MAX_MPITASKS_PER_NODE"))
        else:
            self._parallel_jobs = parallel_jobs

        self._baseline_cmp_name = baseline_cmp_name # Implies comparison should be done if not None
        self._baseline_gen_name = baseline_gen_name # Implies generation should be done if not None

        # Compute baseline_root
        self._baseline_root = baseline_root if baseline_root is not None \
                              else self._machobj.get_value("BASELINE_ROOT")
        if self._project is not None:
            self._baseline_root = self._baseline_root.replace("$PROJECT", self._project)
        self._baseline_root = os.path.abspath(self._baseline_root)

        if baseline_cmp_name or baseline_gen_name:
            if self._baseline_cmp_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
                expect(os.path.isdir(full_baseline_dir),
                       "Missing baseline comparison directory {}".format(full_baseline_dir))

            # the following is to assure that the existing generate directory is not overwritten
            if self._baseline_gen_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name)
                existing_baselines = []
                for test_name in test_names:
                    test_baseline = os.path.join(full_baseline_dir, test_name)
                    if os.path.isdir(test_baseline):
                        existing_baselines.append(test_baseline)

                expect(allow_baseline_overwrite or len(existing_baselines) == 0,
                       "Baseline directories already exists {}\n"
                       "Use -o to avoid this error".format(existing_baselines))

        if self._cime_model == "e3sm":
            _order_tests_by_runtime(test_names, self._baseline_root)

        # This is the only data that multiple threads will simultaneously access.
        # Each test has its own value, and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without mutex.
        # name -> (phase, status)
        self._tests = OrderedDict()
        for test_name in test_names:
            self._tests[test_name] = (TEST_START, TEST_PASS_STATUS)

        # Oversubscribe by 1/4
        if proc_pool is None:
            pes = int(self._machobj.get_value("MAX_TASKS_PER_NODE"))
            self._proc_pool = int(pes * 1.25)
        else:
            self._proc_pool = int(proc_pool)

        self._procs_avail = self._proc_pool

        # Setup phases
        self._phases = list(PHASES)
        if self._no_setup:
            self._phases.remove(SETUP_PHASE)
        if self._no_build:
            self._phases.remove(SHAREDLIB_BUILD_PHASE)
            self._phases.remove(MODEL_BUILD_PHASE)
        if self._no_run:
            self._phases.remove(RUN_PHASE)

        if use_existing:
            for test in self._tests:
                with TestStatus(self._get_test_dir(test)) as ts:
                    for phase, status in ts:
                        if phase in CORE_PHASES:
                            if status in [TEST_PEND_STATUS, TEST_FAIL_STATUS]:
                                if status == TEST_FAIL_STATUS:
                                    # Import for potential subsequent waits
                                    ts.set_status(phase, TEST_PEND_STATUS)

                                # We need to pick up here
                                break

                            else:
                                if phase != SUBMIT_PHASE:
                                    # Somewhat subtle. Create_test considers submit/run to be the run phase,
                                    # so don't try to update test status for a passed submit phase
                                    self._update_test_status(test, phase, TEST_PEND_STATUS)
                                    self._update_test_status(test, phase, status)

                                    if phase == RUN_PHASE:
                                        logger.info("Test {} passed and will not be re-run".format(test))

                logger.info("Using existing test directory {}".format(self._get_test_dir(test)))
        else:
            # None of the test directories should already exist.
            for test in self._tests:
                expect(not os.path.exists(self._get_test_dir(test)),
                       "Cannot create new case in directory '{}', it already exists."
                       " Pick a different test-id".format(self._get_test_dir(test)))
                logger.info("Creating test directory {}".format(self._get_test_dir(test)))

        # By the end of this constructor, this program should never hard abort,
        # instead, errors will be placed in the TestStatus files for the various
        # tests cases

    ###########################################################################
    def get_testnames(self):
    ###########################################################################
        return list(self._tests.keys())

    ###########################################################################
    def _log_output(self, test, output):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        if not os.path.isdir(test_dir):
            # Note: making this directory could cause create_newcase to fail
            # if this is run before.
            os.makedirs(test_dir)
        append_testlog(output, caseroot=test_dir)

    ###########################################################################
    def _get_case_id(self, test):
    ###########################################################################
        baseline_action_code = ""
        if self._baseline_gen_name:
            baseline_action_code += "G"
        if self._baseline_cmp_name:
            baseline_action_code += "C"
        if len(baseline_action_code) > 0:
            return "{}.{}.{}".format(test, baseline_action_code, self._test_id)
        else:
            return "{}.{}".format(test, self._test_id)

    ###########################################################################
    def _get_test_dir(self, test):
    ###########################################################################
        return os.path.join(self._test_root, self._get_case_id(test))

    ###########################################################################
    def _get_test_data(self, test):
    ###########################################################################
        # Must be atomic
        return self._tests[test]

    ###########################################################################
    def _is_broken(self, test):
    ###########################################################################
        status = self._get_test_status(test)
        return status != TEST_PASS_STATUS and status != TEST_PEND_STATUS

    ###########################################################################
    def _work_remains(self, test):
    ###########################################################################
        test_phase, test_status = self._get_test_data(test)
        return (test_status == TEST_PASS_STATUS or test_status == TEST_PEND_STATUS) and \
            test_phase != self._phases[-1]

    ###########################################################################
    def _get_test_status(self, test, phase=None):
    ###########################################################################
        curr_phase, curr_status = self._get_test_data(test)
        if phase is None or phase == curr_phase:
            return curr_status
        else:
            expect(phase is None or self._phases.index(phase) < self._phases.index(curr_phase),
                   "Tried to see the future")
            # Assume all older phases PASSed
            return TEST_PASS_STATUS

    ###########################################################################
    def _get_test_phase(self, test):
    ###########################################################################
        return self._get_test_data(test)[0]

    ###########################################################################
    def _update_test_status(self, test, phase, status):
    ###########################################################################
        phase_idx = self._phases.index(phase)
        old_phase, old_status = self._get_test_data(test)

        if old_phase == phase:
            expect(old_status == TEST_PEND_STATUS,
                   "Only valid to transition from PEND to something else, found '{}' for phase '{}'".format(old_status, phase))
            expect(status != TEST_PEND_STATUS,
                   "Cannot transition from PEND -> PEND")
        else:
            expect(old_status == TEST_PASS_STATUS,
                   "Why did we move on to next phase when prior phase did not pass?")
            expect(status == TEST_PEND_STATUS,
                   "New phase should be set to pending status")
            expect(self._phases.index(old_phase) == phase_idx - 1,
                   "Skipped phase? {} {}".format(old_phase, phase_idx))

        # Must be atomic
        self._tests[test] = (phase, status)

    ###########################################################################
    def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None):
    ###########################################################################
        while True:
            rc, output, errput = run_cmd(cmd, from_dir=from_dir)
            if rc != 0:
                self._log_output(test,
                                 "{} FAILED for test '{}'.\nCommand: {}\nOutput: {}\n".
                                 format(phase, test, cmd, output.encode('utf-8') + b"\n" + errput.encode('utf-8')))
                # Temporary hack to get around odd file descriptor use by
                # buildnml scripts.
                if "bad interpreter" in output:
                    time.sleep(1)
                    continue
                else:
                    return False, errput
            else:
                # We don't want "RUN PASSED" in the TestStatus.log if the only thing that
                # succeeded was the submission.
                phase = "SUBMIT" if phase == RUN_PHASE else phase
                self._log_output(test,
                                 "{} PASSED for test '{}'.\nCommand: {}\nOutput: {}\n".
                                 format(phase, test, cmd, output.encode('utf-8') + b"\n" + errput.encode('utf-8')))
                return True, errput

    ###########################################################################
    def _create_newcase_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)

        _, case_opts, grid, compset, \
            machine, compiler, test_mods = CIME.utils.parse_test_name(test)

        create_newcase_cmd = "{} --case {} --res {} --compset {}" \
                             " --test".format(os.path.join(self._cime_root, "scripts", "create_newcase"),
                                              test_dir, grid, compset)
        if machine is not None:
            create_newcase_cmd += " --machine {}".format(machine)
        if compiler is not None:
            create_newcase_cmd += " --compiler {}".format(compiler)
        if self._project is not None:
            create_newcase_cmd += " --project {} ".format(self._project)
        if self._output_root is not None:
            create_newcase_cmd += " --output-root {} ".format(self._output_root)
        if self._input_dir is not None:
            create_newcase_cmd += " --input-dir {} ".format(self._input_dir)
        if self._pesfile is not None:
            create_newcase_cmd += " --pesfile {} ".format(self._pesfile)

        if test_mods is not None:
            files = Files()
            (component, modspath) = test_mods.split('/', 1)
            testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component})

            test_mod_file = os.path.join(testmods_dir, component, modspath)
            if not os.path.exists(test_mod_file):
                error = "Missing testmod file '{}'".format(test_mod_file)
                self._log_output(test, error)
                return False, error

            create_newcase_cmd += " --user-mods-dir {}".format(test_mod_file)

        mpilib = None
        ninst = 1
        ncpl = 1
        if case_opts is not None:
            for case_opt in case_opts: # pylint: disable=not-an-iterable
                if case_opt.startswith('M'):
                    mpilib = case_opt[1:]
                    create_newcase_cmd += " --mpilib {}".format(mpilib)
                    logger.debug(" MPILIB set to {}".format(mpilib))
                elif case_opt.startswith('N'):
                    expect(ncpl == 1, "Cannot combine _C and _N options")
                    ninst = case_opt[1:]
                    create_newcase_cmd += " --ninst {}".format(ninst)
                    logger.debug(" NINST set to {}".format(ninst))
                elif case_opt.startswith('C'):
                    expect(ninst == 1, "Cannot combine _C and _N options")
                    ncpl = case_opt[1:]
                    create_newcase_cmd += " --ninst {} --multi-driver".format(ncpl)
                    logger.debug(" NCPL set to {}".format(ncpl))
                elif case_opt.startswith('P'):
                    pesize = case_opt[1:]
                    create_newcase_cmd += " --pecount {}".format(pesize)
                elif case_opt.startswith('V'):
                    driver = case_opt[1:]
                    create_newcase_cmd += " --driver {}".format(driver)

        # create_test mpilib option overrides default but not explicitly set case_opt mpilib
        if mpilib is None and self._mpilib is not None:
            create_newcase_cmd += " --mpilib {}".format(self._mpilib)
            logger.debug(" MPILIB set to {}".format(self._mpilib))

        if self._queue is not None:
            create_newcase_cmd += " --queue={}".format(self._queue)

        if self._walltime is not None:
            create_newcase_cmd += " --walltime {}".format(self._walltime)
        else:
            # model specific ways of setting time
            if self._cime_model == "e3sm":
                recommended_time = _get_time_est(test, self._baseline_root)

                if recommended_time is not None:
                    create_newcase_cmd += " --walltime {}".format(recommended_time)
            else:
                if test in self._test_data and "options" in self._test_data[test] and \
                   "wallclock" in self._test_data[test]['options']:
                    create_newcase_cmd += " --walltime {}".format(self._test_data[test]['options']['wallclock'])

        logger.debug("Calling create_newcase: " + create_newcase_cmd)
        return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE)

    ###########################################################################
    def _xml_phase(self, test):
    ###########################################################################
        test_case = CIME.utils.parse_test_name(test)[0]

        # Create, fill and write an envtest object
        test_dir = self._get_test_dir(test)
        envtest = EnvTest(test_dir)

        # Determine list of component classes that this coupler/driver knows how
        # to deal with. This list follows the same order as compset longnames follow.
        files = Files()
        drv_config_file = files.get_value("CONFIG_CPL_FILE")
        drv_comp = Component(drv_config_file, "CPL")
        envtest.add_elements_by_group(files, {}, "env_test.xml")
        envtest.add_elements_by_group(drv_comp, {}, "env_test.xml")
        envtest.set_value("TESTCASE", test_case)
        envtest.set_value("TEST_TESTID", self._test_id)
        envtest.set_value("CASEBASEID", test)
        if test in self._test_data and "options" in self._test_data[test] and \
           "memleak_tolerance" in self._test_data[test]['options']:
            envtest.set_value("TEST_MEMLEAK_TOLERANCE", self._test_data[test]['options']['memleak_tolerance'])

        test_argv = "-testname {} -testroot {}".format(test, self._test_root)
        if self._baseline_gen_name:
            test_argv += " -generate {}".format(self._baseline_gen_name)
            basegen_case_fullpath = os.path.join(self._baseline_root, self._baseline_gen_name, test)
            logger.debug("basegen_case is {}".format(basegen_case_fullpath))
            envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name)
            envtest.set_value("BASEGEN_CASE", os.path.join(self._baseline_gen_name, test))
        if self._baseline_cmp_name:
            test_argv += " -compare {}".format(self._baseline_cmp_name)
            envtest.set_value("BASELINE_NAME_CMP", self._baseline_cmp_name)
            envtest.set_value("BASECMP_CASE", os.path.join(self._baseline_cmp_name, test))

        envtest.set_value("TEST_ARGV", test_argv)
        envtest.set_value("CLEANUP", self._clean)

        envtest.set_value("BASELINE_ROOT", self._baseline_root)
        envtest.set_value("GENERATE_BASELINE", self._baseline_gen_name is not None)
        envtest.set_value("COMPARE_BASELINE", self._baseline_cmp_name is not None)
        envtest.set_value("CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC", resolved=False))
        tput_tolerance = self._machobj.get_value("TEST_TPUT_TOLERANCE", resolved=False)
        envtest.set_value("TEST_TPUT_TOLERANCE", 0.25 if tput_tolerance is None else tput_tolerance)

        # Add the test instructions from config_test to env_test in the case
        config_test = Tests()
        testnode = config_test.get_test_node(test_case)
        envtest.add_test(testnode)

        # Determine the test_case from the test name
        test_case, case_opts = CIME.utils.parse_test_name(test)[:2]

        # Determine case_opts from the test_case
        if case_opts is not None:
            logger.debug("case_opts are {} ".format(case_opts))
            for opt in case_opts: # pylint: disable=not-an-iterable

                logger.debug("case_opt is {}".format(opt))
                if opt == 'D':
                    envtest.set_test_parameter("DEBUG", "TRUE")
                    logger.debug(" DEBUG set to TRUE")

                elif opt == 'E':
                    envtest.set_test_parameter("USE_ESMF_LIB", "TRUE")
                    logger.debug(" USE_ESMF_LIB set to TRUE")

                elif opt == 'CG':
                    envtest.set_test_parameter("CALENDAR", "GREGORIAN")
                    logger.debug(" CALENDAR set to {}".format(opt))

                elif opt.startswith('L'):
                    match = re.match('L([A-Za-z])([0-9]*)', opt)
                    stop_option = {"y": "nyears", "m": "nmonths", "d": "ndays",
                                   "h": "nhours", "s": "nseconds", "n": "nsteps"}
                    opt = match.group(1)
                    envtest.set_test_parameter("STOP_OPTION", stop_option[opt])
                    opti = match.group(2)
                    envtest.set_test_parameter("STOP_N", opti)

                    logger.debug(" STOP_OPTION set to {}".format(stop_option[opt]))
                    logger.debug(" STOP_N set to {}".format(opti))

                elif opt.startswith('R'):
                    # R option is for testing in PTS_MODE or Single Column Model (SCM) mode
                    envtest.set_test_parameter("PTS_MODE", "TRUE")

                    # For PTS_MODE, compile with mpi-serial
                    envtest.set_test_parameter("MPILIB", "mpi-serial")

                elif (opt.startswith('I') or  # Marker to distinguish tests with same name - ignored
                      opt.startswith('M') or  # handled in create_newcase
                      opt.startswith('P') or  # handled in create_newcase
                      opt.startswith('N') or  # handled in create_newcase
                      opt.startswith('C') or  # handled in create_newcase
                      opt.startswith('V')):   # handled in create_newcase
                    pass

                elif opt.startswith('IOP'):
                    logger.warning("IOP test option not yet implemented")
                else:
                    expect(False, "Could not parse option '{}' ".format(opt))

        envtest.write()

        lock_file("env_run.xml", caseroot=test_dir, newname="env_run.orig.xml")

        with Case(test_dir, read_only=False) as case:
            if self._output_root is None:
                self._output_root = case.get_value("CIME_OUTPUT_ROOT")
            # if we are running a single test we don't need sharedlibroot
            if len(self._tests) > 1 and self._cime_model != "e3sm":
                case.set_value("SHAREDLIBROOT",
                               os.path.join(self._output_root, "sharedlibroot.{}".format(self._test_id)))
            envtest.set_initial_values(case)
            case.set_value("TEST", True)
            case.set_value("SAVE_TIMING", self._save_timing)

            # Scale back build parallelism on systems with few cores
            if self._model_build_cost > self._proc_pool:
                case.set_value("GMAKE_J", self._proc_pool)
                self._model_build_cost = self._proc_pool

        return True, ""

    ###########################################################################
    def _setup_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        rv = self._shell_cmd_for_phase(test, "./case.setup", SETUP_PHASE, from_dir=test_dir)

        # It's OK for this command to fail with baseline diffs but not catastrophically
        if rv[0]:
            cmdstat, output, _ = run_cmd("./case.cmpgen_namelists", combine_output=True, from_dir=test_dir)
            expect(cmdstat in [0, TESTS_FAILED_ERR_CODE],
                   "Fatal error in case.cmpgen_namelists: {}".format(output))

        return rv

    ###########################################################################
    def _sharedlib_build_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir)

    ###########################################################################
    def _model_build_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.build --model-only", MODEL_BUILD_PHASE, from_dir=test_dir)

    ###########################################################################
    def _run_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        cmd = "./case.submit --skip-preview-namelist"
        if self._no_batch:
            cmd += " --no-batch"
        if self._mail_user:
            cmd += " --mail-user={}".format(self._mail_user)
        if self._mail_type:
            cmd += " -M={}".format(",".join(self._mail_type))

        return self._shell_cmd_for_phase(test, cmd, RUN_PHASE, from_dir=test_dir)

    ###########################################################################
    def _run_catch_exceptions(self, test, phase, run):
    ###########################################################################
        try:
            return run(test)
        except (SystemExit, Exception) as e:
            exc_tb = sys.exc_info()[2]
            errput = "Test '{}' failed in phase '{}' with exception '{}'\n".format(test, phase, str(e))
            errput += ''.join(traceback.format_tb(exc_tb))
            self._log_output(test, errput)
            return False, errput

    ###########################################################################
    def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False):
    ###########################################################################
        if phase == RUN_PHASE and (self._no_batch or no_batch):
            test_dir = self._get_test_dir(test)
            total_pes = int(run_cmd_no_fail("./xmlquery TOTALPES --value", from_dir=test_dir))
            threads = eval(run_cmd_no_fail("./xmlquery NTHRDS --value", from_dir=test_dir))
            max_threads = 0
            for item in threads:
                _, comp_threads = item.split(":")
                comp_threads = int(comp_threads)
                if comp_threads > max_threads:
                    max_threads = comp_threads

            max_cores = total_pes * max_threads
            return max_cores

        elif phase == SHAREDLIB_BUILD_PHASE:
            if self._cime_model == "cesm":
                # Will force serialization of sharedlib builds
                # TODO - instead of serializing, compute all library configs needed and build
                # them all in parallel
                for _, _, running_phase in threads_in_flight.values():
                    if running_phase == SHAREDLIB_BUILD_PHASE:
                        return self._proc_pool + 1

            return 1
        elif phase == MODEL_BUILD_PHASE:
            # Model builds now happen in parallel
            return self._model_build_cost
        else:
            return 1

    ###########################################################################
    def _wait_for_something_to_finish(self, threads_in_flight):
    ###########################################################################
        expect(len(threads_in_flight) <= self._parallel_jobs, "Oversubscribed?")
        finished_tests = []
        while not finished_tests:
            for test, thread_info in threads_in_flight.items():
                if not thread_info[0].is_alive():
                    finished_tests.append((test, thread_info[1]))

            if not finished_tests:
                time.sleep(0.2)

        for finished_test, procs_needed in finished_tests:
            self._procs_avail += procs_needed
            del threads_in_flight[finished_test]

    ###########################################################################
    def _update_test_status_file(self, test, test_phase, status):
    ###########################################################################
        """
        In general, test_scheduler should not be responsible for updating
        the TestStatus file, but there are a few cases where it has to.
        """
        test_dir = self._get_test_dir(test)
        with TestStatus(test_dir=test_dir, test_name=test) as ts:
            ts.set_status(test_phase, status)

    ###########################################################################
    def _consumer(self, test, test_phase, phase_method):
    ###########################################################################
        before_time = time.time()
        success, errors = self._run_catch_exceptions(test, test_phase, phase_method)
        elapsed_time = time.time() - before_time
        status = (TEST_PEND_STATUS if test_phase == RUN_PHASE and not self._no_batch
                  else TEST_PASS_STATUS) if success else TEST_FAIL_STATUS

        if status != TEST_PEND_STATUS:
            self._update_test_status(test, test_phase, status)

        if not self._work_remains(test):
            self._completed_tests += 1
            total = len(self._tests)
            status_str = "Finished {} for test {} in {:f} seconds ({}). [COMPLETED {:d} of {:d}]".format(
                test_phase, test, elapsed_time, status, self._completed_tests, total)
        else:
            status_str = "Finished {} for test {} in {:f} seconds ({})".format(test_phase, test, elapsed_time, status)

        if not success:
            status_str += "\n    Case dir: {}\n".format(self._get_test_dir(test))
            status_str += "    Errors were:\n        {}\n".format("\n        ".join(errors.splitlines()))

        logger.info(status_str)

        if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE]:
            # These are the phases for which TestScheduler is responsible for
            # updating the TestStatus file
            self._update_test_status_file(test, test_phase, status)

        if test_phase == XML_PHASE:
            append_status("Case Created using: " + " ".join(sys.argv), "README.case", caseroot=self._get_test_dir(test))

        # On batch systems, we want to immediately submit to the queue, because
        # it's very cheap to submit and will get us a better spot in line
        if success and not self._no_run and not self._no_batch and test_phase == MODEL_BUILD_PHASE:
            logger.info("Starting {} for test {} with 1 proc on interactive node and {:d} procs on compute nodes".format(
                RUN_PHASE, test, self._get_procs_needed(test, RUN_PHASE, no_batch=True)))
            self._update_test_status(test, RUN_PHASE, TEST_PEND_STATUS)
            self._consumer(test, RUN_PHASE, self._run_phase)

    ###########################################################################
    def _producer(self):
    ###########################################################################
        threads_in_flight = {} # test-name -> (thread, procs, phase)
        while True:
            work_to_do = False
            num_threads_launched_this_iteration = 0
            for test in self._tests:
                logger.debug("test_name: " + test)
                if self._work_remains(test):
                    work_to_do = True

                    # If we have no workers available, immediately break out of loop so we can wait
                    if len(threads_in_flight) == self._parallel_jobs:
                        break

                    if test not in threads_in_flight:
                        test_phase, test_status = self._get_test_data(test)
                        expect(test_status != TEST_PEND_STATUS, test)
                        next_phase = self._phases[self._phases.index(test_phase) + 1]
                        procs_needed = self._get_procs_needed(test, next_phase, threads_in_flight)

                        if procs_needed <= self._procs_avail:
                            self._procs_avail -= procs_needed

                            # Necessary to print this way when multiple threads printing
                            logger.info("Starting {} for test {} with {:d} procs".format(next_phase, test, procs_needed))

                            self._update_test_status(test, next_phase, TEST_PEND_STATUS)
                            new_thread = threading.Thread(target=self._consumer,
                                                          args=(test, next_phase,
                                                                getattr(self, "_{}_phase".format(next_phase.lower()))))
                            threads_in_flight[test] = (new_thread, procs_needed, next_phase)
                            new_thread.start()
                            num_threads_launched_this_iteration += 1

                            logger.debug("  Current workload:")
                            total_procs = 0
                            for the_test, the_data in six.iteritems(threads_in_flight):
                                logger.debug("    {}: {} -> {}".format(the_test, the_data[2], the_data[1]))
                                total_procs += the_data[1]

                            logger.debug("  Total procs in use: {}".format(total_procs))
                        else:
                            if not threads_in_flight:
                                msg = "Phase '{}' for test '{}' required more processors, {:d}, than this machine can provide, {:d}".format(
                                    next_phase, test, procs_needed, self._procs_avail)
                                logger.warning(msg)
                                self._update_test_status(test, next_phase, TEST_PEND_STATUS)
                                self._update_test_status(test, next_phase, TEST_FAIL_STATUS)
                                self._log_output(test, msg)
                                if next_phase == RUN_PHASE:
                                    self._update_test_status_file(test, SUBMIT_PHASE, TEST_PASS_STATUS)
                                    self._update_test_status_file(test, next_phase, TEST_FAIL_STATUS)
                                else:
                                    self._update_test_status_file(test, next_phase, TEST_FAIL_STATUS)

                                num_threads_launched_this_iteration += 1

            if not work_to_do:
                break

            if num_threads_launched_this_iteration == 0:
                # No free resources, wait for something in flight to finish
                self._wait_for_something_to_finish(threads_in_flight)

        for unfinished_thread, _, _ in threads_in_flight.values():
            unfinished_thread.join()

    ###########################################################################
    def _setup_cs_files(self):
    ###########################################################################
        try:
            python_libs_root = CIME.utils.get_python_libs_root()

            template_file = os.path.join(python_libs_root, "cs.status.template")
            template = open(template_file, "r").read()
            template = template.replace("<PATH>", os.path.join(self._cime_root, "scripts", "Tools")). \
                replace("<TESTID>", self._test_id). \
                replace("<TESTROOT>", self._test_root)

            if not os.path.exists(self._test_root):
                os.makedirs(self._test_root)

            cs_status_file = os.path.join(self._test_root, "cs.status.{}".format(self._test_id))
            with open(cs_status_file, "w") as fd:
                fd.write(template)
            os.chmod(cs_status_file, os.stat(cs_status_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            template_file = os.path.join(python_libs_root, "cs.submit.template")
            template = open(template_file, "r").read()
            setup_cmd = "./case.setup" if self._no_setup else ":"
            build_cmd = "./case.build" if self._no_build else ":"
            test_cmd = "./case.submit"
            template = template.replace("<SETUP_CMD>", setup_cmd). \
                replace("<BUILD_CMD>", build_cmd). \
                replace("<RUN_CMD>", test_cmd). \
                replace("<TESTID>", self._test_id)

            if self._no_run:
                cs_submit_file = os.path.join(self._test_root, "cs.submit.{}".format(self._test_id))
                with open(cs_submit_file, "w") as fd:
                    fd.write(template)
                os.chmod(cs_submit_file, os.stat(cs_submit_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            if self._cime_model == "cesm":
                template_file = os.path.join(python_libs_root, "testreporter.template")
                template = open(template_file, "r").read()
                template = template.replace("<PATH>", os.path.join(self._cime_root, "scripts", "Tools"))
                testreporter_file = os.path.join(self._test_root, "testreporter")
                with open(testreporter_file, "w") as fd:
                    fd.write(template)
                os.chmod(testreporter_file, os.stat(testreporter_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

        except Exception as e:
            logger.warning("FAILED to set up cs files: {}".format(str(e)))

    ###########################################################################
    def run_tests(self, wait=False,
                  wait_check_throughput=False,
                  wait_check_memory=False,
                  wait_ignore_namelists=False,
                  wait_ignore_memleak=False):
    ###########################################################################
        """
        Main API for this class.

        Return True if all tests passed.
        """
        start_time = time.time()

        # Tell user what will be run
        logger.info("RUNNING TESTS:")
        for test in self._tests:
            logger.info("  {}".format(test))

        # Setup cs files
        self._setup_cs_files()

        GenericXML.DISABLE_CACHING = True
        self._producer()
        GenericXML.DISABLE_CACHING = False

        expect(threading.active_count() == 1, "Leftover threads?")

        wait_handles_report = False
        if not self._no_run and not self._no_batch:
            if wait:
                logger.info("Waiting for tests to finish")
                rv = wait_for_tests(glob.glob(os.path.join(self._test_root, "*{}/TestStatus".format(self._test_id))),
                                    check_throughput=wait_check_throughput,
                                    check_memory=wait_check_memory,
                                    ignore_namelists=wait_ignore_namelists,
                                    ignore_memleak=wait_ignore_memleak)
                wait_handles_report = True
            else:
                logger.info("Due to presence of batch system, create_test will exit before tests are complete.\n"
                            "To force create_test to wait for full completion, use --wait")

        # Return True if all tests passed from our point of view
        if not wait_handles_report:
            logger.info("At test-scheduler close, state is:")
            rv = True
            for test in self._tests:
                phase, status = self._get_test_data(test)

                # Give highest priority to fails in test scheduler
                if status not in [TEST_PASS_STATUS, TEST_PEND_STATUS]:
                    logger.info("{} {} (phase {})".format(status, test, phase))
                    rv = False

                else:
                    # Be cautious about telling the user that the test passed. This
                    # status should match what they would see on the dashboard. Our
                    # self._test_states does not include comparison fail information,
                    # so we need to parse test status.
                    ts = TestStatus(self._get_test_dir(test))
                    nlfail = ts.get_status(NAMELIST_PHASE) == TEST_FAIL_STATUS
                    ts_status = ts.get_overall_test_status(ignore_namelists=True, check_memory=False, check_throughput=False)

                    if ts_status not in [TEST_PASS_STATUS, TEST_PEND_STATUS]:
                        logger.info("{} {} (phase {})".format(ts_status, test, phase))
                        rv = False
                    elif nlfail:
                        logger.info("{} {} (but otherwise OK) {}".format(NAMELIST_FAIL_STATUS, test, phase))
                        rv = False
                    else:
                        logger.info("{} {} {}".format(status, test, phase))

                logger.info("    Case dir: {}".format(self._get_test_dir(test)))

        logger.info("test-scheduler took {} seconds".format(time.time() - start_time))

        return rv
def _main():
    output, build_dir, build_optimized, clean, \
        cmake_args, compiler, enable_genf90, machine, machines_dir, \
        make_j, use_mpi, mpilib, mpirun_command, test_spec_dir, ctest_args, \
        use_openmp, xml_test_list, verbose \
        = parse_command_line(sys.argv)

    #=================================================
    # Find directory and file paths.
    #=================================================
    suite_specs = []
    # TODO: this violates cime policy of direct access to xml
    # should be moved to CIME/XML
    if xml_test_list is not None:
        test_xml_tree = ElementTree()
        test_xml_tree.parse(xml_test_list)
        known_paths = {
            "here": os.path.abspath(os.path.dirname(xml_test_list)),
        }
        suite_specs.extend(suites_from_xml(test_xml_tree, known_paths))
    if test_spec_dir is not None:
        suite_specs.append(
            TestSuiteSpec("__command_line_test__",
                          ["__command_line_test__"],
                          [os.path.abspath(test_spec_dir)]))

    if machines_dir is not None:
        machines_file = os.path.join(machines_dir, "config_machines.xml")
        machobj = Machines(infile=machines_file, machine=machine)
    else:
        machobj = Machines(machine=machine)

    # Create build directory if necessary.
    build_dir = os.path.abspath(build_dir)
    if not os.path.isdir(build_dir):
        os.mkdir(build_dir)

    # Switch to the build directory.
    os.chdir(build_dir)
    if clean:
        pwd_contents = os.listdir(os.getcwd())
        # Clear CMake cache.
        for file_ in pwd_contents:
            if file_ in ("Macros.cmake", "env_mach_specific.xml") \
                    or file_.startswith('Depends') or file_.startswith(".env_mach_specific"):
                os.remove(file_)

    #=================================================
    # Functions to perform various stages of build.
    #=================================================
    if not use_mpi:
        mpilib = "mpi-serial"
    elif mpilib is None:
        mpilib = machobj.get_default_MPIlib()
        logger.info("Using mpilib: {}".format(mpilib))

    if compiler is None:
        compiler = machobj.get_default_compiler()
        logger.info("Compiler is {}".format(compiler))

    compilerobj = Compilers(machobj, compiler=compiler, mpilib=mpilib)

    pfunit_path = find_pfunit(compilerobj, mpilib=mpilib, use_openmp=use_openmp)

    debug = not build_optimized
    os_ = machobj.get_value("OS")

    # Create the environment, and the Macros.cmake file
    #
    #
    configure(machobj, build_dir, ["CMake"], compiler, mpilib, debug, os_,
              unit_testing=True)
    machspecific = EnvMachSpecific(build_dir, unit_testing=True)

    fake_case = FakeCase(compiler, mpilib, debug)
    machspecific.load_env(fake_case)
    os.environ["OS"] = os_
    os.environ["COMPILER"] = compiler
    os.environ["DEBUG"] = stringify_bool(debug)
    os.environ["MPILIB"] = mpilib
    if use_openmp:
        os.environ["compile_threaded"] = "true"
    else:
        os.environ["compile_threaded"] = "false"

    os.environ["UNIT_TEST_HOST"] = socket.gethostname()
    if "NETCDF_PATH" in os.environ and not "NETCDF" in os.environ:
        # The CMake Netcdf find utility that we use (from pio2) seems to key off
        # of the environment variable NETCDF, but not NETCDF_PATH
        logger.info("Setting NETCDF environment variable: {}".format(os.environ["NETCDF_PATH"]))
        os.environ["NETCDF"] = os.environ["NETCDF_PATH"]

    if not use_mpi:
        mpirun_command = ""
    elif mpirun_command is None:
        mpi_attribs = {
            "compiler": compiler,
            "mpilib": mpilib,
            "threaded": use_openmp,
            "unit_testing": True
        }

        # We can get away with specifying case=None since we're using exe_only=True
        mpirun_command, _ = machspecific.get_mpirun(None, mpi_attribs, None, exe_only=True)
        mpirun_command = machspecific.get_resolved_value(mpirun_command)
        logger.info("mpirun command is '{}'".format(mpirun_command))

    #=================================================
    # Run tests.
    #=================================================
    for spec in suite_specs:
        os.chdir(build_dir)
        if os.path.isdir(spec.name):
            if clean:
                rmtree(spec.name)

        if not os.path.isdir(spec.name):
            os.mkdir(spec.name)

        for label, directory in spec:
            os.chdir(os.path.join(build_dir, spec.name))
            if not os.path.isdir(label):
                os.mkdir(label)
            os.chdir(label)

            name = spec.name + "/" + label

            if not os.path.islink("Macros.cmake"):
                os.symlink(os.path.join(build_dir, "Macros.cmake"), "Macros.cmake")
            use_mpiserial = not use_mpi
            cmake_stage(name, directory, build_optimized, use_mpiserial, mpirun_command, output, pfunit_path,
                        verbose=verbose, enable_genf90=enable_genf90, cmake_args=cmake_args)
            make_stage(name, output, make_j, clean=clean, verbose=verbose)

    for spec in suite_specs:
        os.chdir(os.path.join(build_dir, spec.name))
        for label, directory in spec:
            name = spec.name + "/" + label

            output.print_header("Running CTest tests for " + name + ".")

            ctest_command = ["ctest", "--output-on-failure"]
            if verbose:
                ctest_command.append("-VV")
            if ctest_args is not None:
                ctest_command.extend(ctest_args.split(" "))

            run_cmd_no_fail(" ".join(ctest_command), from_dir=label, arg_stdout=None, arg_stderr=subprocess.STDOUT)
def _main():
    output, build_dir, build_optimized, clean, \
        cmake_args, compiler, enable_genf90, machine, machines_dir, \
        make_j, use_mpi, mpilib, mpirun_command, test_spec_dir, ctest_args, \
        use_openmp, xml_test_list, verbose \
        = parse_command_line(sys.argv)

    #=================================================
    # Find directory and file paths.
    #=================================================
    suite_specs = []
    # TODO: this violates cime policy of direct access to xml
    # should be moved to CIME/XML
    if xml_test_list is not None:
        test_xml_tree = ElementTree()
        test_xml_tree.parse(xml_test_list)
        known_paths = {
            "here": os.path.abspath(os.path.dirname(xml_test_list)),
        }
        suite_specs.extend(suites_from_xml(test_xml_tree, known_paths))
    if test_spec_dir is not None:
        suite_specs.append(
            TestSuiteSpec("__command_line_test__",
                          ["__command_line_test__"],
                          [os.path.abspath(test_spec_dir)])
        )

    if machines_dir is not None:
        machines_file = os.path.join(machines_dir, "config_machines.xml")
        machobj = Machines(infile=machines_file, machine=machine)
    else:
        machobj = Machines(machine=machine)

    # Create build directory if necessary.
    build_dir = os.path.abspath(build_dir)
    if not os.path.isdir(build_dir):
        os.mkdir(build_dir)

    # Switch to the build directory.
    os.chdir(build_dir)

    #=================================================
    # Functions to perform various stages of build.
    #=================================================
    if not use_mpi:
        mpilib = "mpi-serial"
    elif mpilib is None:
        mpilib = machobj.get_default_MPIlib()
        logger.info("Using mpilib: {}".format(mpilib))

    if compiler is None:
        compiler = machobj.get_default_compiler()
        logger.info("Compiler is {}".format(compiler))

    compilerobj = Compilers(machobj, compiler=compiler, mpilib=mpilib)

    pfunit_path = find_pfunit(compilerobj, mpilib=mpilib, use_openmp=use_openmp)

    debug = not build_optimized
    os_ = machobj.get_value("OS")

    # Create the environment, and the Macros.cmake file
    #
    #
    configure(machobj, build_dir, ["CMake"], compiler, mpilib, debug, os_,
              unit_testing=True)
    machspecific = EnvMachSpecific(build_dir, unit_testing=True)

    fake_case = FakeCase(compiler, mpilib, debug)
    machspecific.load_env(fake_case)
    os.environ["OS"] = os_
    os.environ["COMPILER"] = compiler
    os.environ["DEBUG"] = stringify_bool(debug)
    os.environ["MPILIB"] = mpilib
    if use_openmp:
        os.environ["compile_threaded"] = "true"
    else:
        os.environ["compile_threaded"] = "false"

    os.environ["UNIT_TEST_HOST"] = socket.gethostname()
    if "NETCDF_PATH" in os.environ and not "NETCDF" in os.environ:
        # The CMake Netcdf find utility that we use (from pio2) seems to key off
        # of the environment variable NETCDF, but not NETCDF_PATH
        logger.info("Setting NETCDF environment variable: {}".format(os.environ["NETCDF_PATH"]))
        os.environ["NETCDF"] = os.environ["NETCDF_PATH"]

    if not use_mpi:
        mpirun_command = ""
    elif mpirun_command is None:
        mpi_attribs = {
            "compiler": compiler,
            "mpilib": mpilib,
            "threaded": use_openmp,
            "unit_testing": True
        }

        # We can get away with specifying case=None since we're using exe_only=True
        mpirun_command, _ = machspecific.get_mpirun(case=None, attribs=mpi_attribs, exe_only=True)
        mpirun_command = machspecific.get_resolved_value(mpirun_command)
        logger.info("mpirun command is '{}'".format(mpirun_command))

    #=================================================
    # Run tests.
    #=================================================
    for spec in suite_specs:
        os.chdir(build_dir)
        if os.path.isdir(spec.name):
            if clean:
                rmtree(spec.name)

        if not os.path.isdir(spec.name):
            os.mkdir(spec.name)

        for label, directory in spec:
            os.chdir(os.path.join(build_dir, spec.name))
            if not os.path.isdir(label):
                os.mkdir(label)
            os.chdir(label)

            name = spec.name + "/" + label

            if not os.path.islink("Macros.cmake"):
                os.symlink(os.path.join(build_dir, "Macros.cmake"), "Macros.cmake")
            use_mpiserial = not use_mpi
            cmake_stage(name, directory, build_optimized, use_mpiserial, mpirun_command, output, pfunit_path,
                        verbose=verbose, enable_genf90=enable_genf90, cmake_args=cmake_args)
            make_stage(name, output, make_j, clean=clean, verbose=verbose)

    for spec in suite_specs:
        os.chdir(os.path.join(build_dir, spec.name))
        for label, directory in spec:
            name = spec.name + "/" + label

            output.print_header("Running CTest tests for " + name + ".")

            ctest_command = ["ctest", "--output-on-failure"]
            if verbose:
                ctest_command.append("-VV")
            if ctest_args is not None:
                ctest_command.extend(ctest_args.split(" "))

            run_cmd_no_fail(" ".join(ctest_command), from_dir=label, arg_stdout=None, arg_stderr=subprocess.STDOUT)
def parse_command_line(args, description):
###############################################################################
    help_str = """
Solve a Mixed Integer Linear Program to find a PE layout that minimizes
the wall-clock time per model day.
"""
    parser = argparse.ArgumentParser(usage=help_str,
                                     description=description,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    CIME.utils.setup_standard_logging_options(parser)

    parser.add_argument('--test-id', default=DEFAULT_TESTID,
                        help='test-id to use for all timing runs')

    parser.add_argument("-r", "--test-root",
                        help="Where test cases were created."
                        " Will default to output root as defined in the config_machines file")

    parser.add_argument('--timing-dir',
                        help='alternative to using casename to find timing data, '
                        'instead read all files in this directory')

    parser.add_argument('--blocksize', type=int,
                        help='default minimum size of blocks to assign to all '
                        'components. Components can be assigned different '
                        'blocksizes using --blocksize_XXX. Default 1')

    for c in COMPONENT_LIST:
        parser.add_argument('--blocksize-%s' % c.lower(), type=int,
                            help='minimum blocksize for component %s, if '
                            'different from --blocksize' % c)

    parser.add_argument('--total-tasks', type=int,
                        help='Number of pes available for assignment')

    #-------------------SK added this- July-2020-------------------------------------------------
    parser.add_argument('--optimizer', type=int,
                        help='choose between the two Mixed Integer Linear Solvers: '
                        '1-PuLP_COIN-CBC, 2-PuLP_GLPK')
    #------------------------------------------------------------------------------------

    parser.add_argument("--layout",
                        help="name of layout to solve (default selected internally)")

    parser.add_argument("--graph-models", action="store_true",
                        help="plot cost v. ntasks models. requires matplotlib")

    parser.add_argument("--print-models", action="store_true",
                        help="print all costs and ntasks")

    parser.add_argument("--pe-output", help="write pe layout to file")

    parser.add_argument('--json-output', help="write MILP data to .json file")

    parser.add_argument('--json-input', help="solve using data from .json file")

    args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)

    expect(args.total_tasks is not None or args.json_input is not None,
           "--total-tasks or --json-input option must be set")

    #------------------------SK added this-July 2020-----------------
    expect(args.optimizer is not None, "--optimizer must be set")
    #----------------------------------------------------

    blocksizes = {}
    for c in COMPONENT_LIST:
        attrib = 'blocksize_%s' % c.lower()
        if getattr(args, attrib) is not None:
            blocksizes[c] = getattr(args, attrib)
        elif args.blocksize is not None:
            blocksizes[c] = args.blocksize
    # only report components that actually received a blocksize
    for c in COMPONENT_LIST:
        if c in blocksizes:
            print("%s %s" % (c, blocksizes[c]))

    test_root = args.test_root
    if test_root is None:
        machobj = Machines()
        test_root = machobj.get_value("CIME_OUTPUT_ROOT")

    # SK added the args.optimizer to the below line
    return (args.test_id, test_root, args.timing_dir, blocksizes,
            args.total_tasks, args.optimizer, args.layout, args.graph_models,
            args.print_models, args.pe_output, args.json_output, args.json_input)
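# A hedged invocation sketch; the argv values are placeholders. CIME's
# standard option handling typically expects an argv-style list that still
# includes the program name in slot 0, so one is supplied here.
argv = ["load_balancing_solve", "--total-tasks", "128", "--optimizer", "1"]
(test_id, test_root, timing_dir, blocksizes, total_tasks, optimizer,
 layout, graph_models, print_models, pe_output,
 json_output, json_input) = parse_command_line(argv, "PE layout solver")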
if not non_local:
    case.load_env()

models = case.get_values("COMP_CLASSES")
mach = case.get_value("MACH")
compiler = case.get_value("COMPILER")
debug = case.get_value("DEBUG")
mpilib = case.get_value("MPILIB")
sysos = case.get_value("OS")
comp_interface = case.get_value("COMP_INTERFACE")
expect(mach is not None, "xml variable MACH is not set")

# Creates the Macros.make, Depends.compiler, Depends.machine,
# Depends.machine.compiler and env_mach_specific.xml if they don't
# already exist.
if not os.path.isfile("Macros.make") or not os.path.isfile("env_mach_specific.xml"):
    configure(Machines(machine=mach), caseroot, ["Makefile"],
              compiler, mpilib, debug, comp_interface, sysos)

# Set tasks to 1 if mpi-serial library
if mpilib == "mpi-serial":
    for vid, value in case:
        if vid.startswith("NTASKS") and value != 1:
            case.set_value(vid, 1)

# Check ninst. In CIME there can be multiple instances of each component
# model (an ensemble); NINST is the number of instances of that component.
if comp_interface == "nuopc":
    ninst = case.get_value("NINST")
    multi_driver = case.get_value("MULTI_DRIVER")
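# For illustration, the same "force one task under mpi-serial" adjustment can
# be made through the Case API outside of case_setup; a minimal sketch, with a
# placeholder caseroot and assuming the same iterable Case behavior used above:
from CIME.case import Case

with Case("/path/to/caseroot", read_only=False) as case:
    if case.get_value("MPILIB") == "mpi-serial":
        for vid, value in case:
            if vid.startswith("NTASKS") and value != 1:
                case.set_value(vid, 1)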
import CIME.wait_for_tests
import CIME.utils
from CIME.utils import expect
from CIME.XML.machines import Machines

import os, shutil, glob, signal, logging

_MACHINE = Machines()

###############################################################################
def cleanup_queue(set_of_jobs_we_created):
###############################################################################
    """
    Delete all jobs left in the queue
    """
    current_jobs = set(CIME.utils.get_my_queued_jobs())
    jobs_to_delete = set_of_jobs_we_created & current_jobs

    if (jobs_to_delete):
        logging.warning("Found leftover batch jobs that need to be deleted: %s"
                        % ", ".join(jobs_to_delete))
        success = CIME.utils.delete_jobs(jobs_to_delete)
        if not success:
            logging.warning("FAILED to clean up leftover jobs!")

###############################################################################
def jenkins_generic_job(generate_baselines, submit_to_cdash, no_batch,
                        baseline_name,
                        arg_cdash_build_name, cdash_project,
                        arg_test_suite,
                        cdash_build_group, baseline_compare,
    def configure(self, compset_name, grid_name, machine_name=None,
                  project=None, pecount=None, compiler=None, mpilib=None,
                  user_compset=False, pesfile=None,
                  user_grid=False, gridfile=None, ninst=1, test=False,
                  walltime=None, queue=None):

        #--------------------------------------------
        # compset, pesfile, and compset components
        #--------------------------------------------
        self._set_compset_and_pesfile(compset_name, user_compset=user_compset, pesfile=pesfile)

        self._components = self.get_compset_components()
        #FIXME - if --user-compset is True then need to determine that
        #all of the compset settings are valid

        #--------------------------------------------
        # grid
        #--------------------------------------------
        if user_grid is True and gridfile is not None:
            self.set_value("GRIDS_SPEC_FILE", gridfile)
        grids = Grids(gridfile)

        gridinfo = grids.get_grid_info(name=grid_name, compset=self._compsetname)

        self._gridname = gridinfo["GRID"]
        for key, value in gridinfo.items():
            logger.debug("Set grid %s %s" % (key, value))
            self.set_lookup_value(key, value)

        #--------------------------------------------
        # component config data
        #--------------------------------------------
        self._get_component_config_data()

        self.get_compset_var_settings()

        #--------------------------------------------
        # machine
        #--------------------------------------------
        # set machine values in env_xxx files
        machobj = Machines(machine=machine_name)
        machine_name = machobj.get_machine_name()
        self.set_value("MACH", machine_name)
        nodenames = machobj.get_node_names()
        nodenames = [x for x in nodenames if
                     '_system' not in x and '_variables' not in x and 'mpirun' not in x and
                     'COMPILER' not in x and 'MPILIB' not in x]

        for nodename in nodenames:
            value = machobj.get_value(nodename, resolved=False)
            type_str = self.get_type_info(nodename)
            if type_str is not None:
                logger.debug("machine nodename %s value %s" % (nodename, value))
                self.set_value(nodename, convert_to_type(value, type_str, nodename))

        if compiler is None:
            compiler = machobj.get_default_compiler()
        else:
            expect(machobj.is_valid_compiler(compiler),
                   "compiler %s is not supported on machine %s" % (compiler, machine_name))

        self.set_value("COMPILER", compiler)

        if mpilib is None:
            mpilib = machobj.get_default_MPIlib({"compiler": compiler})
        else:
            expect(machobj.is_valid_MPIlib(mpilib, {"compiler": compiler}),
                   "MPIlib %s is not supported on machine %s" % (mpilib, machine_name))
        self.set_value("MPILIB", mpilib)

        machdir = machobj.get_machines_dir()
        self.set_value("MACHDIR", machdir)

        # Create env_mach_specific settings from machine info.
        env_mach_specific_obj = self.get_env("mach_specific")
        env_mach_specific_obj.populate(machobj)
        self.schedule_rewrite(env_mach_specific_obj)

        #--------------------------------------------
        # pe layout
        #--------------------------------------------
        match1 = re.match('([0-9]+)x([0-9]+)', "" if pecount is None else pecount)
        match2 = re.match('([0-9]+)', "" if pecount is None else pecount)

        pes_ntasks = {}
        pes_nthrds = {}
        pes_rootpe = {}
        if match1:
            opti_tasks = match1.group(1)
            opti_thrds = match1.group(2)
        elif match2:
            opti_tasks = match2.group(1)
            opti_thrds = 1

        other = {}
        if match1 or match2:
            for component_class in self._component_classes:
                if component_class == "DRV":
                    component_class = "CPL"
                string = "NTASKS_" + component_class
                pes_ntasks[string] = opti_tasks
                string = "NTHRDS_" + component_class
                pes_nthrds[string] = opti_thrds
                string = "ROOTPE_" + component_class
                pes_rootpe[string] = 0
        else:
            pesobj = Pes(self._pesfile)

            pes_ntasks, pes_nthrds, pes_rootpe, other = \
                pesobj.find_pes_layout(self._gridname, self._compsetname,
                                       machine_name, pesize_opts=pecount)

        mach_pes_obj = self.get_env("mach_pes")
        totaltasks = {}
        # Since other items may include PES_PER_NODE we need to do this first
        # we can get rid of this code when all of the perl is removed
        for key, value in other.items():
            self.set_value(key, value)
        pes_per_node = self.get_value("PES_PER_NODE")
        for key, value in pes_ntasks.items():
            totaltasks[key[-3:]] = int(value)
            mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)
        for key, value in pes_rootpe.items():
            totaltasks[key[-3:]] += int(value)
            mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)
        for key, value in pes_nthrds.items():
            totaltasks[key[-3:]] *= int(value)
            mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)

        maxval = 1
        if mpilib != "mpi-serial":
            for key, val in totaltasks.items():
                if val < 0:
                    val = -1*val*pes_per_node
                if val > maxval:
                    maxval = val

        # Make sure that every component has been accounted for;
        # set nthrds and ntasks to 1 otherwise. Also set the ninst values here.
        for compclass in self._component_classes:
            if compclass == "DRV":
                continue
            key = "NINST_%s" % compclass
            mach_pes_obj.set_value(key, ninst)
            key = "NTASKS_%s" % compclass
            if key not in pes_ntasks.keys():
                mach_pes_obj.set_value(key, 1)
            key = "NTHRDS_%s" % compclass
            if key not in pes_nthrds.keys():
                mach_pes_obj.set_value(key, 1)

        # FIXME - this is a short term fix for dealing with the restriction that
        # CISM1 cannot run on multiple cores
        if "CISM1" in self._compsetname:
            mach_pes_obj.set_value("NTASKS_GLC", 1)
            mach_pes_obj.set_value("NTHRDS_GLC", 1)

        #--------------------------------------------
        # batch system
        #--------------------------------------------
        batch_system_type = machobj.get_value("BATCH_SYSTEM")
        batch = Batch(batch_system=batch_system_type, machine=machine_name)
        bjobs = batch.get_batch_jobs()
        env_batch = self.get_env("batch")
        env_batch.set_batch_system(batch, batch_system_type=batch_system_type)
        env_batch.create_job_groups(bjobs)
        env_batch.set_job_defaults(bjobs, pesize=maxval, walltime=walltime, force_queue=queue)
        self.schedule_rewrite(env_batch)

        self.set_value("COMPSET", self._compsetname)

        self._set_pio_xml()
        logger.info(" Compset is: %s " % self._compsetname)
        logger.info(" Grid is: %s " % self._gridname)
        logger.info(" Components in compset are: %s " % self._components)

        # Set project id
        if project is None:
            project = get_project(machobj)
        if project is not None:
            self.set_value("PROJECT", project)
        elif machobj.get_value("PROJECT_REQUIRED"):
            expect(project is not None, "PROJECT_REQUIRED is true but no project found")

        # Overwriting an existing exeroot or rundir can cause problems
        exeroot = self.get_value("EXEROOT")
        rundir = self.get_value("RUNDIR")
        for wdir in (exeroot, rundir):
            logging.debug("wdir is %s" % wdir)
            if os.path.exists(wdir):
                expect(not test, "Directory %s already exists, aborting test" % wdir)
                response = raw_input("\nDirectory %s already exists, (r)eplace, (a)bort, or (u)se existing?" % wdir)
                if response.startswith("r"):
                    shutil.rmtree(wdir)
                else:
                    expect(response.startswith("u"), "Aborting by user request")

        # miscellaneous settings
        if self.get_value("RUN_TYPE") == 'hybrid':
            self.set_value("GET_REFCASE", True)

        # Turn on short term archiving as cesm default setting
        model = get_model()
        if model == "cesm" and not test:
            self.set_value("DOUT_S", True)
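# The pecount parsing above accepts either a "TASKSxTHREADS" string or a bare
# task count; a quick illustration of the two regexes:
import re

for pecount in ("16x2", "128"):
    match1 = re.match('([0-9]+)x([0-9]+)', pecount)
    match2 = re.match('([0-9]+)', pecount)
    if match1:
        print(pecount, "-> tasks", match1.group(1), "threads", match1.group(2))
    elif match2:
        print(pecount, "-> tasks", match2.group(1), "threads 1")
# 16x2 -> tasks 16 threads 2
# 128  -> tasks 128 threads 1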
def configure_tests(timeout, no_fortran_run, fast, no_batch, no_cmake,
                    no_teardown, machine, compiler, mpilib, test_root, **kwargs):
    config = CIME.utils.get_cime_config()

    if timeout:
        BaseTestCase.GLOBAL_TIMEOUT = str(timeout)

    BaseTestCase.NO_FORTRAN_RUN = no_fortran_run or False
    BaseTestCase.FAST_ONLY = fast or no_fortran_run
    BaseTestCase.NO_BATCH = no_batch or False
    BaseTestCase.NO_CMAKE = no_cmake or False
    BaseTestCase.NO_TEARDOWN = no_teardown or False

    # make sure we have default values
    MACHINE = None
    TEST_COMPILER = None
    TEST_MPILIB = None

    if machine is not None:
        MACHINE = Machines(machine=machine)
        os.environ["CIME_MACHINE"] = machine
    elif "CIME_MACHINE" in os.environ:
        MACHINE = Machines(machine=os.environ["CIME_MACHINE"])
    elif config.has_option("create_test", "MACHINE"):
        MACHINE = Machines(machine=config.get("create_test", "MACHINE"))
    elif config.has_option("main", "MACHINE"):
        MACHINE = Machines(machine=config.get("main", "MACHINE"))
    else:
        MACHINE = Machines()

    BaseTestCase.MACHINE = MACHINE

    if compiler is not None:
        TEST_COMPILER = compiler
    elif config.has_option("create_test", "COMPILER"):
        TEST_COMPILER = config.get("create_test", "COMPILER")
    elif config.has_option("main", "COMPILER"):
        TEST_COMPILER = config.get("main", "COMPILER")

    BaseTestCase.TEST_COMPILER = TEST_COMPILER

    if mpilib is not None:
        TEST_MPILIB = mpilib
    elif config.has_option("create_test", "MPILIB"):
        TEST_MPILIB = config.get("create_test", "MPILIB")
    elif config.has_option("main", "MPILIB"):
        TEST_MPILIB = config.get("main", "MPILIB")

    BaseTestCase.TEST_MPILIB = TEST_MPILIB

    if test_root is not None:
        TEST_ROOT = test_root
    elif config.has_option("create_test", "TEST_ROOT"):
        TEST_ROOT = config.get("create_test", "TEST_ROOT")
    else:
        TEST_ROOT = os.path.join(
            MACHINE.get_value("CIME_OUTPUT_ROOT"),
            "scripts_regression_test.%s" % CIME.utils.get_timestamp(),
        )

    BaseTestCase.TEST_ROOT = TEST_ROOT

    write_provenance_info(MACHINE, TEST_COMPILER, TEST_MPILIB, TEST_ROOT)

    atexit.register(functools.partial(cleanup, TEST_ROOT))
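# Each setting above resolves through the same precedence chain: explicit
# argument, then the [create_test] config section, then [main], then a
# default. A hedged generic sketch of that pattern (resolve_setting is a
# hypothetical helper, not part of this module):
def resolve_setting(explicit, config, option, default=None):
    if explicit is not None:
        return explicit
    for section in ("create_test", "main"):
        if config.has_option(section, option):
            return config.get(section, option)
    return default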
def single_submit_impl(
    machine_name, test_id, proc_pool, _, args, job_cost_map, wall_time, test_root
):
###############################################################################
    mach = Machines(machine=machine_name)
    expect(
        mach.has_batch_system(),
        "Single submit does not make sense on non-batch machine '%s'"
        % mach.get_machine_name(),
    )

    machine_name = mach.get_machine_name()

    #
    # Compute arg list for second call to create_test
    #
    new_args = list(args)
    new_args.remove("--single-submit")
    new_args.append("--no-batch")
    new_args.append("--use-existing")
    no_arg_is_a_test_id_arg = True
    no_arg_is_a_proc_pool_arg = True
    no_arg_is_a_machine_arg = True
    for arg in new_args:
        if arg == "-t" or arg.startswith("--test-id"):
            no_arg_is_a_test_id_arg = False
        elif arg.startswith("--proc-pool"):
            no_arg_is_a_proc_pool_arg = False
        elif arg == "-m" or arg.startswith("--machine"):
            no_arg_is_a_machine_arg = False

    if no_arg_is_a_test_id_arg:
        new_args.append("-t %s" % test_id)
    if no_arg_is_a_proc_pool_arg:
        new_args.append("--proc-pool %d" % proc_pool)
    if no_arg_is_a_machine_arg:
        new_args.append("-m %s" % machine_name)

    #
    # Resolve batch directives manually. There is currently no other way
    # to do this without making a Case object. Make a throwaway case object
    # to help us here.
    #
    testcase_dirs = glob.glob("%s/*%s*/TestStatus" % (test_root, test_id))
    expect(testcase_dirs, "No test case dirs found!?")
    first_case = os.path.abspath(os.path.dirname(testcase_dirs[0]))
    with Case(first_case, read_only=False) as case:
        env_batch = case.get_env("batch")
        submit_cmd = env_batch.get_value("batch_submit", subgroup=None)
        submit_args = env_batch.get_submit_args(case, "case.test")

        tasks_per_node = mach.get_value("MAX_MPITASKS_PER_NODE")
        num_nodes = int(math.ceil(float(proc_pool) / tasks_per_node))
        if wall_time is None:
            wall_time = compute_total_time(job_cost_map, proc_pool)
            wall_time_bab = convert_to_babylonian_time(int(wall_time))
        else:
            wall_time_bab = wall_time

        queue = env_batch.select_best_queue(num_nodes, proc_pool, walltime=wall_time_bab)
        wall_time_max_bab = env_batch.get_queue_specs(queue)[3]
        if wall_time_max_bab is not None:
            wall_time_max = convert_to_seconds(wall_time_max_bab)
            if wall_time_max < wall_time:
                wall_time = wall_time_max
                wall_time_bab = convert_to_babylonian_time(wall_time)

        overrides = {
            "job_id": "create_test_single_submit_%s" % test_id,
            "num_nodes": num_nodes,
            "tasks_per_node": tasks_per_node,
            "totaltasks": tasks_per_node * num_nodes,
            "job_wallclock_time": wall_time_bab,
            "job_queue": env_batch.text(queue),
        }
        directives = env_batch.get_batch_directives(case, "case.test", overrides=overrides)

    #
    # Make simple submit script and submit
    #
    script = "#! /bin/bash\n"
    script += "\n%s" % directives
    script += "\n"
    script += "cd %s\n" % os.getcwd()
    script += "%s %s\n" % (__file__, " ".join(new_args))

    submit_cmd = "%s %s" % (submit_cmd, submit_args)
    logger.info("Script:\n%s" % script)

    run_cmd_no_fail(
        submit_cmd, input_str=script, arg_stdout=None, arg_stderr=None, verbose=True
    )
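# The node-count math above rounds up so a partially filled node still counts;
# with hypothetical numbers:
import math

proc_pool = 100
tasks_per_node = 36
print(int(math.ceil(float(proc_pool) / tasks_per_node)))  # 3 nodes (36 + 36 + 28)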
def parse_command_line(args, description):
###############################################################################
    parser = argparse.ArgumentParser(
        description=description, formatter_class=RawTextHelpFormatter
    )

    model = CIME.utils.get_model()

    CIME.utils.setup_standard_logging_options(parser)

    config = get_cime_config()

    parser.add_argument("--no-run", action="store_true",
                        help="Do not run generated tests")

    parser.add_argument("--no-build", action="store_true",
                        help="Do not build generated tests, implies --no-run")

    parser.add_argument("--no-setup", action="store_true",
                        help="Do not setup generated tests, implies --no-build and --no-run")

    parser.add_argument("-u", "--use-existing", action="store_true",
                        help="Use pre-existing case directories; they will pick up at the "
                        "\nlatest PEND state or re-run the first failed state. Requires test-id")

    default = get_default_setting(config, "SAVE_TIMING", False, check_main=False)

    parser.add_argument("--save-timing", action="store_true", default=default,
                        help="Enable archiving of performance data.")

    parser.add_argument("--no-batch", action="store_true",
                        help="Do not submit jobs to batch system, run locally."
                        "\nIf false, this will default to machine setting.")

    parser.add_argument("--single-exe", action="store_true", default=False,
                        help="Use a single build for all cases. This can "
                        "\ndrastically improve test throughput but is currently use-at-your-own-risk."
                        "\nIt's up to the user to ensure that all cases are build-compatible."
                        "\nE3SM tests belonging to a suite with share enabled will always share exes.")

    default = get_default_setting(config, "SINGLE_SUBMIT", False, check_main=False)

    parser.add_argument("--single-submit", action="store_true", default=default,
                        help="Use a single interactive allocation to run all the tests. This can "
                        "\ndrastically reduce queue waiting but only makes sense on batch machines.")

    default = get_default_setting(config, "TEST_ROOT", None, check_main=False)

    parser.add_argument("-r", "--test-root", default=default,
                        help="Where test cases will be created. The default is output root"
                        "\nas defined in the config_machines file")

    default = get_default_setting(config, "OUTPUT_ROOT", None, check_main=False)

    parser.add_argument("--output-root", default=default,
                        help="Where the case output is written.")

    default = get_default_setting(config, "BASELINE_ROOT", None, check_main=False)

    parser.add_argument("--baseline-root", default=default,
                        help="Specifies a root directory for baseline datasets that will "
                        "\nbe used for Bit-for-bit generate and/or compare testing.")

    default = get_default_setting(config, "CLEAN", False, check_main=False)

    parser.add_argument("--clean", action="store_true", default=default,
                        help="Specifies if tests should be cleaned after run. If set, all object"
                        "\nexecutables and data files will be removed after the tests are run.")

    default = get_default_setting(config, "MACHINE", None, check_main=True)

    parser.add_argument("-m", "--machine", default=default,
                        help="The machine for creating and building tests. This machine must be defined"
                        "\nin the config_machines.xml file for the given model. The default is to "
                        "\nmatch the name of the machine in the test name or the name of the "
                        "\nmachine this script is run on to the NODENAME_REGEX field in "
                        "\nconfig_machines.xml. WARNING: This option is highly unsafe and should "
                        "\nonly be used if you are an expert.")

    default = get_default_setting(config, "MPILIB", None, check_main=True)

    parser.add_argument("--mpilib", default=default,
                        help="Specify the mpilib. To see list of supported MPI libraries for each machine, "
                        "\ninvoke ./query_config. The default is the first listing.")

    if model in ["cesm", "ufs"]:
        parser.add_argument("-c", "--compare",
                            help="While testing, compare baselines against the given compare directory. ")

        parser.add_argument("-g", "--generate",
                            help="While testing, generate baselines in the given generate directory. "
                            "\nNOTE: this can also be done after the fact with bless_test_results")

        parser.add_argument("--xml-machine",
                            help="Use this machine key in the lookup in testlist.xml. "
                            "\nThe default is all if any --xml- argument is used.")

        parser.add_argument("--xml-compiler",
                            help="Use this compiler key in the lookup in testlist.xml. "
                            "\nThe default is all if any --xml- argument is used.")

        parser.add_argument("--xml-category",
                            help="Use this category key in the lookup in testlist.xml. "
                            "\nThe default is all if any --xml- argument is used.")

        parser.add_argument("--xml-testlist",
                            help="Use this testlist to lookup tests. The default is specified in config_files.xml")

        parser.add_argument("--xml-driver", choices=("mct", "nuopc", "moab"),
                            help="Override driver specified in tests and use this one.")

        parser.add_argument("testargs", nargs="*",
                            help="Tests to run. Testname form is TEST.GRID.COMPSET[.MACHINE_COMPILER]")

    else:
        parser.add_argument("testargs", nargs="+",
                            help="Tests or test suites to run."
                            " Testname form is TEST.GRID.COMPSET[.MACHINE_COMPILER]")

        parser.add_argument("-b", "--baseline-name",
                            help="If comparing or generating baselines, use this directory under baseline root. "
                            "\nDefault will be current branch name.")

        parser.add_argument("-c", "--compare", action="store_true",
                            help="While testing, compare baselines")

        parser.add_argument("-g", "--generate", action="store_true",
                            help="While testing, generate baselines. "
                            "\nNOTE: this can also be done after the fact with bless_test_results")

    default = get_default_setting(config, "COMPILER", None, check_main=True)

    parser.add_argument("--compiler", default=default,
                        help="Compiler for building cime. Default will be the name in the "
                        "\nTestname or the default defined for the machine.")

    parser.add_argument("-n", "--namelists-only", action="store_true",
                        help="Only perform namelist actions for tests")

    parser.add_argument("-p", "--project",
                        help="Specify a project id for the case (optional)."
                        "\nUsed for accounting and directory permissions when on a batch system."
                        "\nThe default is user or machine specified by PROJECT."
                        "\nAccounting (only) may be overridden by user or machine specified CHARGE_ACCOUNT.")

    parser.add_argument("-t", "--test-id",
                        help="Specify an 'id' for the test. This is simply a string that is appended "
                        "\nto the end of a test name. If no test-id is specified, a time stamp plus a "
                        "\nrandom string will be used (ensuring a high probability of uniqueness). "
                        "\nIf a test-id is specified, it is the user's responsibility to ensure that "
                        "\neach run of create_test uses a unique test-id. WARNING: problems will occur "
                        "\nif you use the same test-id twice on the same file system, even if the test "
                        "\nlists are completely different.")

    default = get_default_setting(config, "PARALLEL_JOBS", None, check_main=False)

    parser.add_argument("-j", "--parallel-jobs", type=int, default=default,
                        help="Number of tasks create_test should perform simultaneously. The default "
                        "\n is min(num_cores, num_tests).")

    default = get_default_setting(config, "PROC_POOL", None, check_main=False)

    parser.add_argument("--proc-pool", type=int, default=default,
                        help="The size of the processor pool that create_test can use. The default is "
                        "\nMAX_MPITASKS_PER_NODE + 25 percent.")

    default = os.getenv("CIME_GLOBAL_WALLTIME")
    if default is None:
        default = get_default_setting(config, "WALLTIME", None, check_main=True)

    parser.add_argument("--walltime", default=default,
                        help="Set the wallclock limit for all tests in the suite. "
                        "\nUse the variable CIME_GLOBAL_WALLTIME to set this for all tests.")

    default = get_default_setting(config, "JOB_QUEUE", None, check_main=True)

    parser.add_argument("-q", "--queue", default=default,
                        help="Force batch system to use a certain queue")

    parser.add_argument("-f", "--testfile",
                        help="A file containing an ascii list of tests to run")

    default = get_default_setting(config, "ALLOW_BASELINE_OVERWRITE", False, check_main=False)

    parser.add_argument("-o", "--allow-baseline-overwrite", action="store_true", default=default,
                        help="If the --generate option is given, then an attempt to overwrite "
                        "\nan existing baseline directory will raise an error. WARNING: Specifying this "
                        "\noption will allow existing baseline directories to be silently overwritten.")

    default = get_default_setting(config, "WAIT", False, check_main=False)

    parser.add_argument("--wait", action="store_true", default=default,
                        help="On batch systems, wait for submitted jobs to complete")

    default = get_default_setting(config, "ALLOW_PNL", False, check_main=False)

    parser.add_argument("--allow-pnl", action="store_true", default=default,
                        help="Do not pass skip-pnl to case.submit")

    parser.add_argument("--check-throughput", action="store_true",
                        help="Fail if throughput check fails. Requires --wait on batch systems")

    parser.add_argument("--check-memory", action="store_true",
                        help="Fail if memory check fails. Requires --wait on batch systems")

    parser.add_argument("--ignore-namelists", action="store_true",
                        help="Do not fail if there are namelist diffs")

    parser.add_argument("--ignore-memleak", action="store_true",
                        help="Do not fail if there's a memleak")

    default = get_default_setting(config, "FORCE_PROCS", None, check_main=False)

    parser.add_argument("--force-procs", type=int, default=default,
                        help="Force all tests to run with this number of processors")

    default = get_default_setting(config, "FORCE_THREADS", None, check_main=False)

    parser.add_argument("--force-threads", type=int, default=default,
                        help="Force all tests to run with this number of threads")

    default = get_default_setting(config, "INPUT_DIR", None, check_main=True)

    parser.add_argument("-i", "--input-dir", default=default,
                        help="Use a non-default location for input files")

    default = get_default_setting(config, "PESFILE", None, check_main=True)

    parser.add_argument("--pesfile", default=default,
                        help="Full pathname of an optional pes specification file. The file"
                        "\ncan follow either the config_pes.xml or the env_mach_pes.xml format.")

    default = get_default_setting(config, "RETRY", 0, check_main=False)

    parser.add_argument("--retry", type=int, default=default,
                        help="Automatically retry failed tests. >0 implies --wait")

    parser.add_argument("-N", "--non-local", action="store_true",
                        help="Use when you've requested a machine that you aren't on. "
                        "Will reduce errors for missing directories etc.")

    if config and config.has_option("main", "workflow"):
        workflow_default = config.get("main", "workflow")
    else:
        workflow_default = "default"

    parser.add_argument("--workflow", default=workflow_default,
                        help="A workflow from config_workflow.xml to apply to this case. ")

    parser.add_argument("--chksum", action="store_true",
                        help="Verifies input data checksums.")

    srcroot_default = utils.get_src_root()

    parser.add_argument("--srcroot", default=srcroot_default,
                        help="Alternative pathname for source root directory. "
                        f"The default is {srcroot_default}")

    CIME.utils.add_mail_type_args(parser)

    args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)

    CIME.utils.resolve_mail_type_args(args)

    # generate and compare flags may not point to the same directory
    if model in ["cesm", "ufs"]:
        if args.generate is not None:
            expect(
                not (args.generate == args.compare),
                "Cannot generate and compare baselines at the same time",
            )

        if args.xml_testlist is not None:
            expect(
                not (
                    args.xml_machine is None
                    and args.xml_compiler is None
                    and args.xml_category is None
                ),
                "If an xml-testlist is present at least one of --xml-machine, "
                "--xml-compiler, --xml-category must also be present",
            )

    else:
        expect(
            not (
                args.baseline_name is not None
                and (not args.compare and not args.generate)
            ),
            "Provided baseline name but did not specify compare or generate",
        )
        expect(
            not (args.compare and args.generate),
            "Tried to compare and generate at same time",
        )

    expect(
        not (args.namelists_only and not (args.generate or args.compare)),
        "Must provide either --compare or --generate with --namelists-only",
    )

    if args.retry > 0:
        args.wait = True

    if args.parallel_jobs is not None:
        expect(
            args.parallel_jobs > 0,
            "Invalid value for parallel_jobs: %d" % args.parallel_jobs,
        )

    if args.use_existing:
        expect(args.test_id is not None, "Must provide test-id of pre-existing cases")

    if args.no_setup:
        args.no_build = True

    if args.no_build:
        args.no_run = True

    # Namelist-only forces some other options:
    if args.namelists_only:
        expect(not args.no_setup, "Cannot compare namelists without setup")
        args.no_build = True
        args.no_run = True
        args.no_batch = True

    expect(
        not (args.non_local and not args.no_build), "Cannot build on non-local machine"
    )

    if args.single_submit:
        expect(
            not args.no_run,
            "Doesn't make sense to request single-submit if no-run is on",
        )
        args.no_build = True
        args.no_run = True
        args.no_batch = True

    if args.test_id is None:
        args.test_id = "%s_%s" % (CIME.utils.get_timestamp(), CIME.utils.id_generator())
    else:
        expect(
            CIME.utils.check_name(args.test_id, additional_chars="."),
            "invalid test-id argument provided",
        )

    if args.testfile is not None:
        with open(args.testfile, "r") as fd:
            args.testargs.extend(
                [
                    line.strip()
                    for line in fd.read().splitlines()
                    if line.strip() and not line.startswith("#")
                ]
            )

    # Propagate `srcroot` to `GenericXML` to resolve $SRCROOT
    # See call to `Machines` below
    utils.GLOBAL["SRCROOT"] = args.srcroot

    # Compute list of fully-resolved test_names
    test_extra_data = {}
    if model in ["cesm", "ufs"]:
        machine_name = args.xml_machine if args.machine is None else args.machine

        # If it's still unclear what machine to use, look at test names
        if machine_name is None:
            for test in args.testargs:
                testsplit = CIME.utils.parse_test_name(test)
                if testsplit[4] is not None:
                    if machine_name is None:
                        machine_name = testsplit[4]
                    else:
                        expect(
                            machine_name == testsplit[4],
                            "ambiguity in machine, please use the --machine option",
                        )

        mach_obj = Machines(machine=machine_name)
        if args.testargs:
            args.compiler = (
                mach_obj.get_default_compiler()
                if args.compiler is None
                else args.compiler
            )
            test_names = get_tests.get_full_test_names(
                args.testargs, mach_obj.get_machine_name(), args.compiler
            )
        else:
            expect(
                not (
                    args.xml_machine is None
                    and args.xml_compiler is None
                    and args.xml_category is None
                    and args.xml_testlist is None
                ),
                "At least one of --xml-machine, --xml-testlist, "
                "--xml-compiler, --xml-category or a valid test name must be provided.",
            )

            test_data = get_tests_from_xml(
                xml_machine=args.xml_machine,
                xml_category=args.xml_category,
                xml_compiler=args.xml_compiler,
                xml_testlist=args.xml_testlist,
                machine=machine_name,
                compiler=args.compiler,
                driver=args.xml_driver,
            )
            test_names = [item["name"] for item in test_data]
            for test_datum in test_data:
                test_extra_data[test_datum["name"]] = test_datum

        logger.info("Testnames: %s" % test_names)
    else:
        if args.machine is None:
            args.machine = get_tests.infer_machine_name_from_tests(args.testargs)

        mach_obj = Machines(machine=args.machine)
        args.compiler = (
            mach_obj.get_default_compiler() if args.compiler is None else args.compiler
        )

        test_names = get_tests.get_full_test_names(
            args.testargs, mach_obj.get_machine_name(), args.compiler
        )

    expect(
        mach_obj.is_valid_compiler(args.compiler),
        "Compiler %s not valid for machine %s"
        % (args.compiler, mach_obj.get_machine_name()),
    )

    if not args.wait and mach_obj.has_batch_system() and not args.no_batch:
        expect(
            not args.check_throughput,
            "Makes no sense to use --check-throughput without --wait",
        )
        expect(
            not args.check_memory, "Makes no sense to use --check-memory without --wait"
        )

    # Normalize compare/generate between the models
    baseline_cmp_name = None
    baseline_gen_name = None
    if args.compare or args.generate:
        if model in ["cesm", "ufs"]:
            if args.compare is not None:
                baseline_cmp_name = args.compare
            if args.generate is not None:
                baseline_gen_name = args.generate
        else:
            baseline_name = (
                args.baseline_name
                if args.baseline_name
                else CIME.utils.get_current_branch(repo=CIME.utils.get_cime_root())
            )
            expect(
                baseline_name is not None,
                "Could not determine baseline name from branch, please use -b option",
            )
            if args.compare:
                baseline_cmp_name = baseline_name
            elif args.generate:
                baseline_gen_name = baseline_name

    if args.input_dir is not None:
        args.input_dir = os.path.abspath(args.input_dir)

    # sanity check
    for name in test_names:
        dot_count = name.count(".")
        expect(dot_count > 1 and dot_count <= 4, "Invalid test name, '{}'".format(name))

    # for e3sm, sort by walltime
    if model == "e3sm":
        if args.walltime is None:
            # Longest tests should run first
            test_names.sort(key=get_tests.key_test_time, reverse=True)
        else:
            test_names.sort()

    return (
        test_names, test_extra_data, args.compiler, mach_obj.get_machine_name(),
        args.no_run, args.no_build, args.no_setup, args.no_batch,
        args.test_root, args.baseline_root, args.clean,
        baseline_cmp_name, baseline_gen_name,
        args.namelists_only, args.project, args.test_id, args.parallel_jobs,
        args.walltime, args.single_submit, args.proc_pool, args.use_existing,
        args.save_timing, args.queue, args.allow_baseline_overwrite,
        args.output_root, args.wait, args.force_procs, args.force_threads,
        args.mpilib, args.input_dir, args.pesfile, args.retry,
        args.mail_user, args.mail_type,
        args.check_throughput, args.check_memory,
        args.ignore_namelists, args.ignore_memleak,
        args.allow_pnl, args.non_local, args.single_exe,
        args.workflow, args.chksum,
    )
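# The dot-count sanity check above accepts the TEST[_OPTS].GRID.COMPSET forms
# with optional MACHINE_COMPILER and TESTMODS fields; the test names below are
# hypothetical examples:
for name in ("ERS.f19_g16.A",                                       # minimal form
             "SMS_D_Ln9.f19_g16.A.cheyenne_intel",                  # with machine_compiler
             "ERP.ne30_g16.B1850.melvin_gnu.allactive-defaultio"):  # with testmods
    dot_count = name.count(".")
    assert dot_count > 1 and dot_count <= 4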
def _compare_baseline(self):
    with self._test_status:
        if int(self._case.get_value("RESUBMIT")) > 0:
            # This is here because the comparison is run for each submission
            # and we only want to compare once the whole run is finished. We
            # need to return a pass here to continue the submission process.
            self._test_status.set_status(CIME.test_status.BASELINE_PHASE,
                                         CIME.test_status.TEST_PASS_STATUS)
            return

        self._test_status.set_status(CIME.test_status.BASELINE_PHASE,
                                     CIME.test_status.TEST_FAIL_STATUS)

        run_dir = self._case.get_value("RUNDIR")
        case_name = self._case.get_value("CASE")
        base_dir = os.path.join(self._case.get_value("BASELINE_ROOT"),
                                self._case.get_value("BASECMP_CASE"))

        test_name = "{}".format(case_name.split('.')[-1])
        evv_config = {
            test_name: {
                "module": os.path.join(evv_lib_dir, "extensions", "ks.py"),
                "test-case": "Test",
                "test-dir": run_dir,
                "ref-case": "Baseline",
                "ref-dir": base_dir,
                "var-set": "default",
                "ninst": NINST,
                "critical": 13
            }
        }

        json_file = os.path.join(run_dir, '.'.join([case_name, 'json']))
        with open(json_file, 'w') as config_file:
            json.dump(evv_config, config_file, indent=4)

        evv_out_dir = os.path.join(run_dir, '.'.join([case_name, 'evv']))
        evv(['-e', json_file, '-o', evv_out_dir])

        with open(os.path.join(evv_out_dir, 'index.json')) as evv_f:
            evv_status = json.load(evv_f)

        comments = ""
        for evv_elem in evv_status['Data']['Elements']:
            if evv_elem['Type'] == 'ValSummary' \
                    and evv_elem['TableTitle'] == 'Kolmogorov-Smirnov test':
                comments = "; ".join("{}: {}".format(key, val)
                                     for key, val in evv_elem['Data'][test_name][''].items())
                if evv_elem['Data'][test_name]['']['Test status'].lower() == 'pass':
                    self._test_status.set_status(CIME.test_status.BASELINE_PHASE,
                                                 CIME.test_status.TEST_PASS_STATUS)
                break

        status = self._test_status.get_status(CIME.test_status.BASELINE_PHASE)
        mach_name = self._case.get_value("MACH")
        mach_obj = Machines(machine=mach_name)
        htmlroot = CIME.utils.get_htmlroot(mach_obj)
        urlroot = CIME.utils.get_urlroot(mach_obj)
        if htmlroot is not None:
            with CIME.utils.SharedArea():
                dir_util.copy_tree(evv_out_dir, os.path.join(htmlroot, 'evv', case_name),
                                   preserve_mode=False)
            if urlroot is None:
                urlroot = "[{}_URL]".format(mach_name.capitalize())
            viewing = "{}/evv/{}/index.html".format(urlroot, case_name)
        else:
            viewing = "{}\n" \
                      "    EVV viewing instructions can be found at: " \
                      "        https://github.com/E3SM-Project/E3SM/blob/master/cime/scripts/" \
                      "climate_reproducibility/README.md#test-passfail-and-extended-output" \
                      "".format(evv_out_dir)

        comments = "{} {} for test '{}'.\n" \
                   "    {}\n" \
                   "    EVV results can be viewed at:\n" \
                   "        {}".format(CIME.test_status.BASELINE_PHASE, status, test_name, comments, viewing)

        CIME.utils.append_testlog(comments, self._orig_caseroot)
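# dir_util.copy_tree comes from distutils, which was removed in Python 3.12.
# On newer interpreters the same merge-into-existing copy can be sketched with
# the standard library (placeholder paths stand in for the values used above):
import os
import shutil

htmlroot, case_name, evv_out_dir = "/var/www", "mycase", "/scratch/mycase.evv"
# dirs_exist_ok (Python 3.8+) merges into an existing target like copy_tree;
# copy_function=shutil.copy drops file metadata, roughly preserve_mode=False.
shutil.copytree(evv_out_dir, os.path.join(htmlroot, "evv", case_name),
                dirs_exist_ok=True, copy_function=shutil.copy)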
class TestScheduler(object):
###############################################################################

    ###########################################################################
    def __init__(self, test_names, test_data=None,
                 no_run=False, no_build=False, no_setup=False, no_batch=None,
                 test_root=None, test_id=None,
                 machine_name=None, compiler=None,
                 baseline_root=None, baseline_cmp_name=None, baseline_gen_name=None,
                 clean=False, namelists_only=False,
                 project=None, parallel_jobs=None,
                 walltime=None, proc_pool=None,
                 use_existing=False, save_timing=False, queue=None,
                 allow_baseline_overwrite=False):
    ###########################################################################
        self._cime_root = CIME.utils.get_cime_root()
        self._cime_model = CIME.utils.get_model()
        self._allow_baseline_overwrite = allow_baseline_overwrite
        self._save_timing = save_timing
        self._queue = queue
        self._test_data = {} if test_data is None else test_data # Format: {test_name -> {data_name -> data}}

        self._machobj = Machines(machine=machine_name)

        self._no_setup = no_setup
        self._no_build = no_build or no_setup or namelists_only
        self._no_run = no_run or self._no_build

        # Figure out what project to use
        if project is None:
            self._project = CIME.utils.get_project()
            if self._project is None:
                self._project = self._machobj.get_value("PROJECT")
        else:
            self._project = project

        # We will not use batch system if user asked for no_batch or if current
        # machine is not a batch machine
        self._no_batch = no_batch or not self._machobj.has_batch_system()
        expect(not (self._no_batch and self._queue is not None),
               "Does not make sense to request a queue without batch system")

        # Determine and resolve test_root
        self._test_root = self._machobj.get_value("CESMSCRATCHROOT") if test_root is None else test_root
        if self._project is not None:
            self._test_root = self._test_root.replace("$PROJECT", self._project)
        self._test_root = os.path.abspath(self._test_root)

        self._test_id = test_id if test_id is not None else CIME.utils.get_timestamp()

        self._compiler = self._machobj.get_default_compiler() if compiler is None else compiler

        self._clean = clean
        self._namelists_only = namelists_only

        self._walltime = walltime

        if parallel_jobs is None:
            self._parallel_jobs = min(len(test_names),
                                      int(self._machobj.get_value("MAX_TASKS_PER_NODE")))
        else:
            self._parallel_jobs = parallel_jobs

        self._baseline_cmp_name = baseline_cmp_name # Implies comparison should be done if not None
        self._baseline_gen_name = baseline_gen_name # Implies generation should be done if not None

        if baseline_cmp_name or baseline_gen_name:
            # Compute baseline_root
            self._baseline_root = baseline_root if baseline_root is not None \
                else self._machobj.get_value("CCSM_BASELINE")

            if self._project is not None:
                self._baseline_root = self._baseline_root.replace("$PROJECT", self._project)

            self._baseline_root = os.path.abspath(self._baseline_root)

            if self._baseline_cmp_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
                expect(os.path.isdir(full_baseline_dir),
                       "Missing baseline comparison directory %s" % full_baseline_dir)

            # the following is to assure that the existing generate directory is not overwritten
            if self._baseline_gen_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name)
                existing_baselines = []
                for test_name in test_names:
                    test_baseline = os.path.join(full_baseline_dir, test_name)
                    if os.path.isdir(test_baseline):
                        existing_baselines.append(test_baseline)
                expect(allow_baseline_overwrite or len(existing_baselines) == 0,
                       "Baseline directories already exist %s\n"
                       "Use --allow_baseline_overwrite to avoid this error" % existing_baselines)
        else:
            self._baseline_root = None

        # This is the only data that multiple threads will simultaneously access.
        # Each test has its own value and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without mutex.
        # Since the namelist phase can fail without aborting later phases, we
        # need some extra state to remember tests that had namelist problems.
        # name -> (phase, status, has_namelist_problem)
        self._tests = {}
        for test_name in test_names:
            self._tests[test_name] = (TEST_START, TEST_PASS_STATUS, False)

        # Oversubscribe by 1/4
        if proc_pool is None:
            pes = int(self._machobj.get_value("PES_PER_NODE"))
            self._proc_pool = int(pes * 1.25)
        else:
            self._proc_pool = int(proc_pool)

        self._procs_avail = self._proc_pool

        # Setup phases
        self._phases = list(PHASES)
        if self._no_setup:
            self._phases.remove(SETUP_PHASE)
        if self._no_build:
            self._phases.remove(SHAREDLIB_BUILD_PHASE)
            self._phases.remove(MODEL_BUILD_PHASE)
        if self._no_run:
            self._phases.remove(RUN_PHASE)
        if not self._baseline_cmp_name and not self._baseline_gen_name:
            self._phases.remove(NAMELIST_PHASE)

        if use_existing:
            for test in self._tests:
                ts = TestStatus(self._get_test_dir(test))
                for phase, status in ts:
                    if phase in CORE_PHASES:
                        if status in [TEST_PEND_STATUS, TEST_FAIL_STATUS]:
                            # We need to pick up here
                            break
                        else:
                            self._update_test_status(test, phase, TEST_PEND_STATUS)
                            self._update_test_status(test, phase, status)
        else:
            # None of the test directories should already exist.
            for test in self._tests:
                expect(not os.path.exists(self._get_test_dir(test)),
                       "Cannot create new case in directory '%s', it already exists."
                       " Pick a different test-id" % self._get_test_dir(test))

        # By the end of this constructor, this program should never hard abort,
        # instead, errors will be placed in the TestStatus files for the various
        # test cases

    ###########################################################################
    def _log_output(self, test, output):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        if not os.path.isdir(test_dir):
            # Note: making this directory could cause create_newcase to fail
            # if this is run before create_newcase.
            os.makedirs(test_dir)
        append_status(output, caseroot=test_dir, sfile="TestStatus.log")

    ###########################################################################
    def _get_case_id(self, test):
    ###########################################################################
        baseline_action_code = ""
        if self._baseline_gen_name:
            baseline_action_code += "G"
        if self._baseline_cmp_name:
            baseline_action_code += "C"
        if len(baseline_action_code) > 0:
            return "%s.%s.%s" % (test, baseline_action_code, self._test_id)
        else:
            return "%s.%s" % (test, self._test_id)

    ###########################################################################
    def _get_test_dir(self, test):
    ###########################################################################
        return os.path.join(self._test_root, self._get_case_id(test))

    ###########################################################################
    def _get_test_data(self, test):
    ###########################################################################
        # Must be atomic
        return self._tests[test]

    ###########################################################################
    def _is_broken(self, test):
    ###########################################################################
        status = self._get_test_status(test)
        return status not in CONTINUE and status != TEST_PEND_STATUS

    ###########################################################################
    def _work_remains(self, test):
    ###########################################################################
        test_phase, test_status, _ = self._get_test_data(test)
        return (test_status in CONTINUE or test_status == TEST_PEND_STATUS) and \
            test_phase != self._phases[-1]

    ###########################################################################
    def _get_test_status(self, test, phase=None):
    ###########################################################################
        curr_phase, curr_status, nl_fail = self._get_test_data(test)
        if phase == NAMELIST_PHASE and nl_fail:
            return NAMELIST_FAIL_STATUS
        elif phase is None or phase == curr_phase:
            return curr_status
        else:
            expect(phase is None or self._phases.index(phase) < self._phases.index(curr_phase),
                   "Tried to see the future")
            # Assume all older phases PASSed
            return TEST_PASS_STATUS

    ###########################################################################
    def _get_test_phase(self, test):
    ###########################################################################
        return self._get_test_data(test)[0]

    ###########################################################################
    def _update_test_status(self, test, phase, status):
    ###########################################################################
        phase_idx = self._phases.index(phase)
        old_phase, old_status, old_nl_fail = self._get_test_data(test)

        if old_phase == phase:
            expect(old_status == TEST_PEND_STATUS,
                   "Only valid to transition from PEND to something else, found '%s' for phase '%s'"
                   % (old_status, phase))
            expect(status != TEST_PEND_STATUS,
                   "Cannot transition from PEND -> PEND")
        else:
            expect(old_status in CONTINUE,
                   "Why did we move on to next phase when prior phase did not pass?")
            expect(status == TEST_PEND_STATUS,
                   "New phase should be set to pending status")
            expect(self._phases.index(old_phase) == phase_idx - 1,
                   "Skipped phase? %s %s" % (old_phase, phase_idx))

        # Must be atomic
        self._tests[test] = (phase, status, old_nl_fail)

    ###########################################################################
    def _test_has_nl_problem(self, test):
    ###########################################################################
        curr_phase, curr_status, _ = self._get_test_data(test)
        expect(curr_phase == NAMELIST_PHASE, "Setting namelist status outside of namelist phase?")
        # Must be atomic
        self._tests[test] = (curr_phase, curr_status, True)

    ###########################################################################
    def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None):
    ###########################################################################
        while True:
            rc, output, errput = run_cmd(cmd, from_dir=from_dir)
            if rc != 0:
                self._log_output(test,
                                 "%s FAILED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s"
                                 % (phase, test, cmd, output, errput))
                # Temporary hack to get around odd file descriptor use by
                # buildnml scripts.
                if "bad interpreter" in errput:
                    time.sleep(1)
                    continue
                else:
                    break
            else:
                # We don't want "RUN PASSED" in the TestStatus.log if the only thing that
                # succeeded was the submission.
                if phase != RUN_PHASE or self._no_batch:
                    self._log_output(test,
                                     "%s PASSED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s"
                                     % (phase, test, cmd, output, errput))
                break

        return rc == 0

    ###########################################################################
    def _create_newcase_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)

        _, case_opts, grid, compset, \
            machine, compiler, test_mods = CIME.utils.parse_test_name(test)

        create_newcase_cmd = "%s --case %s --res %s --mach %s --compiler %s --compset %s" \
                             " --test" % \
                             (os.path.join(self._cime_root, "scripts", "create_newcase"),
                              test_dir, grid, machine, compiler, compset)
        if self._project is not None:
            create_newcase_cmd += " --project %s " % self._project

        if test_mods is not None:
            files = Files()
            (component, modspath) = test_mods.split('/', 1)
            testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component})

            test_mod_file = os.path.join(testmods_dir, component, modspath)
            if not os.path.exists(test_mod_file):
                self._log_output(test, "Missing testmod file '%s'" % test_mod_file)
                return False
            create_newcase_cmd += " --user-mods-dir %s" % test_mod_file

        if case_opts is not None:
            for case_opt in case_opts: # pylint: disable=not-an-iterable
                if case_opt.startswith('M'):
                    mpilib = case_opt[1:]
                    create_newcase_cmd += " --mpilib %s" % mpilib
                    logger.debug(" MPILIB set to %s" % mpilib)
                if case_opt.startswith('N'):
                    ninst = case_opt[1:]
                    create_newcase_cmd += " --ninst %s" % ninst
                    logger.debug(" NINST set to %s" % ninst)
                if case_opt.startswith('P'):
                    pesize = case_opt[1:]
                    create_newcase_cmd += " --pecount %s" % pesize

        if self._queue is not None:
            create_newcase_cmd += " --queue=%s" % self._queue

        if self._walltime is not None:
            create_newcase_cmd += " --walltime %s" % self._walltime
        elif test in self._test_data and "options" in self._test_data[test] and \
                "wallclock" in self._test_data[test]['options']:
            create_newcase_cmd += " --walltime %s" % self._test_data[test]['options']['wallclock']

        logger.debug("Calling create_newcase: " + create_newcase_cmd)
        return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE)

    ###########################################################################
    def _xml_phase(self, test):
    ###########################################################################
        test_case = CIME.utils.parse_test_name(test)[0]

        # Create, fill and write an envtest object
        test_dir = self._get_test_dir(test)
        envtest = EnvTest(test_dir)

        # Determine list of component classes that this coupler/driver knows how
        # to deal with. This list follows the same order as compset longnames follow.
        files = Files()
        drv_config_file = files.get_value("CONFIG_DRV_FILE")
        drv_comp = Component(drv_config_file)
        envtest.add_elements_by_group(files, {}, "env_test.xml")
        envtest.add_elements_by_group(drv_comp, {}, "env_test.xml")
        envtest.set_value("TESTCASE", test_case)
        envtest.set_value("TEST_TESTID", self._test_id)
        envtest.set_value("CASEBASEID", test)
        if test in self._test_data and "options" in self._test_data[test] and \
                "memleak_tolerance" in self._test_data[test]['options']:
            envtest.set_value("TEST_MEMLEAK_TOLERANCE",
                              self._test_data[test]['options']['memleak_tolerance'])

        test_argv = "-testname %s -testroot %s" % (test, self._test_root)
        if self._baseline_gen_name:
            test_argv += " -generate %s" % self._baseline_gen_name
            basegen_case_fullpath = os.path.join(self._baseline_root, self._baseline_gen_name, test)
            logger.debug("basegen_case is %s" % basegen_case_fullpath)
            envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name)
            envtest.set_value("BASEGEN_CASE", os.path.join(self._baseline_gen_name, test))
        if self._baseline_cmp_name:
            test_argv += " -compare %s" % self._baseline_cmp_name
            envtest.set_value("BASELINE_NAME_CMP", self._baseline_cmp_name)
            envtest.set_value("BASECMP_CASE", os.path.join(self._baseline_cmp_name, test))

        envtest.set_value("TEST_ARGV", test_argv)
        envtest.set_value("CLEANUP", self._clean)

        if self._baseline_gen_name or self._baseline_cmp_name:
            envtest.set_value("BASELINE_ROOT", self._baseline_root)
        envtest.set_value("GENERATE_BASELINE", self._baseline_gen_name is not None)
        envtest.set_value("COMPARE_BASELINE", self._baseline_cmp_name is not None)
        envtest.set_value("CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC", resolved=False))

        # Add the test instructions from config_test to env_test in the case
        config_test = Tests()
        testnode = config_test.get_test_node(test_case)
        envtest.add_test(testnode)

        # Determine the test_case from the test name
        test_case, case_opts = CIME.utils.parse_test_name(test)[:2]

        # Determine case_opts from the test_case
        if case_opts is not None:
            logger.debug("case_opts are %s " % case_opts)
            for opt in case_opts:
                logger.debug("case_opt is %s" % opt)
                if opt == 'D':
                    envtest.set_test_parameter("DEBUG", "TRUE")
                    logger.debug(" DEBUG set to TRUE")
                elif opt == 'E':
                    envtest.set_test_parameter("USE_ESMF_LIB", "TRUE")
                    envtest.set_test_parameter("COMP_INTERFACE", "ESMF")
                    logger.debug(" USE_ESMF_LIB set to TRUE")
                    logger.debug(" COMP_INTERFACE set to ESMF")
                elif opt == 'CG':
                    envtest.set_test_parameter("CALENDAR", "GREGORIAN")
                    logger.debug(" CALENDAR set to %s" % opt)
                elif opt.startswith('L'):
                    match = re.match('L([A-Za-z])([0-9]*)', opt)
                    stop_option = {"y": "nyears", "m": "nmonths", "d": "ndays",
                                   "h": "nhours", "s": "nseconds", "n": "nsteps"}
                    opt = match.group(1)
                    envtest.set_test_parameter("STOP_OPTION", stop_option[opt])
                    opti = match.group(2)
                    envtest.set_test_parameter("STOP_N", opti)
                    logger.debug(" STOP_OPTION set to %s" % stop_option[opt])
                    logger.debug(" STOP_N set to %s" % opti)
                elif opt.startswith('M'):
                    # M option handled by create newcase
                    continue
                elif opt.startswith('P'):
                    # P option handled by create newcase
                    continue
                elif opt.startswith('N'):
                    # handled in create_newcase
                    continue
                elif opt.startswith('IOP'):
                    logger.warning("IOP test option not yet implemented")
                else:
                    expect(False, "Could not parse option '%s' " % opt)

        envtest.write()

        lockedfiles = os.path.join(test_dir, "LockedFiles")
        if not os.path.exists(lockedfiles):
            os.mkdir(lockedfiles)
        shutil.copy(os.path.join(test_dir, "env_run.xml"),
                    os.path.join(lockedfiles, "env_run.orig.xml"))

        with Case(test_dir, read_only=False) as case:
            case.set_value("SHAREDLIBROOT",
                           os.path.join(self._test_root, "sharedlibroot.%s" % self._test_id))
            envtest.set_initial_values(case)
            if self._save_timing:
                case.set_value("SAVE_TIMING", True)

        return True

    ###########################################################################
    def _setup_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.setup", SETUP_PHASE, from_dir=test_dir)

    ###########################################################################
    def _nlcomp_phase(self, test):
    ###########################################################################
        test_dir       = self._get_test_dir(test)
        casedoc_dir    = os.path.join(test_dir, "CaseDocs")
        compare_nl     = os.path.join(CIME.utils.get_scripts_root(), "Tools", "compare_namelists")
        simple_compare = os.path.join(CIME.utils.get_scripts_root(), "Tools", "simple_compare")

        if self._baseline_cmp_name:
            has_fails         = False
            baseline_dir      = os.path.join(self._baseline_root, self._baseline_cmp_name, test)
            baseline_casedocs = os.path.join(baseline_dir, "CaseDocs")

            # Start off by comparing everything in CaseDocs except a few arbitrary files (ugh!)
            # TODO: Namelist files should have consistent suffix
            all_items_to_compare = [item for item in glob.glob("%s/*" % casedoc_dir)
                                    if "README" not in os.path.basename(item)
                                    and not item.endswith("doc")
                                    and not item.endswith("prescribed")
                                    and not os.path.basename(item).startswith(".")] + \
                                   glob.glob("%s/*user_nl*" % test_dir)
            for item in all_items_to_compare:
                baseline_counterpart = os.path.join(baseline_casedocs
                                                    if os.path.dirname(item).endswith("CaseDocs")
                                                    else baseline_dir, os.path.basename(item))
                if not os.path.exists(baseline_counterpart):
                    self._log_output(test, "Missing baseline namelist '%s'" % baseline_counterpart)
                    has_fails = True
                else:
                    if CIME.compare_namelists.is_namelist_file(item):
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1"
                                                % (compare_nl, baseline_counterpart, item, test))
                    else:
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1"
                                                % (simple_compare, baseline_counterpart, item, test))

                    if rc != 0:
                        has_fails = True
                        self._log_output(test, output)

            if has_fails:
                self._test_has_nl_problem(test)

        if self._baseline_gen_name:
            baseline_dir      = os.path.join(self._baseline_root, self._baseline_gen_name, test)
            baseline_casedocs = os.path.join(baseline_dir, "CaseDocs")
            if not os.path.isdir(baseline_dir):
                os.makedirs(baseline_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IXOTH | stat.S_IROTH)

            if os.path.isdir(baseline_casedocs):
                shutil.rmtree(baseline_casedocs)
            shutil.copytree(casedoc_dir, baseline_casedocs)
            os.chmod(baseline_casedocs, stat.S_IRWXU | stat.S_IRWXG | stat.S_IXOTH | stat.S_IROTH)
            for item in glob.glob("%s/*" % baseline_casedocs):
                os.chmod(item, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            for item in glob.glob(os.path.join(test_dir, "user_nl*")):
                preexisting_baseline = os.path.join(baseline_dir, os.path.basename(item))
                if (os.path.exists(preexisting_baseline)):
                    os.remove(preexisting_baseline)
                shutil.copy2(item, baseline_dir)
                os.chmod(preexisting_baseline, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

        # Always mark as passed unless we hit exception
        return True
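    # For reference, CIME.utils.parse_test_name (used by the create_newcase and
    # xml phases above) splits a full test name into seven fields. A hedged
    # illustration with a hypothetical test name:
    #
    #   CIME.utils.parse_test_name("SMS_D_Ln9.f19_g16.A.melvin_gnu")
    #   # -> roughly: ['SMS', ['D', 'Ln9'], 'f19_g16', 'A', 'melvin', 'gnu', None]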
########################################################################### def _sharedlib_build_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir) ########################################################################### def _model_build_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) return self._shell_cmd_for_phase(test, "./case.build --model-only", MODEL_BUILD_PHASE, from_dir=test_dir) ########################################################################### def _run_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) if self._no_batch: cmd = "./case.submit --no-batch" else: cmd = "./case.submit " return self._shell_cmd_for_phase(test, cmd, RUN_PHASE, from_dir=test_dir) ########################################################################### def _run_catch_exceptions(self, test, phase, run): ########################################################################### try: return run(test) except (SystemExit, Exception) as e: exc_tb = sys.exc_info()[2] errput = "Test '%s' failed in phase '%s' with exception '%s'" % (test, phase, str(e)) self._log_output(test, errput) logger.warning("Caught exception: %s" % str(e)) traceback.print_tb(exc_tb) return False ########################################################################### def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False): ########################################################################### if phase == RUN_PHASE and (self._no_batch or no_batch): test_dir = self._get_test_dir(test) out = run_cmd_no_fail("./xmlquery TOTALPES -value", from_dir=test_dir) return int(out) elif (phase == SHAREDLIB_BUILD_PHASE): # Will force serialization of sharedlib builds # TODO - instead of serializing, compute all library configs needed and build # them all in parallel for _, _, running_phase in threads_in_flight.values(): if (running_phase == SHAREDLIB_BUILD_PHASE): return self._proc_pool + 1 return 1 elif (phase == MODEL_BUILD_PHASE): # Model builds now happen in parallel return 4 else: return 1 ########################################################################### def _wait_for_something_to_finish(self, threads_in_flight): ########################################################################### expect(len(threads_in_flight) <= self._parallel_jobs, "Oversubscribed?") finished_tests = [] while not finished_tests: for test, thread_info in threads_in_flight.iteritems(): if not thread_info[0].is_alive(): finished_tests.append((test, thread_info[1])) if not finished_tests: time.sleep(0.2) for finished_test, procs_needed in finished_tests: self._procs_avail += procs_needed del threads_in_flight[finished_test] ########################################################################### def _update_test_status_file(self, test, test_phase, status): ########################################################################### """ In general, test_scheduler should not be responsible for updating the TestStatus file, but there are a few cases where it has to. 
""" test_dir = self._get_test_dir(test) with TestStatus(test_dir=test_dir, test_name=test) as ts: ts.set_status(test_phase, status) ########################################################################### def _consumer(self, test, test_phase, phase_method): ########################################################################### before_time = time.time() success = self._run_catch_exceptions(test, test_phase, phase_method) elapsed_time = time.time() - before_time status = (TEST_PEND_STATUS if test_phase == RUN_PHASE and not \ self._no_batch else TEST_PASS_STATUS) if success else TEST_FAIL_STATUS if status != TEST_PEND_STATUS: self._update_test_status(test, test_phase, status) status_str = "Finished %s for test %s in %f seconds (%s)" %\ (test_phase, test, elapsed_time, status) if not success: status_str += " Case dir: %s" % self._get_test_dir(test) logger.info(status_str) if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE, NAMELIST_PHASE]: # These are the phases for which TestScheduler is reponsible for # updating the TestStatus file nl_problem = self._get_test_data(test)[2] status = TEST_FAIL_STATUS if nl_problem and test_phase == NAMELIST_PHASE else status self._update_test_status_file(test, test_phase, status) # On batch systems, we want to immediately submit to the queue, because # it's very cheap to submit and will get us a better spot in line if (success and not self._no_run and not self._no_batch and test_phase == MODEL_BUILD_PHASE): logger.info("Starting %s for test %s with 1 proc on interactive node and %d procs on compute nodes" % (RUN_PHASE, test, self._get_procs_needed(test, RUN_PHASE, no_batch=True))) self._update_test_status(test, RUN_PHASE, TEST_PEND_STATUS) self._consumer(test, RUN_PHASE, self._run_phase) ########################################################################### def _producer(self): ########################################################################### threads_in_flight = {} # test-name -> (thread, procs, phase) while True: work_to_do = False num_threads_launched_this_iteration = 0 for test in self._tests: logger.debug("test_name: " + test) # If we have no workers available, immediately wait if len(threads_in_flight) == self._parallel_jobs: self._wait_for_something_to_finish(threads_in_flight) if self._work_remains(test): work_to_do = True if test not in threads_in_flight: test_phase, test_status, _ = self._get_test_data(test) expect(test_status != TEST_PEND_STATUS, test) next_phase = self._phases[self._phases.index(test_phase) + 1] procs_needed = self._get_procs_needed(test, next_phase, threads_in_flight) if procs_needed <= self._procs_avail: self._procs_avail -= procs_needed # Necessary to print this way when multiple threads printing logger.info("Starting %s for test %s with %d procs" % (next_phase, test, procs_needed)) self._update_test_status(test, next_phase, TEST_PEND_STATUS) new_thread = threading.Thread(target=self._consumer, args=(test, next_phase, getattr(self, "_%s_phase" % next_phase.lower())) ) threads_in_flight[test] = (new_thread, procs_needed, next_phase) new_thread.start() num_threads_launched_this_iteration += 1 else: if not threads_in_flight: msg = "Phase '%s' for test '%s' required more processors, %d, than this machine can provide, %d" % \ (next_phase, test, procs_needed, self._procs_avail) logger.warning(msg) self._update_test_status(test, next_phase, TEST_PEND_STATUS) self._update_test_status(test, next_phase, TEST_FAIL_STATUS) self._log_output(test, msg) self._update_test_status_file(test, next_phase, TEST_FAIL_STATUS) 
                                num_threads_launched_this_iteration += 1

            if not work_to_do:
                break

            if num_threads_launched_this_iteration == 0:
                # No free resources, wait for something in flight to finish
                self._wait_for_something_to_finish(threads_in_flight)

        for unfinished_thread, _, _ in threads_in_flight.values():
            unfinished_thread.join()

    ###########################################################################
    def _setup_cs_files(self):
    ###########################################################################
        try:
            python_libs_root = CIME.utils.get_python_libs_root()

            template_file = os.path.join(python_libs_root, "cs.status.template")
            template = open(template_file, "r").read()
            template = template.replace("<PATH>",
                                        os.path.join(self._cime_root, "scripts", "Tools")).replace(
                                            "<TESTID>", self._test_id)
            cs_status_file = os.path.join(self._test_root, "cs.status.%s" % self._test_id)
            with open(cs_status_file, "w") as fd:
                fd.write(template)
            os.chmod(cs_status_file,
                     os.stat(cs_status_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            template_file = os.path.join(python_libs_root, "cs.submit.template")
            template = open(template_file, "r").read()
            setup_cmd = "./case.setup" if self._no_setup else ":"
            build_cmd = "./case.build" if self._no_build else ":"
            test_cmd = "./case.submit"
            template = template.replace("<SETUP_CMD>", setup_cmd).\
                replace("<BUILD_CMD>", build_cmd).\
                replace("<RUN_CMD>", test_cmd).\
                replace("<TESTID>", self._test_id)

            if self._no_run:
                cs_submit_file = os.path.join(self._test_root, "cs.submit.%s" % self._test_id)
                with open(cs_submit_file, "w") as fd:
                    fd.write(template)
                os.chmod(cs_submit_file,
                         os.stat(cs_submit_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            if CIME.utils.get_model() == "cesm":
                testreporter = os.path.join(self._test_root, "testreporter.pl")
                shutil.copy(os.path.join(self._cime_root, "scripts", "Testing", "testreporter.pl"),
                            testreporter)
                os.chmod(testreporter,
                         os.stat(testreporter).st_mode | stat.S_IXUSR | stat.S_IXGRP)

        except Exception as e:
            logger.warning("FAILED to set up cs files: %s" % str(e))

    ###########################################################################
    def run_tests(self):
    ###########################################################################
        """
        Main API for this class.

        Return True if all tests passed.
        """
        start_time = time.time()

        # Tell user what will be run
        logger.info("RUNNING TESTS:")
        for test in self._tests:
            logger.info("  %s" % test)

        # TODO - documentation
        self._producer()

        expect(threading.active_count() == 1, "Leftover threads?")

        # Setup cs files
        self._setup_cs_files()

        # Return True if all tests passed from our point of view
        logger.info("At test-scheduler close, state is:")
        rv = True
        for test in self._tests:
            phase, status, nl_fail = self._get_test_data(test)
            if status == TEST_PASS_STATUS and phase == RUN_PHASE:
                # Be cautious about telling the user that the test passed. This
                # status should match what they would see on the dashboard. Our
                # self._test_states does not include comparison fail information,
                # so we need to parse test status.
                ts = TestStatus(self._get_test_dir(test))
                status = ts.get_overall_test_status()

            if status not in [TEST_PASS_STATUS, TEST_PEND_STATUS]:
                logger.info("%s %s (phase %s)" % (status, test, phase))
                rv = False
            elif nl_fail:
                logger.info("%s %s (but otherwise OK)" % (NAMELIST_FAIL_STATUS, test))
                rv = False
            else:
                logger.info("%s %s %s" % (status, test, phase))

            logger.info("  Case dir: %s" % self._get_test_dir(test))

        logger.info("test-scheduler took %s seconds" % (time.time() - start_time))

        return rv
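###############################################################################
# Illustrative sketch, not part of the original source: a minimal driver for
# the scheduler above. The test name, test root, and test-id are hypothetical
# placeholders, and the constructor accepts many more options than shown.
###############################################################################
def _example_run_tests_driver():
    scheduler = TestScheduler(["ERS.f19_g16.X"],           # hypothetical test name
                              test_root="/scratch/tests",  # hypothetical path
                              test_id="demo01",
                              no_batch=True)               # bypass the batch queue
    return scheduler.run_tests()  # True only if every test passed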
def _compare_baseline(self): with self._test_status as ts: ts.set_status(CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_FAIL_STATUS) run_dir = self._case.get_value("RUNDIR") case_name = self._case.get_value("CASE") base_dir = os.path.join(self._case.get_value("BASELINE_ROOT"), self._case.get_value("BASECMP_CASE")) test_name = "{}".format(case_name.split('.')[-1]) evv_config = { test_name: { "module": os.path.join(evv_lib_dir, "extensions", "tsc.py"), "test-case": case_name, "test-dir": run_dir, "ref-case": "Baseline", "ref-dir": base_dir, "time-slice": [OUT_FREQ, SIM_LENGTH], "inspect-times": INSPECT_AT, "variables": VAR_LIST, "p-threshold": P_THRESHOLD, } } json_file = os.path.join(run_dir, '.'.join([case_name, 'json'])) with open(json_file, 'w') as config_file: json.dump(evv_config, config_file, indent=4) evv_out_dir = os.path.join(run_dir, '.'.join([case_name, 'evv'])) evv(['-e', json_file, '-o', evv_out_dir]) with open(os.path.join(evv_out_dir, 'index.json'), 'r') as evv_f: evv_status = json.load(evv_f) comments = "" for evv_elem in evv_status['Data']['Elements']: if evv_elem['Type'] == 'ValSummary' \ and evv_elem['TableTitle'] == 'Time step convergence test': comments = "; ".join("{}: {}".format(key, val) for key, val in evv_elem['Data'] [test_name][''].items()) if evv_elem['Data'][test_name]['']['Test status'].lower( ) == 'pass': self._test_status.set_status( CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_PASS_STATUS) break status = self._test_status.get_status( CIME.test_status.BASELINE_PHASE) mach_name = self._case.get_value("MACH") mach_obj = Machines(machine=mach_name) htmlroot = CIME.utils.get_htmlroot(mach_obj) urlroot = CIME.utils.get_urlroot(mach_obj) if htmlroot is not None: with CIME.utils.SharedArea(): dir_util.copy_tree(evv_out_dir, os.path.join(htmlroot, 'evv', case_name), preserve_mode=False) if urlroot is None: urlroot = "[{}_URL]".format(mach_name.capitalize()) viewing = "{}/evv/{}/index.html".format(urlroot, case_name) else: viewing = "{}\n" \ " EVV viewing instructions can be found at: " \ " https://github.com/E3SM-Project/E3SM/blob/master/cime/scripts/" \ "climate_reproducibility/README.md#test-passfail-and-extended-output" \ "".format(evv_out_dir) comments = "{} {} for test '{}'.\n" \ " {}\n" \ " EVV results can be viewed at:\n" \ " {}".format(CIME.test_status.BASELINE_PHASE, status, test_name, comments, viewing) CIME.utils.append_testlog(comments, self._orig_caseroot)
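###############################################################################
# Illustrative sketch, not part of the original source: the index.json shape
# that the parsing loop above expects from evv. The nesting is inferred from
# the code; the test-name key and all concrete values are invented.
###############################################################################
_EXAMPLE_EVV_INDEX_TSC = {
    "Data": {
        "Elements": [
            {
                "Type": "ValSummary",
                "TableTitle": "Time step convergence test",
                "Data": {
                    "C96_TEST": {                        # keyed by test_name
                        "": {
                            "Test status": "Pass",       # drives BASELINE_PHASE
                            "Failing time steps": "0",   # hypothetical detail row
                        }
                    }
                },
            }
        ]
    }
}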
def __init__(self, test_names, no_run=False, no_build=False, no_batch=None, test_root=None, test_id=None, machine_name=None,compiler=None, baseline_root=None, baseline_name=None, clean=False,compare=False, generate=False, namelists_only=False, project=None, parallel_jobs=None, xml_machine=None, xml_compiler=None, xml_category=None,xml_testlist=None): ########################################################################### self._cime_root = CIME.utils.get_cime_root() self._cime_model = CIME.utils.get_model() # needed for perl interface os.environ["CIMEROOT"] = self._cime_root self._machobj = Machines(machine=machine_name) machine_name = self._machobj.get_machine_name() self._no_build = no_build if not namelists_only else True self._no_run = no_run if not self._no_build else True # Figure out what project to use if (project is None): self._project = CIME.utils.get_project() if (self._project is None): self._project = self._machobj.get_value("PROJECT") else: self._project = project # We will not use batch system if user asked for no_batch or if current # machine is not a batch machine self._no_batch = no_batch or not self._machobj.has_batch_system() self._test_root = test_root if test_root is not None else self._machobj.get_value("CESMSCRATCHROOT") if (self._project is not None): self._test_root = self._test_root.replace("$PROJECT", self._project) self._test_root = os.path.abspath(self._test_root) self._test_id = test_id if test_id is not None else CIME.utils.get_utc_timestamp() self._compiler = compiler if compiler is not None else self._machobj.get_default_compiler() expect(self._machobj.is_valid_compiler(self._compiler), "Compiler %s not valid for machine %s" % (self._compiler,machine_name)) self._clean = clean self._namelists_only = namelists_only # Extra data associated with tests, do not modify after construction # test_name -> test_data # test_data: name -> value self._test_data = {} # If xml options are provided get tests from xml file, otherwise use acme dictionary if(not test_names and (xml_machine is not None or xml_category is not None or xml_compiler is not None or xml_testlist is not None)): test_data = CIME.test_utils.get_tests_from_xml(xml_machine, xml_category, xml_compiler, xml_testlist, machine_name, compiler) test_names = [item["name"] for item in test_data] for test_datum in test_data: self._test_data[test_datum["name"]] = test_datum else: expect(len(test_names) > 0, "No tests to run") test_names = update_acme_tests.get_full_test_names(test_names, machine_name, self._compiler) if (parallel_jobs is None): self._parallel_jobs = min(len(test_names), int(self._machobj.get_value("MAX_TASKS_PER_NODE"))) else: self._parallel_jobs = parallel_jobs self._baseline_cmp_name = None self._baseline_gen_name = None self._compare = False self._generate = False if (compare or generate): # Figure out what baseline name to use if (baseline_name is None): if(compare is not None and isinstance(compare,str)): self._baseline_cmp_name = compare self._compare = True if(generate is not None and isinstance(generate,str)): self._baseline_gen_name = generate self._generate = True branch_name = CIME.utils.get_current_branch(repo=self._cime_root) expect(branch_name is not None, "Could not determine baseline name from branch, please use -b option") if(self._compare and self._baseline_cmp_name is None): self._baseline_cmp_name = os.path.join(self._compiler, branch_name) if(self._generate and self._baseline_gen_name is None): self._baseline_gen_name = os.path.join(self._compiler, branch_name) else: 
if(compare): self._compare = True self._baseline_cmp_name = baseline_name if (not self._baseline_cmp_name.startswith("%s/" % self._compiler)): self._baseline_cmp_name = os.path.join(self._compiler, self._baseline_cmp_name) if(generate): self._generate = True self._baseline_gen_name = baseline_name if (not self._baseline_gen_name.startswith("%s/" % self._compiler)): self._baseline_gen_name = os.path.join(self._compiler, self._baseline_gen_name) # Compute baseline_root self._baseline_root = baseline_root if baseline_root is not None else self._machobj.get_value("CCSM_BASELINE") if (self._project is not None): self._baseline_root = self._baseline_root.replace("$PROJECT", self._project) self._baseline_root = os.path.abspath(self._baseline_root) if (self._compare): full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name) expect(os.path.isdir(full_baseline_dir), "Missing baseline comparison directory %s" % full_baseline_dir) else: self._baseline_root = None # This is the only data that multiple threads will simultaneously access # Each test has it's own value and setting/retrieving items from a dict # is atomic, so this should be fine to use without mutex. # Since the name-list phase can fail without aborting later phases, we # need some extra state to remember tests that had namelist problems. # name -> (phase, status, has_namelist_problem) self._tests = {} for test_name in test_names: self._tests[test_name] = (INITIAL_PHASE, TEST_PASS_STATUS, False) # Oversubscribe by 1/4 pes = int(self._machobj.get_value("PES_PER_NODE")) self._proc_pool = int(pes * 1.25) # Setup phases self._phases = list(PHASES) if (no_build): self._phases.remove(BUILD_PHASE) if (no_run): self._phases.remove(RUN_PHASE) if (not self._compare and not self._generate): self._phases.remove(NAMELIST_PHASE) # None of the test directories should already exist. for test in self._tests: expect(not os.path.exists(self._get_test_dir(test)), "Cannot create new case in directory '%s', it already exists. Pick a different test-id" % self._get_test_dir(test))
class EnvModule(object): # TODO - write env_mach_specific files into case # Public API def __init__(self, machine, compiler, cimeroot, caseroot, mpilib, debug=False): self._machine = Machines(machine=machine) self._compiler = compiler self._cimeroot = cimeroot self._caseroot = caseroot self._mpilib = mpilib self._debug = debug self._module_system = self._machine.get_module_system_type() def load_env_for_case(self): mach_specific = EnvMachSpecific(caseroot=self._caseroot) module_nodes = mach_specific.get_node("modules") env_nodes = mach_specific.get_node("environment_variables") if (module_nodes is not None): modules_to_load = self._compute_module_actions(module_nodes) self.load_modules(modules_to_load) if (env_nodes is not None): envs_to_set = self._compute_env_actions(env_nodes) self.load_envs(envs_to_set) def load_modules(self, modules_to_load): if (self._module_system == "module"): self._load_module_modules(modules_to_load) elif (self._module_system == "soft"): self._load_soft_modules(modules_to_load) elif (self._module_system == "dotkit"): self._load_dotkit_modules(modules_to_load) elif (self._module_system == "none"): self._load_none_modules(modules_to_load) else: expect(False, "Unhandled module system '%s'" % self._module_system) def load_envs(self, envs_to_set): for env_name, env_value in envs_to_set: # Let bash do the work on evaluating and resolving env_value os.environ[env_name] = run_cmd("echo %s" % env_value) # Private API def _compute_module_actions(self, module_nodes): return self._compute_actions(module_nodes, "command") def _compute_env_actions(self, env_nodes): return self._compute_actions(env_nodes, "env") def _compute_actions(self, nodes, child_tag): result = [] # list of tuples ("name", "argument") for node in nodes: if (self._match_attribs(node.attrib)): for child in node: expect(child.tag == child_tag, "Expected %s element" % child_tag) result.append( (child.get("name"), child.text) ) return result def _match_attribs(self, attribs): if ("compiler" in attribs and not self._match(self._compiler, attribs["compiler"])): return False elif ("mpilib" in attribs and not self._match(self._mpilib, attribs["mpilib"])): return False elif ("debug" in attribs and not self._match("TRUE" if self._debug else "FALSE", attribs["debug"].upper())): return False return True def _match(self, my_value, xml_value): if (xml_value.startswith("!")): return my_value != xml_value[1:] else: return my_value == xml_value def _load_module_modules(self, modules_to_load): python_mod_cmd = self._machine.get_module_system_cmd_path("python") for action, argument in modules_to_load: cmd = "%s %s %s" % (python_mod_cmd, action, argument) py_module_code = run_cmd(cmd) exec(py_module_code) def _load_soft_modules(self, modules_to_load): expect(False, "Not yet implemented") def _load_dotkit_modules(self, modules_to_load): expect(False, "Not yet implemented") def _load_none_modules(self, modules_to_load): expect(False, "Not yet implemented")
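###############################################################################
# Illustrative sketch, not part of the original source: exercising the
# EnvModule class above. The machine, compiler, and paths are hypothetical
# placeholders.
###############################################################################
def _example_load_env():
    env = EnvModule(machine="melvin", compiler="gnu",
                    cimeroot="/path/to/cime", caseroot="/path/to/case",
                    mpilib="mpich", debug=False)
    env.load_env_for_case()  # applies <modules> and <environment_variables>

    # _match treats a leading "!" in the xml value as negation, so an
    # attribute of "!gnu" matches every compiler except gnu:
    assert env._match("intel", "!gnu")
    assert not env._match("gnu", "!gnu")
    assert env._match("gnu", "gnu")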
def load_balancing_submit(compset, res, pesfile, mpilib, compiler, project, machine, extra_options_file, test_id, force_purge, test_root): ################################################################################ # Read in list of pes from given file if not os.access(pesfile, os.R_OK): logger.critical('ERROR: File %s not found', pesfile) raise SystemExit(1) logger.info('Reading XML file %s. Searching for pesize entries:', pesfile) try: pesobj = Pes(pesfile) except ParseError: logger.critical('ERROR: File %s not parseable', pesfile) raise SystemExit(1) pesize_list = [] for node in pesobj.get_nodes('pes'): pesize = node.get('pesize') if not pesize: logger.critical('No pesize for pes node in file %s', pesfile) if pesize in pesize_list: logger.critical('pesize %s duplicated in file %s', pesize, pesfile) pesize_list.append(pesize) if not pesize_list: logger.critical('ERROR: No grid entries found in pes file %s', pesfile) raise SystemExit(1) machobj = Machines(machine=machine) if test_root is None: test_root = machobj.get_value("CIME_OUTPUT_ROOT") if machine is None: machine = machobj.get_machine_name() print "machine is {}".format(machine) if compiler is None: compiler = machobj.get_default_compiler() print "compiler is {}".format(compiler) if mpilib is None: mpilib = machobj.get_default_MPIlib({"compiler": compiler}) test_names = [] for i in xrange(len(pesize_list)): test_names.append( get_full_test_name("PFS_I{}".format(i), grid=res, compset=compset, machine=machine, compiler=compiler)) casedir = os.path.join(test_root, test_names[-1] + "." + test_id) print "casedir is {}".format(casedir) if os.path.isdir(casedir): if force_purge: logger.info('Removing directory %s', casedir) shutil.rmtree(casedir) else: expect( False, "casedir {} already exists, use the --force-purge option, --test-root or" " --test-id options".format(casedir)) tests = TestScheduler(test_names, no_setup=True, compiler=compiler, machine_name=machine, mpilib=mpilib, test_root=test_root, test_id=test_id, project=project) success = tests.run_tests(wait=True) expect(success, "Error in creating cases") testnames = [] for test in tests.get_testnames(): testname = os.path.join(test_root, test + "." + test_id) testnames.append(testname) logger.info("test is {}".format(testname)) with Case(testname) as case: pes_ntasks, pes_nthrds, pes_rootpe, _ = \ pesobj.find_pes_layout('any', 'any', 'any', pesize_opts=pesize_list.pop(0)) for key in pes_ntasks: case.set_value(key, pes_ntasks[key]) for key in pes_nthrds: case.set_value(key, pes_nthrds[key]) for key in pes_rootpe: case.set_value(key, pes_rootpe[key])
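###############################################################################
# Illustrative sketch, not part of the original source: the kind of pes file
# load_balancing_submit expects. Only the <pes> nodes and their unique
# "pesize" attributes are inferred from the parsing code above; the root
# element name and node contents are hypothetical.
###############################################################################
_EXAMPLE_PESFILE = """
<pesize_options>
  <pes pesize="S"> ... </pes>
  <pes pesize="M"> ... </pes>
  <pes pesize="L"> ... </pes>
</pesize_options>
"""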
def query_cime(machine, param):
###############################################################################
    mach_obj = Machines(machine=machine)
    return mach_obj.get_value(param)
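###############################################################################
# Illustrative sketch, not part of the original source: looking up a single
# machine variable. The machine and variable names are placeholders.
###############################################################################
def _example_query():
    return query_cime("melvin", "BATCH_SYSTEM")  # hypothetical machine/variable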
os.chdir(caseroot) non_local = case.get_value("NONLOCAL") models = case.get_values("COMP_CLASSES") mach = case.get_value("MACH") compiler = case.get_value("COMPILER") debug = case.get_value("DEBUG") mpilib = case.get_value("MPILIB") sysos = case.get_value("OS") comp_interface = case.get_value("COMP_INTERFACE") extra_machines_dir = case.get_value("EXTRA_MACHDIR") expect(mach is not None, "xml variable MACH is not set") mach_obj = Machines(machine=mach, extra_machines_dir=extra_machines_dir) # Check that $DIN_LOC_ROOT exists or can be created: if not non_local: din_loc_root = case.get_value("DIN_LOC_ROOT") testcase = case.get_value("TESTCASE") if not os.path.isdir(din_loc_root): try: os.makedirs(din_loc_root) except OSError as e: if e.errno == errno.EACCES: logger.info("Invalid permissions to create {}".format( din_loc_root)) expect(
def create_cdash_xml(results, cdash_build_name, cdash_project, cdash_build_group, force_log_upload=False): ############################################################################### # # Create dart config file # current_time = time.time() utc_time_tuple = time.gmtime(current_time) cdash_timestamp = time.strftime("%H:%M:%S", utc_time_tuple) hostname = Machines().get_machine_name() if hostname is None: hostname = socket.gethostname().split(".")[0] logging.warning( "Could not convert hostname '{}' into an E3SM machine name".format( hostname)) for drop_method in ["https", "http"]: dart_config = """ SourceDirectory: {0} BuildDirectory: {0} # Site is something like machine.domain, i.e. pragmatic.crd Site: {1} # Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++ BuildName: {2} # Submission information IsCDash: TRUE CDashVersion: QueryCDashVersion: DropSite: my.cdash.org DropLocation: /submit.php?project={3} DropSiteUser: DropSitePassword: DropSiteMode: DropMethod: {6} TriggerSite: ScpCommand: {4} # Dashboard start time NightlyStartTime: {5} UTC UseLaunchers: CurlOptions: CURLOPT_SSL_VERIFYPEER_OFF;CURLOPT_SSL_VERIFYHOST_OFF """.format( os.getcwd(), hostname, cdash_build_name, cdash_project, find_executable("scp"), cdash_timestamp, drop_method, ) with open("DartConfiguration.tcl", "w") as dart_fd: dart_fd.write(dart_config) utc_time = time.strftime("%Y%m%d-%H%M", utc_time_tuple) testing_dir = os.path.join("Testing", utc_time) if os.path.isdir(testing_dir): shutil.rmtree(testing_dir) os.makedirs(os.path.join("Testing", utc_time)) # Make tag file with open("Testing/TAG", "w") as tag_fd: tag_fd.write("{}\n{}\n".format(utc_time, cdash_build_group)) create_cdash_xml_fakes( results, cdash_build_name, cdash_build_group, utc_time, current_time, hostname, ) create_cdash_upload_xml( results, cdash_build_name, cdash_build_group, utc_time, hostname, force_log_upload, ) stat, out, _ = run_cmd("ctest -VV -D NightlySubmit", combine_output=True) if stat != 0: logging.warning("ctest upload drop method {} FAILED:\n{}".format( drop_method, out)) else: logging.info("Upload SUCCESS:\n{}".format(out)) return expect(False, "All cdash upload attempts failed")
"Macros.cmake" ] for file_to_clean in files_to_clean: if os.path.exists(file_to_clean) and not (keep and file_to_clean in keep): os.remove(file_to_clean) logger.info("Successfully cleaned {}".format(file_to_clean)) if not test_mode: # rebuild the models (even on restart) case.set_value("BUILD_COMPLETE", False) # Cannot leave case in bad state (missing env_mach_specific.xml) if clean and not os.path.isfile("env_mach_specific.xml"): case.flush() configure(Machines(machine=mach, extra_machines_dir=extra_machines_dir), caseroot, ["Makefile"], compiler, mpilib, debug, comp_interface, sysos, noenv=True, extra_machines_dir=extra_machines_dir) case.read_xml() if not clean: if not non_local: case.load_env() # creates the Macros.make, Depends.compiler, Depends.machine, Depends.machine.compiler
def create_cdash_xml(results, cdash_build_name, cdash_project, cdash_build_group): ############################################################################### # # Create dart config file # current_time = time.time() utc_time_tuple = time.gmtime(current_time) cdash_timestamp = time.strftime("%H:%M:%S", utc_time_tuple) hostname = Machines().get_machine_name() if (hostname is None): hostname = socket.gethostname().split(".")[0] logging.warning( "Could not convert hostname '{}' into an ACME machine name".format( hostname)) dart_config = \ """ SourceDirectory: {0} BuildDirectory: {0} # Site is something like machine.domain, i.e. pragmatic.crd Site: {1} # Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++ BuildName: {2} # Submission information IsCDash: TRUE CDashVersion: QueryCDashVersion: DropSite: my.cdash.org DropLocation: /submit.php?project={3} DropSiteUser: DropSitePassword: DropSiteMode: DropMethod: http TriggerSite: ScpCommand: {4} # Dashboard start time NightlyStartTime: {5} UTC """.format(os.getcwd(), hostname, cdash_build_name, cdash_project, distutils.spawn.find_executable("scp"), cdash_timestamp) with open("DartConfiguration.tcl", "w") as dart_fd: dart_fd.write(dart_config) utc_time = time.strftime('%Y%m%d-%H%M', utc_time_tuple) os.makedirs(os.path.join("Testing", utc_time)) # Make tag file with open("Testing/TAG", "w") as tag_fd: tag_fd.write("{}\n{}\n".format(utc_time, cdash_build_group)) create_cdash_test_xml(results, cdash_build_name, cdash_build_group, utc_time, current_time, hostname) create_cdash_upload_xml(results, cdash_build_name, cdash_build_group, utc_time, hostname) CIME.utils.run_cmd_no_fail("ctest -VV -D NightlySubmit", verbose=True)
def _compare_baseline(self): with self._test_status: if int(self._case.get_value("RESUBMIT")) > 0: # This is here because the comparison is run for each submission # and we only want to compare once the whole run is finished. We # need to return a pass here to continue the submission process. self._test_status.set_status( CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_PASS_STATUS ) return self._test_status.set_status( CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_FAIL_STATUS ) run_dir = self._case.get_value("RUNDIR") case_name = self._case.get_value("CASE") base_dir = os.path.join( self._case.get_value("BASELINE_ROOT"), self._case.get_value("BASECMP_CASE"), ) test_name = "{}".format(case_name.split(".")[-1]) evv_config = { test_name: { "module": os.path.join(evv_lib_dir, "extensions", "ks.py"), "test-case": "Test", "test-dir": run_dir, "ref-case": "Baseline", "ref-dir": base_dir, "var-set": "default", "ninst": NINST, "critical": 13, "component": self.component, } } json_file = os.path.join(run_dir, ".".join([case_name, "json"])) with open(json_file, "w") as config_file: json.dump(evv_config, config_file, indent=4) evv_out_dir = os.path.join(run_dir, ".".join([case_name, "evv"])) evv(["-e", json_file, "-o", evv_out_dir]) with open(os.path.join(evv_out_dir, "index.json")) as evv_f: evv_status = json.load(evv_f) comments = "" for evv_ele in evv_status["Page"]["elements"]: if "Table" in evv_ele: comments = "; ".join( "{}: {}".format(key, val[0]) for key, val in evv_ele["Table"]["data"].items() ) if evv_ele["Table"]["data"]["Test status"][0].lower() == "pass": self._test_status.set_status( CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_PASS_STATUS, ) break status = self._test_status.get_status(CIME.test_status.BASELINE_PHASE) mach_name = self._case.get_value("MACH") mach_obj = Machines(machine=mach_name) htmlroot = CIME.utils.get_htmlroot(mach_obj) urlroot = CIME.utils.get_urlroot(mach_obj) if htmlroot is not None: with CIME.utils.SharedArea(): dir_util.copy_tree( evv_out_dir, os.path.join(htmlroot, "evv", case_name), preserve_mode=False, ) if urlroot is None: urlroot = "[{}_URL]".format(mach_name.capitalize()) viewing = "{}/evv/{}/index.html".format(urlroot, case_name) else: viewing = ( "{}\n" " EVV viewing instructions can be found at: " " https://github.com/E3SM-Project/E3SM/blob/master/cime/scripts/" "climate_reproducibility/README.md#test-passfail-and-extended-output" "".format(evv_out_dir) ) comments = ( "{} {} for test '{}'.\n" " {}\n" " EVV results can be viewed at:\n" " {}".format( CIME.test_status.BASELINE_PHASE, status, test_name, comments, viewing, ) ) CIME.utils.append_testlog(comments, self._orig_caseroot)
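###############################################################################
# Illustrative sketch, not part of the original source: the index.json shape
# the newer parser above expects (note "Page"/"elements" rather than the
# "Data"/"Elements" layout used by the older version). Each table entry maps
# a row name to a list, and "Test status"[0] decides the phase; all values
# here are invented.
###############################################################################
_EXAMPLE_EVV_INDEX_KS = {
    "Page": {
        "elements": [
            {
                "Table": {
                    "data": {
                        "Test status": ["Pass"],      # [0] drives BASELINE_PHASE
                        "Rejected instances": ["0"],  # hypothetical detail row
                    }
                }
            }
        ]
    }
}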
class TestScheduler(object): ############################################################################### ########################################################################### def __init__(self, test_names, test_data=None, no_run=False, no_build=False, no_setup=False, no_batch=None, test_root=None, test_id=None, machine_name=None, compiler=None, baseline_root=None, baseline_cmp_name=None, baseline_gen_name=None, clean=False, namelists_only=False, project=None, parallel_jobs=None, walltime=None, proc_pool=None, use_existing=False, save_timing=False, queue=None, allow_baseline_overwrite=False, output_root=None): ########################################################################### self._cime_root = CIME.utils.get_cime_root() self._cime_model = CIME.utils.get_model() self._allow_baseline_overwrite = allow_baseline_overwrite self._save_timing = save_timing self._queue = queue self._test_data = {} if test_data is None else test_data # Format: {test_name -> {data_name -> data}} self._machobj = Machines(machine=machine_name) self._no_setup = no_setup self._no_build = no_build or no_setup or namelists_only self._no_run = no_run or self._no_build self._output_root = output_root # Figure out what project to use if project is None: self._project = CIME.utils.get_project() if self._project is None: self._project = self._machobj.get_value("PROJECT") else: self._project = project # We will not use batch system if user asked for no_batch or if current # machine is not a batch machine self._no_batch = no_batch or not self._machobj.has_batch_system() expect(not (self._no_batch and self._queue is not None), "Does not make sense to request a queue without batch system") # Determine and resolve test_root if test_root is not None: self._test_root = test_root elif self._output_root is not None: self._test_root = self._output_root else: self._test_root = self._machobj.get_value("CIME_OUTPUT_ROOT") if self._project is not None: self._test_root = self._test_root.replace("$PROJECT", self._project) self._test_root = os.path.abspath(self._test_root) self._test_id = test_id if test_id is not None else CIME.utils.get_timestamp( ) self._compiler = self._machobj.get_default_compiler( ) if compiler is None else compiler self._clean = clean self._namelists_only = namelists_only self._walltime = walltime if parallel_jobs is None: self._parallel_jobs = min( len(test_names), int(self._machobj.get_value("MAX_TASKS_PER_NODE"))) else: self._parallel_jobs = parallel_jobs self._baseline_cmp_name = baseline_cmp_name # Implies comparison should be done if not None self._baseline_gen_name = baseline_gen_name # Implies generation should be done if not None if baseline_cmp_name or baseline_gen_name: # Compute baseline_root self._baseline_root = baseline_root if baseline_root is not None \ else self._machobj.get_value("BASELINE_ROOT") if self._project is not None: self._baseline_root = self._baseline_root.replace( "$PROJECT", self._project) self._baseline_root = os.path.abspath(self._baseline_root) if self._baseline_cmp_name: full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name) expect( os.path.isdir(full_baseline_dir), "Missing baseline comparison directory %s" % full_baseline_dir) # the following is to assure that the existing generate directory is not overwritten if self._baseline_gen_name: full_baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name) existing_baselines = [] for test_name in test_names: test_baseline = os.path.join(full_baseline_dir, test_name) if 
os.path.isdir(test_baseline): existing_baselines.append(test_baseline) expect(allow_baseline_overwrite or len(existing_baselines) == 0, "Baseline directories already exists %s\n"\ "Use --allow_baseline_overwrite to avoid this error"%existing_baselines) else: self._baseline_root = None # This is the only data that multiple threads will simultaneously access # Each test has it's own value and setting/retrieving items from a dict # is atomic, so this should be fine to use without mutex. # name -> (phase, status) self._tests = {} for test_name in test_names: self._tests[test_name] = (TEST_START, TEST_PASS_STATUS) # Oversubscribe by 1/4 if proc_pool is None: pes = int(self._machobj.get_value("PES_PER_NODE")) self._proc_pool = int(pes * 1.25) else: self._proc_pool = int(proc_pool) self._procs_avail = self._proc_pool # Setup phases self._phases = list(PHASES) if self._no_setup: self._phases.remove(SETUP_PHASE) if self._no_build: self._phases.remove(SHAREDLIB_BUILD_PHASE) self._phases.remove(MODEL_BUILD_PHASE) if self._no_run: self._phases.remove(RUN_PHASE) if use_existing: for test in self._tests: ts = TestStatus(self._get_test_dir(test)) for phase, status in ts: if phase in CORE_PHASES: if status in [TEST_PEND_STATUS, TEST_FAIL_STATUS]: # We need to pick up here break else: self._update_test_status(test, phase, TEST_PEND_STATUS) self._update_test_status(test, phase, status) else: # None of the test directories should already exist. for test in self._tests: expect( not os.path.exists(self._get_test_dir(test)), "Cannot create new case in directory '%s', it already exists." " Pick a different test-id" % self._get_test_dir(test)) # By the end of this constructor, this program should never hard abort, # instead, errors will be placed in the TestStatus files for the various # tests cases ########################################################################### def _log_output(self, test, output): ########################################################################### test_dir = self._get_test_dir(test) if not os.path.isdir(test_dir): # Note: making this directory could cause create_newcase to fail # if this is run before. 
os.makedirs(test_dir) append_status(output, caseroot=test_dir, sfile="TestStatus.log") ########################################################################### def _get_case_id(self, test): ########################################################################### baseline_action_code = "" if self._baseline_gen_name: baseline_action_code += "G" if self._baseline_cmp_name: baseline_action_code += "C" if len(baseline_action_code) > 0: return "%s.%s.%s" % (test, baseline_action_code, self._test_id) else: return "%s.%s" % (test, self._test_id) ########################################################################### def _get_test_dir(self, test): ########################################################################### return os.path.join(self._test_root, self._get_case_id(test)) ########################################################################### def _get_test_data(self, test): ########################################################################### # Must be atomic return self._tests[test] ########################################################################### def _is_broken(self, test): ########################################################################### status = self._get_test_status(test) return status != TEST_PASS_STATUS and status != TEST_PEND_STATUS ########################################################################### def _work_remains(self, test): ########################################################################### test_phase, test_status = self._get_test_data(test) return (test_status == TEST_PASS_STATUS or test_status == TEST_PEND_STATUS) and\ test_phase != self._phases[-1] ########################################################################### def _get_test_status(self, test, phase=None): ########################################################################### curr_phase, curr_status = self._get_test_data(test) if phase is None or phase == curr_phase: return curr_status else: expect( phase is None or self._phases.index(phase) < self._phases.index(curr_phase), "Tried to see the future") # Assume all older phases PASSed return TEST_PASS_STATUS ########################################################################### def _get_test_phase(self, test): ########################################################################### return self._get_test_data(test)[0] ########################################################################### def _update_test_status(self, test, phase, status): ########################################################################### phase_idx = self._phases.index(phase) old_phase, old_status = self._get_test_data(test) if old_phase == phase: expect( old_status == TEST_PEND_STATUS, "Only valid to transition from PEND to something else, found '%s' for phase '%s'" % (old_status, phase)) expect(status != TEST_PEND_STATUS, "Cannot transition from PEND -> PEND") else: expect( old_status == TEST_PASS_STATUS, "Why did we move on to next phase when prior phase did not pass?" ) expect(status == TEST_PEND_STATUS, "New phase should be set to pending status") expect( self._phases.index(old_phase) == phase_idx - 1, "Skipped phase? 
%s %s" % (old_phase, phase_idx)) # Must be atomic self._tests[test] = (phase, status) ########################################################################### def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None): ########################################################################### while True: rc, output, errput = run_cmd(cmd, from_dir=from_dir) if rc != 0: self._log_output( test, "%s FAILED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s" % (phase, test, cmd, output, errput)) # Temporary hack to get around odd file descriptor use by # buildnml scripts. if "bad interpreter" in errput: time.sleep(1) continue else: break else: # We don't want "RUN PASSED" in the TestStatus.log if the only thing that # succeeded was the submission. if phase != RUN_PHASE or self._no_batch: self._log_output( test, "%s PASSED for test '%s'.\nCommand: %s\nOutput: %s" % (phase, test, cmd, output)) break return rc == 0 ########################################################################### def _create_newcase_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) _, case_opts, grid, compset,\ machine, compiler, test_mods = CIME.utils.parse_test_name(test) create_newcase_cmd = "%s --case %s --res %s --mach %s --compiler %s --compset %s"\ " --test" % \ (os.path.join(self._cime_root, "scripts", "create_newcase"), test_dir, grid, machine, compiler, compset) if self._project is not None: create_newcase_cmd += " --project %s " % self._project if self._output_root is not None: create_newcase_cmd += " --output-root %s " % self._output_root if test_mods is not None: files = Files() (component, modspath) = test_mods.split('/', 1) testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component}) test_mod_file = os.path.join(testmods_dir, component, modspath) if not os.path.exists(test_mod_file): self._log_output(test, "Missing testmod file '%s'" % test_mod_file) return False create_newcase_cmd += " --user-mods-dir %s" % test_mod_file if case_opts is not None: for case_opt in case_opts: # pylint: disable=not-an-iterable if case_opt.startswith('M'): mpilib = case_opt[1:] create_newcase_cmd += " --mpilib %s" % mpilib logger.debug(" MPILIB set to %s" % mpilib) if case_opt.startswith('N'): ninst = case_opt[1:] create_newcase_cmd += " --ninst %s" % ninst logger.debug(" NINST set to %s" % ninst) if case_opt.startswith('P'): pesize = case_opt[1:] create_newcase_cmd += " --pecount %s" % pesize if self._queue is not None: create_newcase_cmd += " --queue=%s" % self._queue if self._walltime is not None: create_newcase_cmd += " --walltime %s" % self._walltime elif test in self._test_data and "options" in self._test_data[test] and \ "wallclock" in self._test_data[test]['options']: create_newcase_cmd += " --walltime %s" % self._test_data[test][ 'options']['wallclock'] logger.debug("Calling create_newcase: " + create_newcase_cmd) return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE) ########################################################################### def _xml_phase(self, test): ########################################################################### test_case = CIME.utils.parse_test_name(test)[0] # Create, fill and write an envtest object test_dir = self._get_test_dir(test) envtest = EnvTest(test_dir) # Determine list of component classes that this coupler/driver knows how # to deal with. This list follows the same order as compset longnames follow. 
files = Files() drv_config_file = files.get_value("CONFIG_CPL_FILE") drv_comp = Component(drv_config_file) envtest.add_elements_by_group(files, {}, "env_test.xml") envtest.add_elements_by_group(drv_comp, {}, "env_test.xml") envtest.set_value("TESTCASE", test_case) envtest.set_value("TEST_TESTID", self._test_id) envtest.set_value("CASEBASEID", test) if test in self._test_data and "options" in self._test_data[test] and \ "memleak_tolerance" in self._test_data[test]['options']: envtest.set_value( "TEST_MEMLEAK_TOLERANCE", self._test_data[test]['options']['memleak_tolerance']) test_argv = "-testname %s -testroot %s" % (test, self._test_root) if self._baseline_gen_name: test_argv += " -generate %s" % self._baseline_gen_name basegen_case_fullpath = os.path.join(self._baseline_root, self._baseline_gen_name, test) logger.debug("basegen_case is %s" % basegen_case_fullpath) envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name) envtest.set_value("BASEGEN_CASE", os.path.join(self._baseline_gen_name, test)) if self._baseline_cmp_name: test_argv += " -compare %s" % self._baseline_cmp_name envtest.set_value("BASELINE_NAME_CMP", self._baseline_cmp_name) envtest.set_value("BASECMP_CASE", os.path.join(self._baseline_cmp_name, test)) envtest.set_value("TEST_ARGV", test_argv) envtest.set_value("CLEANUP", self._clean) if self._baseline_gen_name or self._baseline_cmp_name: envtest.set_value("BASELINE_ROOT", self._baseline_root) envtest.set_value("GENERATE_BASELINE", self._baseline_gen_name is not None) envtest.set_value("COMPARE_BASELINE", self._baseline_cmp_name is not None) envtest.set_value( "CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC", resolved=False)) # Add the test instructions from config_test to env_test in the case config_test = Tests() testnode = config_test.get_test_node(test_case) envtest.add_test(testnode) # Determine the test_case from the test name test_case, case_opts = CIME.utils.parse_test_name(test)[:2] # Determine case_opts from the test_case if case_opts is not None: logger.debug("case_opts are %s " % case_opts) for opt in case_opts: logger.debug("case_opt is %s" % opt) if opt == 'D': envtest.set_test_parameter("DEBUG", "TRUE") logger.debug(" DEBUG set to TRUE") elif opt == 'E': envtest.set_test_parameter("USE_ESMF_LIB", "TRUE") envtest.set_test_parameter("COMP_INTERFACE", "ESMF") logger.debug(" USE_ESMF_LIB set to TRUE") logger.debug(" COMP_INTERFACE set to ESMF") elif opt == 'CG': envtest.set_test_parameter("CALENDAR", "GREGORIAN") logger.debug(" CALENDAR set to %s" % opt) elif opt.startswith('L'): match = re.match('L([A-Za-z])([0-9]*)', opt) stop_option = { "y": "nyears", "m": "nmonths", "d": "ndays", "h": "nhours", "s": "nseconds", "n": "nsteps" } opt = match.group(1) envtest.set_test_parameter("STOP_OPTION", stop_option[opt]) opti = match.group(2) envtest.set_test_parameter("STOP_N", opti) logger.debug(" STOP_OPTION set to %s" % stop_option[opt]) logger.debug(" STOP_N set to %s" % opti) elif opt.startswith('M'): # M option handled by create newcase continue elif opt.startswith('P'): # P option handled by create newcase continue elif opt.startswith('N'): # handled in create_newcase continue elif opt.startswith('IOP'): logger.warn("IOP test option not yet implemented") else: expect(False, "Could not parse option '%s' " % opt) envtest.write() lockedfiles = os.path.join(test_dir, "LockedFiles") if not os.path.exists(lockedfiles): os.mkdir(lockedfiles) shutil.copy(os.path.join(test_dir, "env_run.xml"), os.path.join(lockedfiles, "env_run.orig.xml")) with Case(test_dir, 
read_only=False) as case: if self._output_root is None: self._output_root = case.get_value("CIME_OUTPUT_ROOT") case.set_value( "SHAREDLIBROOT", os.path.join(self._output_root, "sharedlibroot.%s" % self._test_id)) envtest.set_initial_values(case) case.set_value("TEST", True) if self._save_timing: case.set_value("SAVE_TIMING", True) return True
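###############################################################################
# Worked example, not part of the original source: how the 'L' case option is
# decoded in _xml_phase above. An option like "Ld5" splits into a unit letter
# and a count, which become STOP_OPTION and STOP_N.
###############################################################################
import re

_lopt = re.match(r'L([A-Za-z])([0-9]*)', 'Ld5')
_stop_option = {"y": "nyears", "m": "nmonths", "d": "ndays",
                "h": "nhours", "s": "nseconds", "n": "nsteps"}
assert _stop_option[_lopt.group(1)] == "ndays"  # -> STOP_OPTION
assert _lopt.group(2) == "5"                    # -> STOP_N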
def parse_command_line(args, description):
###############################################################################
    help_str = """
    Solve a Mixed Integer Linear Program to find a PE layout that minimizes
    the wall-clock time per model day.
    """
    parser = argparse.ArgumentParser(usage=help_str,
                                     description=description,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    CIME.utils.setup_standard_logging_options(parser)

    parser.add_argument('--test-id', default=DEFAULT_TESTID,
                        help='test-id to use for all timing runs')

    parser.add_argument("-r", "--test-root",
                        help="Where test cases were created."
                        " Will default to output root as defined in the config_machines file")

    parser.add_argument('--timing-dir',
                        help='alternative to using casename to find timing data, '
                        'instead read all files in this directory')

    parser.add_argument('--blocksize', type=int,
                        help='default minimum size of blocks to assign to all '
                        'components. Components can be assigned different '
                        'blocksizes using --blocksize_XXX. Default 1')

    for c in COMPONENT_LIST:
        parser.add_argument('--blocksize-%s' % c.lower(), type=int,
                            help='minimum blocksize for component %s, if '
                            'different from --blocksize' % c)

    parser.add_argument('--total-tasks', type=int,
                        help='Number of pes available for assignment')

    parser.add_argument("--layout",
                        help="name of layout to solve (default selected internally)")

    parser.add_argument("--graph-models", action="store_true",
                        help="plot cost v. ntasks models. requires matplotlib")

    parser.add_argument("--print-models", action="store_true",
                        help="print all costs and ntasks")

    parser.add_argument("--pe-output", help="write pe layout to file")

    parser.add_argument('--json-output', help="write MILP data to .json file")

    parser.add_argument('--json-input', help="solve using data from .json file")

    args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)

    expect(args.total_tasks is not None or args.json_input is not None,
           "--total-tasks or --json-input option must be set")

    blocksizes = {}
    for c in COMPONENT_LIST:
        attrib = 'blocksize_%s' % c.lower()
        if getattr(args, attrib) is not None:
            blocksizes[c] = getattr(args, attrib)
        elif args.blocksize is not None:
            blocksizes[c] = args.blocksize

    test_root = args.test_root
    if test_root is None:
        machobj = Machines()
        test_root = machobj.get_value("CIME_OUTPUT_ROOT")

    return (args.test_id, test_root, args.timing_dir, blocksizes,
            args.total_tasks, args.layout, args.graph_models, args.print_models,
            args.pe_output, args.json_output, args.json_input)
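###############################################################################
# Illustrative sketch, not part of the original source: unpacking the tuple
# returned by parse_command_line. The argv values are hypothetical.
###############################################################################
def _example_parse():
    (test_id, test_root, timing_dir, blocksizes, total_tasks, layout,
     graph_models, print_models, pe_output, json_output, json_input) = \
        parse_command_line(["--test-id", "lbt01", "--total-tasks", "1024"],
                           "load balancing solver")
    return test_id, total_tasks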
    def configure(self, compset_name, grid_name, machine_name=None,
                  project=None, pecount=None, compiler=None, mpilib=None,
                  user_compset=False, pesfile=None,
                  user_grid=False, gridfile=None, ninst=1, test=False):

        #--------------------------------------------
        # compset, pesfile, and compset components
        #--------------------------------------------
        self._set_compset_and_pesfile(compset_name, user_compset=user_compset, pesfile=pesfile)

        self._components = self.get_compset_components()
        #FIXME - if --user-compset is True then need to determine that
        #all of the compset settings are valid

        #--------------------------------------------
        # grid
        #--------------------------------------------
        if user_grid is True and gridfile is not None:
            self.set_value("GRIDS_SPEC_FILE", gridfile)
        grids = Grids(gridfile)

        gridinfo = grids.get_grid_info(name=grid_name, compset=self._compsetname)
        self._gridname = gridinfo["GRID"]
        for key, value in gridinfo.items():
            logger.debug("Set grid %s %s" % (key, value))
            self.set_value(key, value)

        #--------------------------------------------
        # component config data
        #--------------------------------------------
        self._get_component_config_data()

        self.get_compset_var_settings()

        # Add the group and elements for the config_files.xml
        for idx, config_file in enumerate(self._component_config_files):
            self.set_value(config_file[0], config_file[1])

        #--------------------------------------------
        # machine
        #--------------------------------------------
        # set machine values in env_xxx files
        machobj = Machines(machine=machine_name)
        machine_name = machobj.get_machine_name()
        self.set_value("MACH", machine_name)
        nodenames = machobj.get_node_names()
        nodenames = [x for x in nodenames if
                     '_system' not in x and '_variables' not in x and
                     'mpirun' not in x and 'COMPILER' not in x and 'MPILIB' not in x]

        for nodename in nodenames:
            value = machobj.get_value(nodename)
            type_str = self.get_type_info(nodename)
            if type_str is not None:
                self.set_value(nodename, convert_to_type(value, type_str, nodename))

        if compiler is None:
            compiler = machobj.get_default_compiler()
        else:
            expect(machobj.is_valid_compiler(compiler),
                   "compiler %s is not supported on machine %s" % (compiler, machine_name))
        self.set_value("COMPILER", compiler)

        if mpilib is None:
            mpilib = machobj.get_default_MPIlib({"compiler": compiler})
        else:
            expect(machobj.is_valid_MPIlib(mpilib, {"compiler": compiler}),
                   "MPIlib %s is not supported on machine %s" % (mpilib, machine_name))
        self.set_value("MPILIB", mpilib)

        machdir = machobj.get_machines_dir()
        self.set_value("MACHDIR", machdir)

        # Overwriting an existing exeroot or rundir can cause problems
        exeroot = self.get_value("EXEROOT")
        rundir = self.get_value("RUNDIR")
        for wdir in (exeroot, rundir):
            if os.path.exists(wdir):
                expect(not test, "Directory %s already exists, aborting test" % wdir)
                response = raw_input("\nDirectory %s already exists, (r)eplace, (a)bort, or (u)se existing?" % wdir)
                if response.startswith("r"):
                    shutil.rmtree(wdir)
                else:
                    expect(response.startswith("u"), "Aborting by user request")

        # the following go into the env_mach_specific file
        items = ("module_system", "environment_variables", "mpirun")
        env_mach_specific_obj = self._get_env("mach_specific")
        for item in items:
            nodes = machobj.get_first_child_nodes(item)
            for node in nodes:
                env_mach_specific_obj.add_child(node)

        #--------------------------------------------
        # pe layout
        #--------------------------------------------
        pesobj = Pes(self._pesfile)

        #FIXME - add pesize_opts as optional argument below
        pes_ntasks, pes_nthrds, pes_rootpe = pesobj.find_pes_layout(self._gridname,
                                                                    self._compsetname,
                                                                    machine_name,
                                                                    pesize_opts=pecount)
        mach_pes_obj = self._get_env("mach_pes")
        totaltasks = {}
        for key, value in pes_ntasks.items():
            totaltasks[key[-3:]] = int(value)
            mach_pes_obj.set_value(key, int(value))
        for key, value in pes_rootpe.items():
            totaltasks[key[-3:]] += int(value)
            mach_pes_obj.set_value(key, int(value))
        for key, value in pes_nthrds.items():
            totaltasks[key[-3:]] *= int(value)
            mach_pes_obj.set_value(key, int(value))

        maxval = 1
        pes_per_node = mach_pes_obj.get_value("PES_PER_NODE")
        for key, val in totaltasks.items():
            if val < 0:
                val = -1 * val * pes_per_node
            if val > maxval:
                maxval = val

        # Make sure that every component has been accounted for;
        # set nthrds and ntasks to 1 otherwise. Also set the ninst values here.
        for compclass in self._component_classes:
            if compclass == "DRV":
                continue
            key = "NINST_%s" % compclass
            mach_pes_obj.set_value(key, ninst)
            key = "NTASKS_%s" % compclass
            if key not in pes_ntasks.keys():
                mach_pes_obj.set_value(key, 1)
            key = "NTHRDS_%s" % compclass
            if key not in pes_nthrds.keys():
                mach_pes_obj.set_value(key, 1)

        # FIXME - this is a short term fix for dealing with the restriction that
        # CISM1 cannot run on multiple cores
        if "CISM1" in self._compsetname:
            mach_pes_obj.set_value("NTASKS_GLC", 1)
            mach_pes_obj.set_value("NTHRDS_GLC", 1)

        #--------------------------------------------
        # batch system
        #--------------------------------------------
        batch_system_type = machobj.get_value("BATCH_SYSTEM")
        batch = Batch(batch_system=batch_system_type, machine=machine_name)
        bjobs = batch.get_batch_jobs()
        env_batch = self._get_env("batch")
        env_batch.set_batch_system(batch, batch_system_type=batch_system_type)
        env_batch.create_job_groups(bjobs)
        env_batch.set_job_defaults(bjobs, pesize=maxval)
        self._env_files_that_need_rewrite.add(env_batch)

        self.set_value("COMPSET", self._compsetname)

        self._set_pio_xml()
        logger.info(" Compset is: %s " % self._compsetname)
        logger.info(" Grid is: %s " % self._gridname)
        logger.info(" Components in compset are: %s " % self._components)

        # miscellaneous settings
        if self.get_value("RUN_TYPE") == 'hybrid':
            self.set_value("GET_REFCASE", True)

        # Set project id
        if project is None:
            project = get_project(machobj)
        if project is not None:
            self.set_value("PROJECT", project)
        elif machobj.get_value("PROJECT_REQUIRED"):
            expect(project is not None, "PROJECT_REQUIRED is true but no project found")
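###############################################################################
# Worked example, not part of the original source: the task accounting above
# computes, per component, (NTASKS + ROOTPE) * NTHRDS, and treats a negative
# result as a whole-node count to be scaled by PES_PER_NODE. All values below
# are hypothetical.
###############################################################################
def _example_task_accounting():
    pes_per_node = 36                    # hypothetical machine value
    ntasks, rootpe, nthrds = 128, 16, 2  # hypothetical ATM layout
    val = (ntasks + rootpe) * nthrds     # 288 PEs for this component
    if val < 0:                          # negative means whole nodes
        val = -1 * val * pes_per_node
    return max(1, val)                   # feeds set_job_defaults(pesize=...)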
class TestScheduler(object): ############################################################################### ########################################################################### def __init__(self, test_names, test_data=None, no_run=False, no_build=False, no_setup=False, no_batch=None, test_root=None, test_id=None, machine_name=None, compiler=None, baseline_root=None, baseline_cmp_name=None, baseline_gen_name=None, clean=False, namelists_only=False, project=None, parallel_jobs=None, walltime=None, proc_pool=None, use_existing=False, save_timing=False, queue=None, allow_baseline_overwrite=False, output_root=None, force_procs=None, force_threads=None, mpilib=None, input_dir=None, pesfile=None, mail_user=None, mail_type=None): ########################################################################### self._cime_root = CIME.utils.get_cime_root() self._cime_model = get_model() self._save_timing = save_timing self._queue = queue self._test_data = {} if test_data is None else test_data # Format: {test_name -> {data_name -> data}} self._mpilib = mpilib # allow override of default mpilib self._completed_tests = 0 self._input_dir = input_dir self._pesfile = pesfile self._allow_baseline_overwrite = allow_baseline_overwrite self._mail_user = mail_user self._mail_type = mail_type self._machobj = Machines(machine=machine_name) self._model_build_cost = 4 # If user is forcing procs or threads, re-write test names to reflect this. if force_procs or force_threads: test_names = _translate_test_names_for_new_pecount(test_names, force_procs, force_threads) self._no_setup = no_setup self._no_build = no_build or no_setup or namelists_only self._no_run = no_run or self._no_build self._output_root = output_root # Figure out what project to use if project is None: self._project = CIME.utils.get_project() if self._project is None: self._project = self._machobj.get_value("PROJECT") else: self._project = project # We will not use batch system if user asked for no_batch or if current # machine is not a batch machine self._no_batch = no_batch or not self._machobj.has_batch_system() expect(not (self._no_batch and self._queue is not None), "Does not make sense to request a queue without batch system") # Determine and resolve test_root if test_root is not None: self._test_root = test_root elif self._output_root is not None: self._test_root = self._output_root else: self._test_root = self._machobj.get_value("CIME_OUTPUT_ROOT") if self._project is not None: self._test_root = self._test_root.replace("$PROJECT", self._project) self._test_root = os.path.abspath(self._test_root) self._test_id = test_id if test_id is not None else CIME.utils.get_timestamp() self._compiler = self._machobj.get_default_compiler() if compiler is None else compiler self._clean = clean self._namelists_only = namelists_only self._walltime = walltime if parallel_jobs is None: self._parallel_jobs = min(len(test_names), self._machobj.get_value("MAX_MPITASKS_PER_NODE")) else: self._parallel_jobs = parallel_jobs self._baseline_cmp_name = baseline_cmp_name # Implies comparison should be done if not None self._baseline_gen_name = baseline_gen_name # Implies generation should be done if not None # Compute baseline_root self._baseline_root = baseline_root if baseline_root is not None \ else self._machobj.get_value("BASELINE_ROOT") if self._project is not None: self._baseline_root = self._baseline_root.replace("$PROJECT", self._project) self._baseline_root = os.path.abspath(self._baseline_root) if baseline_cmp_name or baseline_gen_name: if 
self._baseline_cmp_name:
            full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
            expect(os.path.isdir(full_baseline_dir),
                   "Missing baseline comparison directory {}".format(full_baseline_dir))

        # the following is to assure that the existing generate directory is not overwritten
        if self._baseline_gen_name:
            full_baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name)
            existing_baselines = []
            for test_name in test_names:
                test_baseline = os.path.join(full_baseline_dir, test_name)
                if os.path.isdir(test_baseline):
                    existing_baselines.append(test_baseline)
            expect(allow_baseline_overwrite or len(existing_baselines) == 0,
                   "Baseline directories already exist {}\n"
                   "Use -o to avoid this error".format(existing_baselines))

        if self._cime_model == "e3sm":
            _order_tests_by_runtime(test_names, self._baseline_root)

        # This is the only data that multiple threads will simultaneously access
        # Each test has its own value and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without mutex.
        # name -> (phase, status)
        self._tests = OrderedDict()
        for test_name in test_names:
            self._tests[test_name] = (TEST_START, TEST_PASS_STATUS)

        # Oversubscribe by 1/4
        if proc_pool is None:
            pes = int(self._machobj.get_value("MAX_TASKS_PER_NODE"))
            self._proc_pool = int(pes * 1.25)
        else:
            self._proc_pool = int(proc_pool)

        self._procs_avail = self._proc_pool

        # Setup phases
        self._phases = list(PHASES)
        if self._no_setup:
            self._phases.remove(SETUP_PHASE)
        if self._no_build:
            self._phases.remove(SHAREDLIB_BUILD_PHASE)
            self._phases.remove(MODEL_BUILD_PHASE)
        if self._no_run:
            self._phases.remove(RUN_PHASE)

        if use_existing:
            for test in self._tests:
                with TestStatus(self._get_test_dir(test)) as ts:
                    for phase, status in ts:
                        if phase in CORE_PHASES:
                            if status in [TEST_PEND_STATUS, TEST_FAIL_STATUS]:
                                if status == TEST_FAIL_STATUS:
                                    # Important for potential subsequent waits
                                    ts.set_status(phase, TEST_PEND_STATUS)
                                # We need to pick up here
                                break
                            else:
                                if phase != SUBMIT_PHASE:
                                    # Somewhat subtle. Create_test considers submit/run to be the run phase,
                                    # so don't try to update test status for a passed submit phase
                                    self._update_test_status(test, phase, TEST_PEND_STATUS)
                                    self._update_test_status(test, phase, status)
                                    if phase == RUN_PHASE:
                                        logger.info("Test {} passed and will not be re-run".format(test))

                logger.info("Using existing test directory {}".format(self._get_test_dir(test)))
        else:
            # None of the test directories should already exist.
            for test in self._tests:
                expect(not os.path.exists(self._get_test_dir(test)),
                       "Cannot create new case in directory '{}', it already exists."
                       " Pick a different test-id".format(self._get_test_dir(test)))
                logger.info("Creating test directory {}".format(self._get_test_dir(test)))

        # By the end of this constructor, this program should never hard abort,
        # instead, errors will be placed in the TestStatus files for the various
        # test cases

    ###########################################################################
    def get_testnames(self):
    ###########################################################################
        return list(self._tests.keys())

    ###########################################################################
    def _log_output(self, test, output):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        if not os.path.isdir(test_dir):
            # Note: making this directory here can cause create_newcase to fail
            # later, if _log_output runs before the create_newcase phase.
os.makedirs(test_dir) append_testlog(output, caseroot=test_dir) ########################################################################### def _get_case_id(self, test): ########################################################################### baseline_action_code = "" if self._baseline_gen_name: baseline_action_code += "G" if self._baseline_cmp_name: baseline_action_code += "C" if len(baseline_action_code) > 0: return "{}.{}.{}".format(test, baseline_action_code, self._test_id) else: return "{}.{}".format(test, self._test_id) ########################################################################### def _get_test_dir(self, test): ########################################################################### return os.path.join(self._test_root, self._get_case_id(test)) ########################################################################### def _get_test_data(self, test): ########################################################################### # Must be atomic return self._tests[test] ########################################################################### def _is_broken(self, test): ########################################################################### status = self._get_test_status(test) return status != TEST_PASS_STATUS and status != TEST_PEND_STATUS ########################################################################### def _work_remains(self, test): ########################################################################### test_phase, test_status = self._get_test_data(test) return (test_status == TEST_PASS_STATUS or test_status == TEST_PEND_STATUS) and\ test_phase != self._phases[-1] ########################################################################### def _get_test_status(self, test, phase=None): ########################################################################### curr_phase, curr_status = self._get_test_data(test) if phase is None or phase == curr_phase: return curr_status else: expect(phase is None or self._phases.index(phase) < self._phases.index(curr_phase), "Tried to see the future") # Assume all older phases PASSed return TEST_PASS_STATUS ########################################################################### def _get_test_phase(self, test): ########################################################################### return self._get_test_data(test)[0] ########################################################################### def _update_test_status(self, test, phase, status): ########################################################################### phase_idx = self._phases.index(phase) old_phase, old_status = self._get_test_data(test) if old_phase == phase: expect(old_status == TEST_PEND_STATUS, "Only valid to transition from PEND to something else, found '{}' for phase '{}'".format(old_status, phase)) expect(status != TEST_PEND_STATUS, "Cannot transition from PEND -> PEND") else: expect(old_status == TEST_PASS_STATUS, "Why did we move on to next phase when prior phase did not pass?") expect(status == TEST_PEND_STATUS, "New phase should be set to pending status") expect(self._phases.index(old_phase) == phase_idx - 1, "Skipped phase? 
{} {}".format(old_phase, phase_idx)) # Must be atomic self._tests[test] = (phase, status) ########################################################################### def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None): ########################################################################### while True: rc, output, errput = run_cmd(cmd, from_dir=from_dir) if rc != 0: self._log_output(test, "{} FAILED for test '{}'.\nCommand: {}\nOutput: {}\n". format(phase, test, cmd, output.encode('utf-8') + b"\n" + errput.encode('utf-8'))) # Temporary hack to get around odd file descriptor use by # buildnml scripts. if "bad interpreter" in output: time.sleep(1) continue else: return False, errput else: # We don't want "RUN PASSED" in the TestStatus.log if the only thing that # succeeded was the submission. phase = "SUBMIT" if phase == RUN_PHASE else phase self._log_output(test, "{} PASSED for test '{}'.\nCommand: {}\nOutput: {}\n". format(phase, test, cmd, output.encode('utf-8') + b"\n" + errput.encode('utf-8'))) return True, errput ########################################################################### def _create_newcase_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) _, case_opts, grid, compset,\ machine, compiler, test_mods = CIME.utils.parse_test_name(test) create_newcase_cmd = "{} --case {} --res {} --compset {}"\ " --test".format(os.path.join(self._cime_root, "scripts", "create_newcase"), test_dir, grid, compset) if machine is not None: create_newcase_cmd += " --machine {}".format(machine) if compiler is not None: create_newcase_cmd += " --compiler {}".format(compiler) if self._project is not None: create_newcase_cmd += " --project {} ".format(self._project) if self._output_root is not None: create_newcase_cmd += " --output-root {} ".format(self._output_root) if self._input_dir is not None: create_newcase_cmd += " --input-dir {} ".format(self._input_dir) if self._pesfile is not None: create_newcase_cmd += " --pesfile {} ".format(self._pesfile) if test_mods is not None: files = Files() (component, modspath) = test_mods.split('/',1) testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component}) test_mod_file = os.path.join(testmods_dir, component, modspath) if not os.path.exists(test_mod_file): error = "Missing testmod file '{}'".format(test_mod_file) self._log_output(test, error) return False, error create_newcase_cmd += " --user-mods-dir {}".format(test_mod_file) mpilib = None ninst = 1 ncpl = 1 if case_opts is not None: for case_opt in case_opts: # pylint: disable=not-an-iterable if case_opt.startswith('M'): mpilib = case_opt[1:] create_newcase_cmd += " --mpilib {}".format(mpilib) logger.debug (" MPILIB set to {}".format(mpilib)) elif case_opt.startswith('N'): expect(ncpl == 1,"Cannot combine _C and _N options") ninst = case_opt[1:] create_newcase_cmd += " --ninst {}".format(ninst) logger.debug (" NINST set to {}".format(ninst)) elif case_opt.startswith('C'): expect(ninst == 1,"Cannot combine _C and _N options") ncpl = case_opt[1:] create_newcase_cmd += " --ninst {} --multi-driver" .format(ncpl) logger.debug (" NCPL set to {}" .format(ncpl)) elif case_opt.startswith('P'): pesize = case_opt[1:] create_newcase_cmd += " --pecount {}".format(pesize) elif case_opt.startswith('V'): driver = case_opt[1:] create_newcase_cmd += " --driver {}".format(driver) # create_test mpilib option overrides default but not explicitly set case_opt mpilib if mpilib is None and self._mpilib is not None: 
create_newcase_cmd += " --mpilib {}".format(self._mpilib) logger.debug (" MPILIB set to {}".format(self._mpilib)) if self._queue is not None: create_newcase_cmd += " --queue={}".format(self._queue) if self._walltime is not None: create_newcase_cmd += " --walltime {}".format(self._walltime) else: # model specific ways of setting time if self._cime_model == "e3sm": recommended_time = _get_time_est(test, self._baseline_root) if recommended_time is not None: create_newcase_cmd += " --walltime {}".format(recommended_time) else: if test in self._test_data and "options" in self._test_data[test] and \ "wallclock" in self._test_data[test]['options']: create_newcase_cmd += " --walltime {}".format(self._test_data[test]['options']['wallclock']) logger.debug("Calling create_newcase: " + create_newcase_cmd) return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE) ########################################################################### def _xml_phase(self, test): ########################################################################### test_case = CIME.utils.parse_test_name(test)[0] # Create, fill and write an envtest object test_dir = self._get_test_dir(test) envtest = EnvTest(test_dir) # Determine list of component classes that this coupler/driver knows how # to deal with. This list follows the same order as compset longnames follow. files = Files() drv_config_file = files.get_value("CONFIG_CPL_FILE") drv_comp = Component(drv_config_file, "CPL") envtest.add_elements_by_group(files, {}, "env_test.xml") envtest.add_elements_by_group(drv_comp, {}, "env_test.xml") envtest.set_value("TESTCASE", test_case) envtest.set_value("TEST_TESTID", self._test_id) envtest.set_value("CASEBASEID", test) if test in self._test_data and "options" in self._test_data[test] and \ "memleak_tolerance" in self._test_data[test]['options']: envtest.set_value("TEST_MEMLEAK_TOLERANCE", self._test_data[test]['options']['memleak_tolerance']) test_argv = "-testname {} -testroot {}".format(test, self._test_root) if self._baseline_gen_name: test_argv += " -generate {}".format(self._baseline_gen_name) basegen_case_fullpath = os.path.join(self._baseline_root,self._baseline_gen_name, test) logger.debug("basegen_case is {}".format(basegen_case_fullpath)) envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name) envtest.set_value("BASEGEN_CASE", os.path.join(self._baseline_gen_name, test)) if self._baseline_cmp_name: test_argv += " -compare {}".format(self._baseline_cmp_name) envtest.set_value("BASELINE_NAME_CMP", self._baseline_cmp_name) envtest.set_value("BASECMP_CASE", os.path.join(self._baseline_cmp_name, test)) envtest.set_value("TEST_ARGV", test_argv) envtest.set_value("CLEANUP", self._clean) envtest.set_value("BASELINE_ROOT", self._baseline_root) envtest.set_value("GENERATE_BASELINE", self._baseline_gen_name is not None) envtest.set_value("COMPARE_BASELINE", self._baseline_cmp_name is not None) envtest.set_value("CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC", resolved=False)) tput_tolerance = self._machobj.get_value("TEST_TPUT_TOLERANCE", resolved=False) envtest.set_value("TEST_TPUT_TOLERANCE", 0.25 if tput_tolerance is None else tput_tolerance) # Add the test instructions from config_test to env_test in the case config_test = Tests() testnode = config_test.get_test_node(test_case) envtest.add_test(testnode) # Determine the test_case from the test name test_case, case_opts = CIME.utils.parse_test_name(test)[:2] # Determine case_opts from the test_case if case_opts is not None: 
logger.debug("case_opts are {} ".format(case_opts)) for opt in case_opts: # pylint: disable=not-an-iterable logger.debug("case_opt is {}".format(opt)) if opt == 'D': envtest.set_test_parameter("DEBUG", "TRUE") logger.debug (" DEBUG set to TRUE") elif opt == 'E': envtest.set_test_parameter("USE_ESMF_LIB", "TRUE") logger.debug (" USE_ESMF_LIB set to TRUE") elif opt == 'CG': envtest.set_test_parameter("CALENDAR", "GREGORIAN") logger.debug (" CALENDAR set to {}".format(opt)) elif opt.startswith('L'): match = re.match('L([A-Za-z])([0-9]*)', opt) stop_option = {"y":"nyears", "m":"nmonths", "d":"ndays", "h":"nhours", "s":"nseconds", "n":"nsteps"} opt = match.group(1) envtest.set_test_parameter("STOP_OPTION",stop_option[opt]) opti = match.group(2) envtest.set_test_parameter("STOP_N", opti) logger.debug (" STOP_OPTION set to {}".format(stop_option[opt])) logger.debug (" STOP_N set to {}".format(opti)) elif opt.startswith('R'): # R option is for testing in PTS_MODE or Single Column Model # (SCM) mode envtest.set_test_parameter("PTS_MODE", "TRUE") # For PTS_MODE, compile with mpi-serial envtest.set_test_parameter("MPILIB", "mpi-serial") elif (opt.startswith('I') or # Marker to distinguish tests with same name - ignored opt.startswith('M') or # handled in create_newcase opt.startswith('P') or # handled in create_newcase opt.startswith('N') or # handled in create_newcase opt.startswith('C') or # handled in create_newcase opt.startswith('V')): # handled in create_newcase pass elif opt.startswith('IOP'): logger.warning("IOP test option not yet implemented") else: expect(False, "Could not parse option '{}' ".format(opt)) envtest.write() lock_file("env_run.xml", caseroot=test_dir, newname="env_run.orig.xml") with Case(test_dir, read_only=False) as case: if self._output_root is None: self._output_root = case.get_value("CIME_OUTPUT_ROOT") # if we are running a single test we don't need sharedlibroot if len(self._tests) > 1 and self._cime_model != "e3sm": case.set_value("SHAREDLIBROOT", os.path.join(self._output_root, "sharedlibroot.{}".format(self._test_id))) envtest.set_initial_values(case) case.set_value("TEST", True) case.set_value("SAVE_TIMING", self._save_timing) # Scale back build parallelism on systems with few cores if self._model_build_cost > self._proc_pool: case.set_value("GMAKE_J", self._proc_pool) self._model_build_cost = self._proc_pool
def load_balancing_submit(compset, res, pesfile, mpilib, compiler, project, machine,
                          extra_options_file, test_id, force_purge, test_root):
################################################################################
    # Read in list of pes from given file
    expect(os.access(pesfile, os.R_OK), 'ERROR: File {} not found'.format(pesfile))
    logger.info('Reading XML file %s. Searching for pesize entries:', pesfile)
    try:
        pesobj = Pes(pesfile)
    except ParseError:
        expect(False, 'ERROR: File {} not parseable'.format(pesfile))

    pesize_list = []
    grid_nodes = pesobj.get_children("grid")
    for gnode in grid_nodes:
        mach_nodes = pesobj.get_children("mach", root=gnode)
        for mnode in mach_nodes:
            pes_nodes = pesobj.get_children("pes", root=mnode)
            for pnode in pes_nodes:
                pesize = pesobj.get(pnode, 'pesize')
                if not pesize:
                    logger.critical('No pesize for pes node in file %s', pesfile)
                if pesize in pesize_list:
                    logger.critical('pesize %s duplicated in file %s', pesize, pesfile)
                pesize_list.append(pesize)

    expect(pesize_list, 'ERROR: No grid entries found in pes file {}'.format(pesfile))

    machobj = Machines(machine=machine)
    if test_root is None:
        test_root = machobj.get_value("CIME_OUTPUT_ROOT")
    if machine is None:
        machine = machobj.get_machine_name()
    logger.info("machine is {}".format(machine))
    if compiler is None:
        compiler = machobj.get_default_compiler()
    logger.info("compiler is {}".format(compiler))
    if mpilib is None:
        mpilib = machobj.get_default_MPIlib({"compiler": compiler})

    test_names = []
    for i in range(len(pesize_list)):
        test_names.append(get_full_test_name("PFS_I{}".format(i), grid=res, compset=compset,
                                             machine=machine, compiler=compiler))
        casedir = os.path.join(test_root, test_names[-1] + "." + test_id)
        logger.info("casedir is {}".format(casedir))
        if os.path.isdir(casedir):
            if force_purge:
                logger.info('Removing directory %s', casedir)
                shutil.rmtree(casedir)
            else:
                expect(False,
                       "casedir {} already exists, use the --force-purge option, --test-root or"
                       " --test-id options".format(casedir))

    tests = TestScheduler(test_names, no_setup=True,
                          compiler=compiler, machine_name=machine, mpilib=mpilib,
                          test_root=test_root, test_id=test_id, project=project)
    success = tests.run_tests(wait=True)
    expect(success, "Error in creating cases")
    testnames = []
    for test in tests.get_testnames():
        testname = os.path.join(test_root, test + "." + test_id)
        testnames.append(testname)
        logger.info("test is {}".format(testname))
        with Case(testname) as case:
            pes_ntasks, pes_nthrds, pes_rootpe, _, _, _ = \
                pesobj.find_pes_layout('any', 'any', 'any', pesize_opts=pesize_list.pop(0))
            for key in pes_ntasks:
                case.set_value(key, pes_ntasks[key])
            for key in pes_nthrds:
                case.set_value(key, pes_nthrds[key])
            for key in pes_rootpe:
                case.set_value(key, pes_rootpe[key])

            if extra_options_file is not None:
                try:
                    with open(extra_options_file, 'r') as extras:
                        for line in extras.readlines():
                            split = line.split('=')
                            if len(split) == 2:
                                logger.info('setting %s=%s', split[0], split[1])
                                case.set_value(split[0], split[1])
                            else:
                                logger.debug('ignoring line in {}: {}'.format(
                                    extra_options_file, line))
                except IOError:
                    expect(False, "ERROR: Could not read file {}".format(extra_options_file))

    tests = TestScheduler(test_names, use_existing=True, test_root=test_root, test_id=test_id)
    success = tests.run_tests(wait=False)
    expect(success, "Error in running cases")

    # need to fix
    logger.info('Timing jobs submitted. After jobs completed, run to optimize '
                'pe layout:\n load_balancing_solve --test-id {} --test-root {}'.
                format(test_id, test_root))
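# ---------------------------------------------------------------------------
# Sketch (not CIME code) of the extra-options file format consumed by
# load_balancing_submit() above: one "NAME=VALUE" assignment per line, with
# anything that does not split into exactly two fields silently ignored.
def _parse_extra_options(lines):
    pairs = []
    for line in lines:
        split = line.strip().split('=')
        if len(split) == 2:
            pairs.append((split[0], split[1]))
    return pairs

assert _parse_extra_options(["STOP_N=5", "# a comment", "STOP_OPTION=ndays"]) == \
    [("STOP_N", "5"), ("STOP_OPTION", "ndays")]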
def configure(self, compset_name, grid_name, machine_name=None,
              project=None, pecount=None, compiler=None, mpilib=None,
              user_compset=False, pesfile=None,
              user_grid=False, gridfile=None, ninst=1, test=False,
              walltime=None, queue=None, output_root=None):
    #--------------------------------------------
    # compset, pesfile, and compset components
    #--------------------------------------------
    self._set_compset_and_pesfile(compset_name, user_compset=user_compset, pesfile=pesfile)

    self._components = self.get_compset_components()
    #FIXME - if --user-compset is True then need to determine that
    #all of the compset settings are valid

    #--------------------------------------------
    # grid
    #--------------------------------------------
    if user_grid is True and gridfile is not None:
        self.set_value("GRIDS_SPEC_FILE", gridfile)
    grids = Grids(gridfile)

    gridinfo = grids.get_grid_info(name=grid_name, compset=self._compsetname)

    self._gridname = gridinfo["GRID"]
    for key, value in gridinfo.items():
        logger.debug("Set grid %s %s" % (key, value))
        self.set_lookup_value(key, value)

    #--------------------------------------------
    # component config data
    #--------------------------------------------
    self._get_component_config_data()

    self.get_compset_var_settings()

    #--------------------------------------------
    # machine
    #--------------------------------------------
    # set machine values in env_xxx files
    machobj = Machines(machine=machine_name)
    machine_name = machobj.get_machine_name()
    self.set_value("MACH", machine_name)
    nodenames = machobj.get_node_names()
    nodenames = [x for x in nodenames if
                 '_system' not in x and '_variables' not in x and 'mpirun' not in x and
                 'COMPILER' not in x and 'MPILIB' not in x]

    for nodename in nodenames:
        value = machobj.get_value(nodename, resolved=False)
        type_str = self.get_type_info(nodename)
        if type_str is not None:
            logger.debug("machine nodename %s value %s" % (nodename, value))
            self.set_value(nodename, convert_to_type(value, type_str, nodename))

    if compiler is None:
        compiler = machobj.get_default_compiler()
    else:
        expect(machobj.is_valid_compiler(compiler),
               "compiler %s is not supported on machine %s" % (compiler, machine_name))
    self.set_value("COMPILER", compiler)

    if mpilib is None:
        mpilib = machobj.get_default_MPIlib({"compiler": compiler})
    else:
        expect(machobj.is_valid_MPIlib(mpilib, {"compiler": compiler}),
               "MPIlib %s is not supported on machine %s" % (mpilib, machine_name))
    self.set_value("MPILIB", mpilib)

    machdir = machobj.get_machines_dir()
    self.set_value("MACHDIR", machdir)

    # Create env_mach_specific settings from machine info.
    env_mach_specific_obj = self.get_env("mach_specific")
    env_mach_specific_obj.populate(machobj)
    self.schedule_rewrite(env_mach_specific_obj)

    #--------------------------------------------
    # pe layout
    #--------------------------------------------
    match1 = re.match('([0-9]+)x([0-9]+)', "" if pecount is None else pecount)
    match2 = re.match('([0-9]+)', "" if pecount is None else pecount)
    pes_ntasks = {}
    pes_nthrds = {}
    pes_rootpe = {}
    if match1:
        opti_tasks = match1.group(1)
        opti_thrds = match1.group(2)
    elif match2:
        opti_tasks = match2.group(1)
        opti_thrds = 1

    other = {}
    if match1 or match2:
        for component_class in self._component_classes:
            if component_class == "DRV":
                component_class = "CPL"
            string = "NTASKS_" + component_class
            pes_ntasks[string] = opti_tasks
            string = "NTHRDS_" + component_class
            pes_nthrds[string] = opti_thrds
            string = "ROOTPE_" + component_class
            pes_rootpe[string] = 0
    else:
        pesobj = Pes(self._pesfile)

        pes_ntasks, pes_nthrds, pes_rootpe, other = pesobj.find_pes_layout(
            self._gridname, self._compsetname, machine_name, pesize_opts=pecount)

    mach_pes_obj = self.get_env("mach_pes")
    totaltasks = {}
    # Since other items may include PES_PER_NODE we need to do this first
    # we can get rid of this code when all of the perl is removed
    for key, value in other.items():
        self.set_value(key, value)
    pes_per_node = self.get_value("PES_PER_NODE")
    for key, value in pes_ntasks.items():
        totaltasks[key[-3:]] = int(value)
        mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)
    for key, value in pes_rootpe.items():
        totaltasks[key[-3:]] += int(value)
        mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)
    for key, value in pes_nthrds.items():
        totaltasks[key[-3:]] *= int(value)
        mach_pes_obj.set_value(key, int(value), pes_per_node=pes_per_node)

    maxval = 1
    if mpilib != "mpi-serial":
        for key, val in totaltasks.items():
            if val < 0:
                val = -1 * val * pes_per_node
            if val > maxval:
                maxval = val

    # Make sure that every component has been accounted for;
    # set nthrds and ntasks to 1 otherwise. Also set the ninst values here.
    for compclass in self._component_classes:
        if compclass == "DRV":
            continue
        key = "NINST_%s" % compclass
        mach_pes_obj.set_value(key, ninst)
        key = "NTASKS_%s" % compclass
        if key not in pes_ntasks:
            mach_pes_obj.set_value(key, 1)
        key = "NTHRDS_%s" % compclass
        if key not in pes_nthrds:
            mach_pes_obj.set_value(key, 1)

    # FIXME - this is a short term fix for dealing with the restriction that
    # CISM1 cannot run on multiple cores
    if "CISM1" in self._compsetname:
        mach_pes_obj.set_value("NTASKS_GLC", 1)
        mach_pes_obj.set_value("NTHRDS_GLC", 1)

    #--------------------------------------------
    # batch system
    #--------------------------------------------
    batch_system_type = machobj.get_value("BATCH_SYSTEM")
    batch = Batch(batch_system=batch_system_type, machine=machine_name)
    bjobs = batch.get_batch_jobs()
    env_batch = self.get_env("batch")
    env_batch.set_batch_system(batch, batch_system_type=batch_system_type)
    env_batch.create_job_groups(bjobs)
    env_batch.set_job_defaults(bjobs, pesize=maxval, walltime=walltime, force_queue=queue)
    self.schedule_rewrite(env_batch)

    self.set_value("COMPSET", self._compsetname)

    self._set_pio_xml()
    logger.info(" Compset is: %s " % self._compsetname)
    logger.info(" Grid is: %s " % self._gridname)
    logger.info(" Components in compset are: %s " % self._components)

    # Set project id
    if project is None:
        project = get_project(machobj)
    if project is not None:
        self.set_value("PROJECT", project)
    elif machobj.get_value("PROJECT_REQUIRED"):
        expect(project is not None, "PROJECT_REQUIRED is true but no project found")

    # Resolve the CIME_OUTPUT_ROOT variable, other than this
    # we don't want to resolve variables until we need them
    if output_root is None:
        output_root = self.get_value("CIME_OUTPUT_ROOT")
    self.set_value("CIME_OUTPUT_ROOT", output_root)

    # Overwriting an existing exeroot or rundir can cause problems
    exeroot = self.get_value("EXEROOT")
    rundir = self.get_value("RUNDIR")
    for wdir in (exeroot, rundir):
        logger.debug("wdir is %s" % wdir)
        if os.path.exists(wdir):
            expect(not test, "Directory %s already exists, aborting test" % wdir)
            response = raw_input(
                "\nDirectory %s already exists, (r)eplace, (a)bort, or (u)se existing?" % wdir)
            if response.startswith("r"):
                shutil.rmtree(wdir)
            else:
                expect(response.startswith("u"), "Aborting by user request")

    # miscellaneous settings
    if self.get_value("RUN_TYPE") == 'hybrid':
        self.set_value("GET_REFCASE", True)

    # Turn on short term archiving as cesm default setting
    model = get_model()
    self.set_model_version(model)
    if model == "cesm" and not test:
        self.set_value("DOUT_S", True)
        self.set_value("TIMER_LEVEL", 4)
    if test:
        self.set_value("TEST", True)

    self.initialize_derived_attributes()
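# ---------------------------------------------------------------------------
# Standalone sketch (not CIME code) of the pecount shorthand parsed with
# match1/match2 in the newer configure() above: "TASKSxTHREADS" or just
# "TASKS", e.g. "16x2" means 16 tasks with 2 threads each and "16" implies a
# single thread; anything else falls back to the pes XML file.  The anchored
# regexes here are an assumption added for the sketch.
import re

def _parse_pecount(pecount):
    match = re.match(r'([0-9]+)x([0-9]+)$', "" if pecount is None else pecount)
    if match:
        return int(match.group(1)), int(match.group(2))
    match = re.match(r'([0-9]+)$', "" if pecount is None else pecount)
    if match:
        return int(match.group(1)), 1
    return None  # defer to the pes XML layout

assert _parse_pecount("16x2") == (16, 2)
assert _parse_pecount("16") == (16, 1)
assert _parse_pecount("M") is None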
def _compare_baseline(self): with self._test_status as ts: ts.set_status(CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_FAIL_STATUS) run_dir = self._case.get_value("RUNDIR") case_name = self._case.get_value("CASE") base_dir = os.path.join( self._case.get_value("BASELINE_ROOT"), self._case.get_value("BASECMP_CASE"), ) test_name = "{}".format(case_name.split(".")[-1]) evv_config = { test_name: { "module": os.path.join(evv_lib_dir, "extensions", "tsc.py"), "test-case": case_name, "test-dir": run_dir, "ref-case": "Baseline", "ref-dir": base_dir, "time-slice": [OUT_FREQ, SIM_LENGTH], "inspect-times": INSPECT_AT, "variables": VAR_LIST, "p-threshold": P_THRESHOLD, "component": self.atmmod, } } json_file = os.path.join(run_dir, ".".join([case_name, "json"])) with open(json_file, "w") as config_file: json.dump(evv_config, config_file, indent=4) evv_out_dir = os.path.join(run_dir, ".".join([case_name, "evv"])) evv(["-e", json_file, "-o", evv_out_dir]) with open(os.path.join(evv_out_dir, "index.json"), "r") as evv_f: evv_status = json.load(evv_f) comments = "" for evv_ele in evv_status["Page"]["elements"]: if "Table" in evv_ele: comments = "; ".join( "{}: {}".format(key, val[0]) for key, val in evv_ele["Table"]["data"].items()) if evv_ele["Table"]["data"]["Test status"][0].lower( ) == "pass": self._test_status.set_status( CIME.test_status.BASELINE_PHASE, CIME.test_status.TEST_PASS_STATUS, ) break status = self._test_status.get_status( CIME.test_status.BASELINE_PHASE) mach_name = self._case.get_value("MACH") mach_obj = Machines(machine=mach_name) htmlroot = CIME.utils.get_htmlroot(mach_obj) urlroot = CIME.utils.get_urlroot(mach_obj) if htmlroot is not None: with CIME.utils.SharedArea(): dir_util.copy_tree( evv_out_dir, os.path.join(htmlroot, "evv", case_name), preserve_mode=False, ) if urlroot is None: urlroot = "[{}_URL]".format(mach_name.capitalize()) viewing = "{}/evv/{}/index.html".format(urlroot, case_name) else: viewing = ( "{}\n" " EVV viewing instructions can be found at: " " https://github.com/E3SM-Project/E3SM/blob/master/cime/scripts/" "climate_reproducibility/README.md#test-passfail-and-extended-output" "".format(evv_out_dir)) comments = ("{} {} for test '{}'.\n" " {}\n" " EVV results can be viewed at:\n" " {}".format( CIME.test_status.BASELINE_PHASE, status, test_name, comments, viewing, )) CIME.utils.append_testlog(comments, self._orig_caseroot)
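# ---------------------------------------------------------------------------
# Minimal sketch (not CIME or EVV code) of the index.json traversal done in
# _compare_baseline() above: the baseline result is taken from the first
# "Table" element whose "Test status" column reads "pass".  The JSON layout
# used here is inferred purely from the accesses made above.
def _evv_passed(evv_status):
    for evv_ele in evv_status["Page"]["elements"]:
        if "Table" in evv_ele:
            return evv_ele["Table"]["data"]["Test status"][0].lower() == "pass"
    return False

assert _evv_passed(
    {"Page": {"elements": [{"Table": {"data": {"Test status": ["Pass"]}}}]}})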
class SystemTest(object): ############################################################################### ########################################################################### def __init__(self, test_names, no_run=False, no_build=False, no_batch=None, test_root=None, test_id=None, machine_name=None,compiler=None, baseline_root=None, baseline_name=None, clean=False,compare=False, generate=False, namelists_only=False, project=None, parallel_jobs=None, xml_machine=None, xml_compiler=None, xml_category=None,xml_testlist=None): ########################################################################### self._cime_root = CIME.utils.get_cime_root() self._cime_model = CIME.utils.get_model() # needed for perl interface os.environ["CIMEROOT"] = self._cime_root self._machobj = Machines(machine=machine_name) machine_name = self._machobj.get_machine_name() self._no_build = no_build if not namelists_only else True self._no_run = no_run if not self._no_build else True # Figure out what project to use if (project is None): self._project = CIME.utils.get_project() if (self._project is None): self._project = self._machobj.get_value("PROJECT") else: self._project = project # We will not use batch system if user asked for no_batch or if current # machine is not a batch machine self._no_batch = no_batch or not self._machobj.has_batch_system() self._test_root = test_root if test_root is not None else self._machobj.get_value("CESMSCRATCHROOT") if (self._project is not None): self._test_root = self._test_root.replace("$PROJECT", self._project) self._test_root = os.path.abspath(self._test_root) self._test_id = test_id if test_id is not None else CIME.utils.get_utc_timestamp() self._compiler = compiler if compiler is not None else self._machobj.get_default_compiler() expect(self._machobj.is_valid_compiler(self._compiler), "Compiler %s not valid for machine %s" % (self._compiler,machine_name)) self._clean = clean self._namelists_only = namelists_only # Extra data associated with tests, do not modify after construction # test_name -> test_data # test_data: name -> value self._test_data = {} # If xml options are provided get tests from xml file, otherwise use acme dictionary if(not test_names and (xml_machine is not None or xml_category is not None or xml_compiler is not None or xml_testlist is not None)): test_data = CIME.test_utils.get_tests_from_xml(xml_machine, xml_category, xml_compiler, xml_testlist, machine_name, compiler) test_names = [item["name"] for item in test_data] for test_datum in test_data: self._test_data[test_datum["name"]] = test_datum else: expect(len(test_names) > 0, "No tests to run") test_names = update_acme_tests.get_full_test_names(test_names, machine_name, self._compiler) if (parallel_jobs is None): self._parallel_jobs = min(len(test_names), int(self._machobj.get_value("MAX_TASKS_PER_NODE"))) else: self._parallel_jobs = parallel_jobs self._baseline_cmp_name = None self._baseline_gen_name = None self._compare = False self._generate = False if (compare or generate): # Figure out what baseline name to use if (baseline_name is None): if(compare is not None and isinstance(compare,str)): self._baseline_cmp_name = compare self._compare = True if(generate is not None and isinstance(generate,str)): self._baseline_gen_name = generate self._generate = True branch_name = CIME.utils.get_current_branch(repo=self._cime_root) expect(branch_name is not None, "Could not determine baseline name from branch, please use -b option") if(self._compare and self._baseline_cmp_name is None): 
self._baseline_cmp_name = os.path.join(self._compiler, branch_name)
                if(self._generate and self._baseline_gen_name is None):
                    self._baseline_gen_name = os.path.join(self._compiler, branch_name)
            else:
                if(compare):
                    self._compare = True
                    self._baseline_cmp_name = baseline_name
                    if (not self._baseline_cmp_name.startswith("%s/" % self._compiler)):
                        self._baseline_cmp_name = os.path.join(self._compiler, self._baseline_cmp_name)
                if(generate):
                    self._generate = True
                    self._baseline_gen_name = baseline_name
                    if (not self._baseline_gen_name.startswith("%s/" % self._compiler)):
                        self._baseline_gen_name = os.path.join(self._compiler, self._baseline_gen_name)

            # Compute baseline_root
            self._baseline_root = baseline_root if baseline_root is not None else self._machobj.get_value("CCSM_BASELINE")
            if (self._project is not None):
                self._baseline_root = self._baseline_root.replace("$PROJECT", self._project)
            self._baseline_root = os.path.abspath(self._baseline_root)

            if (self._compare):
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
                expect(os.path.isdir(full_baseline_dir),
                       "Missing baseline comparison directory %s" % full_baseline_dir)
        else:
            self._baseline_root = None

        # This is the only data that multiple threads will simultaneously access
        # Each test has its own value and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without mutex.
        # Since the name-list phase can fail without aborting later phases, we
        # need some extra state to remember tests that had namelist problems.
        # name -> (phase, status, has_namelist_problem)
        self._tests = {}
        for test_name in test_names:
            self._tests[test_name] = (INITIAL_PHASE, TEST_PASS_STATUS, False)

        # Oversubscribe by 1/4
        pes = int(self._machobj.get_value("PES_PER_NODE"))
        self._proc_pool = int(pes * 1.25)

        # Setup phases
        self._phases = list(PHASES)
        if (no_build):
            self._phases.remove(BUILD_PHASE)
        if (no_run):
            self._phases.remove(RUN_PHASE)
        if (not self._compare and not self._generate):
            self._phases.remove(NAMELIST_PHASE)

        # None of the test directories should already exist.
        for test in self._tests:
            expect(not os.path.exists(self._get_test_dir(test)),
                   "Cannot create new case in directory '%s', it already exists. Pick a different test-id" % self._get_test_dir(test))

        # By the end of this constructor, this program should never hard abort,
        # instead, errors will be placed in the TestStatus files for the various
        # test cases

    ###########################################################################
    def _log_output(self, test, output):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        if (not os.path.isdir(test_dir)):
            # Note: making this directory here can cause create_newcase to fail
            # later, if _log_output runs before the create_newcase phase.
os.makedirs(test_dir) with open(os.path.join(test_dir, "TestStatus.log"), "a") as fd: fd.write(output) ########################################################################### def _get_case_id(self, test): ########################################################################### baseline_action_code = ".C" if self._compare else (".G" if self._generate else "") return "%s%s.%s" % (test, baseline_action_code, self._test_id) ########################################################################### def _get_test_dir(self, test): ########################################################################### return os.path.join(self._test_root, self._get_case_id(test)) ########################################################################### def _get_test_data(self, test): ########################################################################### # Must be atomic return self._tests[test] ########################################################################### def _is_broken(self, test): ########################################################################### status = self._get_test_status(test) return status not in CONTINUE and status != TEST_PENDING_STATUS ########################################################################### def _work_remains(self, test): ########################################################################### test_phase, test_status, _ = self._get_test_data(test) return (test_status in CONTINUE or test_status == TEST_PENDING_STATUS) and test_phase != self._phases[-1] ########################################################################### def _get_test_status(self, test, phase=None): ########################################################################### curr_phase, curr_status, nl_fail = self._get_test_data(test) if (phase == NAMELIST_PHASE and nl_fail): return NAMELIST_FAIL_STATUS elif (phase is None or phase == curr_phase): return curr_status else: expect(phase is None or self._phases.index(phase) < self._phases.index(curr_phase), "Tried to see the future") # Assume all older phases PASSed return TEST_PASS_STATUS ########################################################################### def _get_test_phase(self, test): ########################################################################### return self._get_test_data(test)[0] ########################################################################### def _update_test_status(self, test, phase, status): ########################################################################### phase_idx = self._phases.index(phase) old_phase, old_status, old_nl_fail = self._get_test_data(test) if (old_phase == phase): expect(old_status == TEST_PENDING_STATUS, "Only valid to transition from PENDING to something else, found '%s'" % old_status) expect(status != TEST_PENDING_STATUS, "Cannot transition from PEND -> PEND") else: expect(old_status in CONTINUE, "Why did we move on to next phase when prior phase did not pass?") expect(status == TEST_PENDING_STATUS, "New phase should be set to pending status") expect(self._phases.index(old_phase) == phase_idx - 1, "Skipped phase?") # Must be atomic self._tests[test] = (phase, status, old_nl_fail) ########################################################################### def _test_has_nl_problem(self, test): ########################################################################### curr_phase, curr_status, _ = self._get_test_data(test) expect(curr_phase == NAMELIST_PHASE, "Setting namelist status outside of namelist phase?") # Must be atomic self._tests[test] = 
(curr_phase, curr_status, True)

    ###########################################################################
    def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None):
    ###########################################################################
        while (True):
            rc, output, errput = run_cmd(cmd, ok_to_fail=True, from_dir=from_dir)
            if (rc != 0):
                self._log_output(test,
                                 "%s FAILED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s" %
                                 (phase, test, cmd, output, errput))
                # Temporary hack to get around odd file descriptor use by
                # buildnml scripts.
                if ("bad interpreter" in errput):
                    time.sleep(1)
                    continue
                else:
                    break
            else:
                self._log_output(test,
                                 "%s PASSED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s" %
                                 (phase, test, cmd, output, errput))
                break

        return rc == 0

    ###########################################################################
    def _create_newcase_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)

        test_case, case_opts, grid, compset, machine, compiler, test_mods = CIME.utils.parse_test_name(test)
        if (compiler != self._compiler):
            raise StandardError("Test '%s' has compiler that does not match instance compiler '%s'" % (test, self._compiler))
        if (self._parallel_jobs == 1):
            scratch_dir = self._machobj.get_value("CESMSCRATCHROOT")
            if (self._project is not None):
                scratch_dir = scratch_dir.replace("$PROJECT", self._project)
            sharedlibroot = os.path.join(scratch_dir, "sharedlibroot.%s" % self._test_id)
        else:
            # Parallelizing builds introduces potential sync problems with sharedlibroot
            # Just let every case build its own
            sharedlibroot = os.path.join(test_dir, "sharedlibroot.%s" % self._test_id)

        create_newcase_cmd = "%s -model %s -case %s -res %s -mach %s -compiler %s -compset %s -testname %s -project %s -sharedlibroot %s" % \
                             (os.path.join(self._cime_root, "scripts", "create_newcase"),
                              self._cime_model, test_dir, grid, machine, compiler, compset,
                              test_case, self._project, sharedlibroot)
        if (test_case != 'PFS'):
            create_newcase_cmd += " -nosavetiming "
        if (case_opts is not None):
            create_newcase_cmd += " -confopts _%s" % ("_".join(case_opts))
        if (test_mods is not None):
            files = Files()
            (component, mods) = test_mods.split('/')
            testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component})
            test_mod_file = os.path.join(testmods_dir, component, mods)
            if (not os.path.exists(test_mod_file)):
                self._log_output(test, "Missing testmod file '%s'" % test_mod_file)
                return False
            create_newcase_cmd += " -user_mods_dir %s" % test_mod_file

        logging.info("Calling create_newcase: " + create_newcase_cmd)
        return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE)

    ###########################################################################
    def _xml_phase(self, test):
    ###########################################################################
        test_case = CIME.utils.parse_test_name(test)[0]
        envtest = EnvTest(self._get_test_dir(test))

        files = Files()
        drv_config_file = files.get_value("CONFIG_DRV_FILE")
        logging.info("Found drv_config_file %s" % drv_config_file)

        drv_comp = Component(drv_config_file)
        envtest.add_elements_by_group(drv_comp, {}, "env_test.xml")
        envtest.set_value("TESTCASE", test_case)
        envtest.set_value("TEST_TESTID", self._test_id)
        envtest.set_value("CASEBASEID", test)

        test_argv = "-testname %s -testroot %s" % (test, self._test_root)
        if (self._generate):
            test_argv += " -generate %s" % self._baseline_gen_name
            envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name)
envtest.set_value("BASEGEN_CASE",os.path.join(self._baseline_gen_name,test)) if (self._compare): test_argv += " -compare %s" % self._baseline_cmp_name envtest.set_value("BASELINE_NAME_CMP",self._baseline_cmp_name) envtest.set_value("BASECMP_CASE",os.path.join(self._baseline_cmp_name,test)) envtest.set_value("TEST_ARGV",test_argv) envtest.set_value("CLEANUP", ("TRUE" if self._clean else "FALSE")) if (self._generate or self._compare): envtest.set_value("BASELINE_ROOT", self._baseline_root) envtest.set_value("GENERATE_BASELINE", "TRUE" if self._generate else "FALSE") envtest.set_value("COMPARE_BASELINE", "TRUE" if self._compare else "FALSE") envtest.set_value("CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC",resolved=False)) envtest.write() return True ########################################################################### def _setup_phase(self, test): ########################################################################### test_case = CIME.utils.parse_test_name(test)[0] test_dir = self._get_test_dir(test) test_case_definition_dir = os.path.join(self._cime_root, "scripts", "Testing", "Testcases") test_build = os.path.join(test_dir, "case.test_build" ) if (os.path.exists(os.path.join(test_case_definition_dir, "%s_build.csh" % test_case))): shutil.copy(os.path.join(test_case_definition_dir, "%s_build.csh" % test_case), test_build) else: shutil.copy(os.path.join(test_case_definition_dir, "tests_build.csh"), test_build) return self._shell_cmd_for_phase(test, "./case.setup", SETUP_PHASE, from_dir=test_dir) ########################################################################### def _nlcomp_phase(self, test): ########################################################################### test_dir = self._get_test_dir(test) casedoc_dir = os.path.join(test_dir, "CaseDocs") compare_nl = os.path.join(CIME.utils.get_acme_scripts_root(), "compare_namelists") simple_compare = os.path.join(CIME.utils.get_acme_scripts_root(), "simple_compare") if (self._compare): has_fails = False baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name, test) baseline_casedocs = os.path.join(baseline_dir, "CaseDocs") # Start off by comparing everything in CaseDocs except a few arbitrary files (ugh!) 
# TODO: Namelist files should have consistent suffix
            all_items_to_compare = \
                [item for item in glob.glob("%s/*" % casedoc_dir)
                 if "README" not in os.path.basename(item)
                 and not item.endswith("doc")
                 and not item.endswith("prescribed")
                 and not os.path.basename(item).startswith(".")] + \
                glob.glob("%s/*user_nl*" % test_dir)
            for item in all_items_to_compare:
                baseline_counterpart = os.path.join(baseline_casedocs
                                                    if os.path.dirname(item).endswith("CaseDocs")
                                                    else baseline_dir,
                                                    os.path.basename(item))
                if (not os.path.exists(baseline_counterpart)):
                    self._log_output(test, "Missing baseline namelist '%s'" % baseline_counterpart)
                    has_fails = True
                else:
                    if (compare_namelists.is_namelist_file(item)):
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1" %
                                                (compare_nl, baseline_counterpart, item, test),
                                                ok_to_fail=True)
                    else:
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1" %
                                                (simple_compare, baseline_counterpart, item, test),
                                                ok_to_fail=True)
                    if (rc != 0):
                        has_fails = True
                        self._log_output(test, output)

            if (has_fails):
                self._test_has_nl_problem(test)

        if (self._generate):
            baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name, test)
            baseline_casedocs = os.path.join(baseline_dir, "CaseDocs")
            if (not os.path.isdir(baseline_dir)):
                os.makedirs(baseline_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IXOTH | stat.S_IROTH)
            if (os.path.isdir(baseline_casedocs)):
                shutil.rmtree(baseline_casedocs)
            shutil.copytree(casedoc_dir, baseline_casedocs)
            for item in glob.glob(os.path.join(test_dir, "user_nl*")):
                shutil.copy2(item, baseline_dir)

        # Always mark as passed unless we hit exception
        return True

    ###########################################################################
    def _build_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.test_build", BUILD_PHASE, from_dir=test_dir)

    ###########################################################################
    def _run_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        # wallclock is an optional field in the version 2.0 testlist.xml file
        # setting wallclock time close to the expected test time will help queue throughput
        if (test in self._test_data and "wallclock" in self._test_data[test]):
            run_cmd("./xmlchange JOB_WALLCLOCK_TIME=%s" % self._test_data[test]["wallclock"],
                    from_dir=test_dir)
        return self._shell_cmd_for_phase(test, "./case.submit", RUN_PHASE, from_dir=test_dir)

    ###########################################################################
    def _update_test_status_file(self, test):
    ###########################################################################
        # TODO: The run scripts heavily use the TestStatus file. So we write out
        # the phases we have taken care of and then let the run scripts go from there
        # Eventually, it would be nice to have TestStatus management encapsulated
        # into a single place.
str_to_write = "" made_it_to_phase = self._get_test_phase(test) made_it_to_phase_idx = self._phases.index(made_it_to_phase) for phase in self._phases[0:made_it_to_phase_idx+1]: str_to_write += "%s %s %s\n" % (self._get_test_status(test, phase), test, phase) if (not self._no_run and not self._is_broken(test) and made_it_to_phase == BUILD_PHASE): # Ensure PEND state always gets added to TestStatus file if we are # about to run test str_to_write += "%s %s %s\n" % (TEST_PENDING_STATUS, test, RUN_PHASE) test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME) with open(test_status_file, "w") as fd: fd.write(str_to_write) ########################################################################### def _run_catch_exceptions(self, test, phase, run): ########################################################################### try: return run(test) except Exception as e: exc_tb = sys.exc_info()[2] errput = "Test '%s' failed in phase '%s' with exception '%s'" % (test, phase, str(e)) self._log_output(test, errput) logging.warning("Caught exception: %s" % str(e)) traceback.print_tb(exc_tb) return False ########################################################################### def _get_procs_needed(self, test, phase): ########################################################################### if (phase == RUN_PHASE and self._no_batch): test_dir = self._get_test_dir(test) out = run_cmd("./xmlquery TOTALPES -value", from_dir=test_dir) return int(out) else: return 1 ########################################################################### def _handle_test_status_file(self, test, test_phase, success): ########################################################################### # # This complexity is due to sharing of TestStatus responsibilities # try: if (test_phase != RUN_PHASE and (not success or test_phase == BUILD_PHASE or test_phase == self._phases[-1])): self._update_test_status_file(test) # If we failed VERY early on in the run phase, it's possible that # the CIME scripts never got a chance to set the state. elif (test_phase == RUN_PHASE and not success): test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME) statuses = wait_for_tests.parse_test_status_file(test_status_file)[0] if ( RUN_PHASE not in statuses or statuses[RUN_PHASE] in [TEST_PASS_STATUS, TEST_PENDING_STATUS] ): self._update_test_status_file(test) except Exception as e: # TODO: What to do here? This failure is very severe because the # only way for test results to be communicated is by the TestStatus # file. logging.critical("VERY BAD! 
Could not handle TestStatus file '%s': '%s'" % (os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME), str(e))) thread.interrupt_main() ########################################################################### def _wait_for_something_to_finish(self, threads_in_flight): ########################################################################### expect(len(threads_in_flight) <= self._parallel_jobs, "Oversubscribed?") finished_tests = [] while (not finished_tests): for test, thread_info in threads_in_flight.iteritems(): if (not thread_info[0].is_alive()): finished_tests.append( (test, thread_info[1]) ) if (not finished_tests): time.sleep(0.2) for finished_test, procs_needed in finished_tests: self._proc_pool += procs_needed del threads_in_flight[finished_test] ########################################################################### def _consumer(self, test, test_phase, phase_method): ########################################################################### before_time = time.time() success = self._run_catch_exceptions(test, test_phase, phase_method) elapsed_time = time.time() - before_time status = (TEST_PENDING_STATUS if test_phase == RUN_PHASE and not self._no_batch else TEST_PASS_STATUS) if success else TEST_FAIL_STATUS if (status != TEST_PENDING_STATUS): self._update_test_status(test, test_phase, status) self._handle_test_status_file(test, test_phase, success) status_str = "Finished %s for test %s in %f seconds (%s)\n" % (test_phase, test, elapsed_time, status) if (not success): status_str += " Case dir: %s\n" % self._get_test_dir(test) sys.stdout.write(status_str) ########################################################################### def _producer(self): ########################################################################### threads_in_flight = {} # test-name -> (thread, procs) while (True): work_to_do = False num_threads_launched_this_iteration = 0 for test in self._tests: logging.info("test_name: " + test) # If we have no workers available, immediately wait if (len(threads_in_flight) == self._parallel_jobs): self._wait_for_something_to_finish(threads_in_flight) if (self._work_remains(test)): work_to_do = True if (test not in threads_in_flight): test_phase, test_status, _ = self._get_test_data(test) expect(test_status != TEST_PENDING_STATUS, test) next_phase = self._phases[self._phases.index(test_phase) + 1] procs_needed = self._get_procs_needed(test, next_phase) if (procs_needed <= self._proc_pool): self._proc_pool -= procs_needed # Necessary to print this way when multiple threads printing sys.stdout.write("Starting %s for test %s with %d procs\n" % (next_phase, test, procs_needed)) self._update_test_status(test, next_phase, TEST_PENDING_STATUS) t = threading.Thread(target=self._consumer, args=(test, next_phase, getattr(self, "_%s_phase" % next_phase.lower()) )) threads_in_flight[test] = (t, procs_needed) t.start() num_threads_launched_this_iteration += 1 if (not work_to_do): break if (num_threads_launched_this_iteration == 0): # No free resources, wait for something in flight to finish self._wait_for_something_to_finish(threads_in_flight) for thread_info in threads_in_flight.values(): thread_info[0].join() ########################################################################### def _setup_cs_files(self): ########################################################################### try: python_libs_root = CIME.utils.get_python_libs_root() acme_scripts_root = CIME.utils.get_acme_scripts_root() template_file = os.path.join(python_libs_root, "cs.status.template") 
            template = open(template_file, "r").read()
            template = template.replace("<PATH>", acme_scripts_root).replace("<TESTID>", self._test_id)

            cs_status_file = os.path.join(self._test_root, "cs.status.%s" % self._test_id)
            with open(cs_status_file, "w") as fd:
                fd.write(template)
            os.chmod(cs_status_file, os.stat(cs_status_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            template_file = os.path.join(python_libs_root, "cs.submit.template")
            template = open(template_file, "r").read()
            build_cmd = "./*.test_build" if self._no_build else ":"
            run_cmd = "./*.test" if self._no_batch else "./*.submit"
            template = template.replace("<BUILD_CMD>", build_cmd).replace("<RUN_CMD>", run_cmd).replace("<TESTID>", self._test_id)

            if (self._no_build or self._no_run):
                cs_submit_file = os.path.join(self._test_root, "cs.submit.%s" % self._test_id)
                with open(cs_submit_file, "w") as fd:
                    fd.write(template)
                os.chmod(cs_submit_file, os.stat(cs_submit_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

        except Exception as e:
            logging.warning("FAILED to set up cs files: %s" % str(e))

    ###########################################################################
    def system_test(self):
    ###########################################################################
        """
        Main API for this class. Return True if all tests passed.
        """
        start_time = time.time()

        # Tell user what will be run
        print "RUNNING TESTS:"
        for test in self._tests:
            print " ", test

        # TODO - documentation
        self._producer()

        expect(threading.active_count() == 1, "Leftover threads?")

        # Setup cs files
        self._setup_cs_files()

        # Return True if all tests passed
        print "At system_test close, state is:"
        rv = True
        for test in self._tests:
            phase, status, nl_fail = self._get_test_data(test)
            logging.debug("phase %s status %s" % (phase, status))
            if (status == TEST_PASS_STATUS and phase == RUN_PHASE):
                # Be cautious about telling the user that the test passed. This
                # status should match what they would see on the dashboard. Our
                # self._test_states does not include comparison fail information,
                # so we need to parse test status.
                test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME)
                status = wait_for_tests.interpret_status_file(test_status_file)[1]

            if (status not in [TEST_PASS_STATUS, TEST_PENDING_STATUS]):
                print "%s %s (phase %s)" % (status, test, phase)
                rv = False

            elif (nl_fail):
                print "%s %s (but otherwise OK)" % (NAMELIST_FAIL_STATUS, test)
                rv = False

            else:
                print status, test, phase

            print "    Case dir: %s" % self._get_test_dir(test)

        print "system_test took", time.time() - start_time, "seconds"

        return rv
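# --- Illustrative sketch (not part of the original source) ---
# _update_test_status_file above writes one "STATUS TESTNAME PHASE" line per
# completed phase. The real reader is wait_for_tests.parse_test_status_file;
# the stand-alone function below is only a sketch of that file layout, with a
# hypothetical name, to make the format concrete.
def parse_test_status_text(text):
    """Return {phase: status} parsed from TestStatus-style lines."""
    statuses = {}
    for line in text.splitlines():
        fields = line.split()
        if len(fields) == 3:
            status, _test_name, phase = fields
            statuses[phase] = status
    return statuses

# Example: parse_test_status_text("PASS mytest BUILD\nPEND mytest RUN")
# returns {"BUILD": "PASS", "RUN": "PEND"}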
    case.load_env()

    models = case.get_values("COMP_CLASSES")
    mach = case.get_value("MACH")
    compiler = case.get_value("COMPILER")
    debug = case.get_value("DEBUG")
    mpilib = case.get_value("MPILIB")
    sysos = case.get_value("OS")
    comp_interface = case.get_value("COMP_INTERFACE")
    expect(mach is not None, "xml variable MACH is not set")

    # Creates the Macros.make, Depends.compiler, Depends.machine,
    # Depends.machine.compiler and env_mach_specific.xml files if they don't
    # already exist.
    if not os.path.isfile("Macros.make") or not os.path.isfile("env_mach_specific.xml"):
        configure(Machines(machine=mach), caseroot, ["Makefile"],
                  compiler, mpilib, debug, comp_interface, sysos)

    # Set tasks to 1 if mpi-serial library
    if mpilib == "mpi-serial":
        for vid, value in case:
            if vid.startswith("NTASKS") and value != 1:
                case.set_value(vid, 1)

    # Check ninst.
    # In CIME there can be multiple instances of each component model (an
    # ensemble); NINST is the number of instances of that component.
    comp_interface = case.get_value("COMP_INTERFACE")
    if comp_interface == "nuopc":
        ninst = case.get_value("NINST")
        multi_driver = case.get_value("MULTI_DRIVER")
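# --- Illustrative sketch (not part of the original source) ---
# The mpi-serial branch above clamps every NTASKS_* variable to 1. The same
# idea on a plain dict, for clarity; "case_vars" is a hypothetical stand-in
# for the Case object's (vid, value) iteration.
def clamp_serial_tasks(case_vars):
    """Force all NTASKS* entries to 1, as required by the mpi-serial library."""
    for vid, value in list(case_vars.items()):
        if vid.startswith("NTASKS") and value != 1:
            case_vars[vid] = 1
    return case_vars

# Example: clamp_serial_tasks({"NTASKS_ATM": 128, "NTHRDS_ATM": 2})
# returns {"NTASKS_ATM": 1, "NTHRDS_ATM": 2}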
    def _compare_baseline(self):
        """
        Compare baselines in the pergro test sense. That is,
        compare PGE from the test simulation with the baseline cloud
        """
        with self._test_status:
            self._test_status.set_status(CIME.test_status.BASELINE_PHASE,
                                         CIME.test_status.TEST_FAIL_STATUS)

            logger.debug("PGN_INFO:BASELINE COMPARISON STARTS")

            run_dir = self._case.get_value("RUNDIR")
            case_name = self._case.get_value("CASE")
            base_dir = os.path.join(self._case.get_value("BASELINE_ROOT"),
                                    self._case.get_value("BASECMP_CASE"))

            var_list = self.get_var_list()

            test_name = "{}".format(case_name.split('.')[-1])
            evv_config = {
                test_name: {
                    "module": os.path.join(evv_lib_dir, "extensions", "pg.py"),
                    "test-case": case_name,
                    "test-name": "Test",
                    "test-dir": run_dir,
                    "ref-name": "Baseline",
                    "ref-dir": base_dir,
                    "variables": var_list,
                    "perturbations": PERTURBATIONS,
                    "pge-cld": FCLD_NC,
                    "ninit": NUMBER_INITIAL_CONDITIONS,
                    "init-file-template": INIT_COND_FILE_TEMPLATE,
                    "instance-file-template": INSTANCE_FILE_TEMPLATE,
                    "init-model": "cam",
                    "component": self.atmmod,
                }
            }

            json_file = os.path.join(run_dir, '.'.join([case_name, 'json']))
            with open(json_file, 'w') as config_file:
                json.dump(evv_config, config_file, indent=4)

            evv_out_dir = os.path.join(run_dir, '.'.join([case_name, 'evv']))
            evv(['-e', json_file, '-o', evv_out_dir])

            with open(os.path.join(evv_out_dir, 'index.json'), 'r') as evv_f:
                evv_status = json.load(evv_f)

            comments = ""
            for evv_elem in evv_status['Data']['Elements']:
                if evv_elem['Type'] == 'ValSummary' \
                        and evv_elem['TableTitle'] == 'Perturbation growth test':
                    comments = "; ".join("{}: {}".format(key, val)
                                         for key, val in evv_elem['Data'][test_name][''].items())
                    if evv_elem['Data'][test_name]['']['Test status'].lower() == 'pass':
                        self._test_status.set_status(CIME.test_status.BASELINE_PHASE,
                                                     CIME.test_status.TEST_PASS_STATUS)
                    break

            status = self._test_status.get_status(CIME.test_status.BASELINE_PHASE)
            mach_name = self._case.get_value("MACH")
            mach_obj = Machines(machine=mach_name)
            htmlroot = CIME.utils.get_htmlroot(mach_obj)
            urlroot = CIME.utils.get_urlroot(mach_obj)
            if htmlroot is not None:
                with CIME.utils.SharedArea():
                    dir_util.copy_tree(evv_out_dir,
                                       os.path.join(htmlroot, 'evv', case_name),
                                       preserve_mode=False)
                if urlroot is None:
                    urlroot = "[{}_URL]".format(mach_name.capitalize())
                viewing = "{}/evv/{}/index.html".format(urlroot, case_name)
            else:
                viewing = "{}\n" \
                          "    EVV viewing instructions can be found at: " \
                          "    https://github.com/E3SM-Project/E3SM/blob/master/cime/scripts/" \
                          "climate_reproducibility/README.md#test-passfail-and-extended-output" \
                          "".format(evv_out_dir)

            comments = "{} {} for test '{}'.\n" \
                       "    {}\n" \
                       "    EVV results can be viewed at:\n" \
                       "    {}".format(CIME.test_status.BASELINE_PHASE,
                                       status, test_name, comments, viewing)

            CIME.utils.append_testlog(comments, self._orig_caseroot)
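# --- Illustrative sketch (not part of the original source) ---
# _compare_baseline above walks the evv index.json looking for the
# 'Perturbation growth test' ValSummary element. The stand-alone lookup below
# is only a sketch; the index.json structure is assumed from the loop above,
# and the function name is hypothetical.
import json

def pgn_test_passed(index_json_path, test_name):
    """Return True if the evv perturbation-growth summary reports a pass."""
    with open(index_json_path, 'r') as evv_f:
        evv_status = json.load(evv_f)
    for elem in evv_status['Data']['Elements']:
        if elem['Type'] == 'ValSummary' and \
                elem['TableTitle'] == 'Perturbation growth test':
            return elem['Data'][test_name]['']['Test status'].lower() == 'pass'
    return False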
    def __init__(self, test_names, test_data=None,
                 no_run=False, no_build=False, no_setup=False, no_batch=None,
                 test_root=None, test_id=None,
                 machine_name=None, compiler=None,
                 baseline_root=None, baseline_cmp_name=None, baseline_gen_name=None,
                 clean=False, namelists_only=False,
                 project=None, parallel_jobs=None,
                 walltime=None, proc_pool=None,
                 use_existing=False, save_timing=False, queue=None,
                 allow_baseline_overwrite=False, output_root=None):
    ###########################################################################
        self._cime_root = CIME.utils.get_cime_root()
        self._cime_model = CIME.utils.get_model()
        self._allow_baseline_overwrite = allow_baseline_overwrite
        self._save_timing = save_timing
        self._queue = queue
        self._test_data = {} if test_data is None else test_data # Format: {test_name -> {data_name -> data}}

        self._machobj = Machines(machine=machine_name)

        self._no_setup = no_setup
        self._no_build = no_build or no_setup or namelists_only
        self._no_run = no_run or self._no_build
        self._output_root = output_root

        # Figure out what project to use
        if project is None:
            self._project = CIME.utils.get_project()
            if self._project is None:
                self._project = self._machobj.get_value("PROJECT")
        else:
            self._project = project

        # We will not use batch system if user asked for no_batch or if current
        # machine is not a batch machine
        self._no_batch = no_batch or not self._machobj.has_batch_system()
        expect(not (self._no_batch and self._queue is not None),
               "Does not make sense to request a queue without batch system")

        # Determine and resolve test_root
        if test_root is not None:
            self._test_root = test_root
        elif self._output_root is not None:
            self._test_root = self._output_root
        else:
            self._test_root = self._machobj.get_value("CIME_OUTPUT_ROOT")

        if self._project is not None:
            self._test_root = self._test_root.replace("$PROJECT", self._project)

        self._test_root = os.path.abspath(self._test_root)
        self._test_id = test_id if test_id is not None else CIME.utils.get_timestamp()

        self._compiler = self._machobj.get_default_compiler() if compiler is None else compiler

        self._clean = clean
        self._namelists_only = namelists_only

        self._walltime = walltime

        if parallel_jobs is None:
            self._parallel_jobs = min(len(test_names),
                                      int(self._machobj.get_value("MAX_TASKS_PER_NODE")))
        else:
            self._parallel_jobs = parallel_jobs

        self._baseline_cmp_name = baseline_cmp_name # Implies comparison should be done if not None
        self._baseline_gen_name = baseline_gen_name # Implies generation should be done if not None

        if baseline_cmp_name or baseline_gen_name:
            # Compute baseline_root
            self._baseline_root = baseline_root if baseline_root is not None \
                else self._machobj.get_value("BASELINE_ROOT")

            if self._project is not None:
                self._baseline_root = self._baseline_root.replace("$PROJECT", self._project)

            self._baseline_root = os.path.abspath(self._baseline_root)

            if self._baseline_cmp_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
                expect(os.path.isdir(full_baseline_dir),
                       "Missing baseline comparison directory %s" % full_baseline_dir)

            # The following ensures that an existing generate directory is not
            # overwritten accidentally.
            if self._baseline_gen_name:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_gen_name)
                existing_baselines = []
                for test_name in test_names:
                    test_baseline = os.path.join(full_baseline_dir, test_name)
                    if os.path.isdir(test_baseline):
                        existing_baselines.append(test_baseline)
                expect(allow_baseline_overwrite or len(existing_baselines) == 0,
                       "Baseline directories already exist: %s\n"
                       "Use --allow_baseline_overwrite to avoid this error" % existing_baselines)
        else:
            self._baseline_root = None

        # This is the only data that multiple threads will simultaneously access.
        # Each test has its own value, and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without a mutex.
        # name -> (phase, status)
        self._tests = {}
        for test_name in test_names:
            self._tests[test_name] = (TEST_START, TEST_PASS_STATUS)

        # Oversubscribe by 1/4
        if proc_pool is None:
            pes = int(self._machobj.get_value("PES_PER_NODE"))
            self._proc_pool = int(pes * 1.25)
        else:
            self._proc_pool = int(proc_pool)

        self._procs_avail = self._proc_pool

        # Setup phases
        self._phases = list(PHASES)
        if self._no_setup:
            self._phases.remove(SETUP_PHASE)
        if self._no_build:
            self._phases.remove(SHAREDLIB_BUILD_PHASE)
            self._phases.remove(MODEL_BUILD_PHASE)
        if self._no_run:
            self._phases.remove(RUN_PHASE)

        if use_existing:
            for test in self._tests:
                ts = TestStatus(self._get_test_dir(test))
                for phase, status in ts:
                    if phase in CORE_PHASES:
                        if status in [TEST_PEND_STATUS, TEST_FAIL_STATUS]:
                            # We need to pick up here
                            break
                        else:
                            self._update_test_status(test, phase, TEST_PEND_STATUS)
                            self._update_test_status(test, phase, status)
        else:
            # None of the test directories should already exist.
            for test in self._tests:
                expect(not os.path.exists(self._get_test_dir(test)),
                       "Cannot create new case in directory '%s', it already exists."
                       " Pick a different test-id" % self._get_test_dir(test))
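# --- Illustrative sketch (not part of the original source) ---
# The constructor above sizes its processor pool at PES_PER_NODE * 1.25
# (oversubscribed by 1/4) and then debits/credits the pool as phase threads
# start and finish. The accounting pattern, reduced to its essentials; the
# class and method names here are hypothetical.
class ProcPool(object):
    def __init__(self, pes_per_node):
        # Oversubscribe by 1/4, matching the constructor above
        self.avail = int(pes_per_node * 1.25)

    def acquire(self, procs_needed):
        """Debit the pool if enough procs are free; return success."""
        if procs_needed <= self.avail:
            self.avail -= procs_needed
            return True
        return False

    def release(self, procs_needed):
        """Credit procs back when a phase thread finishes."""
        self.avail += procs_needed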
class SystemTest(object):
###############################################################################

    ###########################################################################
    def __init__(self, test_names,
                 no_run=False, no_build=False, no_batch=None,
                 test_root=None, test_id=None,
                 machine_name=None, compiler=None,
                 baseline_root=None, baseline_name=None,
                 clean=False, compare=False, generate=False, namelists_only=False,
                 project=None, parallel_jobs=None,
                 xml_machine=None, xml_compiler=None, xml_category=None, xml_testlist=None,
                 walltime=None, proc_pool=None, use_existing=False):
    ###########################################################################
        self._cime_root = CIME.utils.get_cime_root()
        self._cime_model = CIME.utils.get_model()
        # needed for perl interface
        os.environ["CIMEROOT"] = self._cime_root

        # If machine_name is set, use it; otherwise, if xml_machine is set, use
        # it; otherwise probe for machine_name.
        if machine_name is None:
            machine_name = xml_machine

        self._machobj = Machines(machine=machine_name)
        machine_name = self._machobj.get_machine_name()

        self._no_build = no_build if not namelists_only else True
        self._no_run = no_run if not self._no_build else True

        # Figure out what project to use
        if project is None:
            self._project = CIME.utils.get_project()
            if self._project is None:
                self._project = self._machobj.get_value("PROJECT")
        else:
            self._project = project

        # We will not use batch system if user asked for no_batch or if current
        # machine is not a batch machine
        self._no_batch = no_batch or not self._machobj.has_batch_system()

        self._test_root = test_root if test_root is not None \
            else self._machobj.get_value("CESMSCRATCHROOT")
        if self._project is not None:
            self._test_root = self._test_root.replace("$PROJECT", self._project)
        self._test_root = os.path.abspath(self._test_root)

        self._test_id = test_id if test_id is not None else CIME.utils.get_utc_timestamp()

        # If compiler is set, use it; otherwise, if xml_compiler is set, use
        # it; otherwise use the default compiler for the machine.
        if compiler is not None:
            self._compiler = compiler
        elif xml_compiler is not None:
            self._compiler = xml_compiler
        else:
            self._compiler = self._machobj.get_default_compiler()
        expect(self._machobj.is_valid_compiler(self._compiler),
               "Compiler %s not valid for machine %s" % (self._compiler, machine_name))

        self._clean = clean
        self._namelists_only = namelists_only

        # Extra data associated with tests; do not modify after construction.
        # test_name -> test_data
        # test_data: name -> value
        self._test_xml = {}

        # If xml options are provided, get tests from the xml file; otherwise
        # use the acme dictionary.
        if not test_names and (xml_machine is not None or xml_category is not None or
                               xml_compiler is not None or xml_testlist is not None):
            test_data = CIME.test_utils.get_tests_from_xml(xml_machine, xml_category, xml_compiler,
                                                           xml_testlist, machine_name, compiler)
            test_names = [item["name"] for item in test_data]
            for test_datum in test_data:
                self._test_xml[test_datum["name"]] = test_datum
        else:
            expect(len(test_names) > 0, "No tests to run")
            test_names = update_acme_tests.get_full_test_names(test_names, machine_name, self._compiler)

        if walltime is not None:
            for test in test_names:
                if test in self._test_xml:
                    test_datum = self._test_xml[test]
                else:
                    test_datum = {}
                    self._test_xml[test] = test_datum
                test_datum["wallclock"] = walltime

        if parallel_jobs is None:
            self._parallel_jobs = min(len(test_names),
                                      int(self._machobj.get_value("MAX_TASKS_PER_NODE")))
        else:
            self._parallel_jobs = parallel_jobs

        self._baseline_cmp_name = None
        self._baseline_gen_name = None
        self._compare = False
        self._generate = False
        if compare or generate:
            # Figure out what baseline name to use
            if baseline_name is None:
                if compare is not None and isinstance(compare, str):
                    self._baseline_cmp_name = compare
                    self._compare = True
                if generate is not None and isinstance(generate, str):
                    self._baseline_gen_name = generate
                    self._generate = True

                if self._compare and self._baseline_cmp_name is None:
                    branch_name = CIME.utils.get_current_branch(repo=self._cime_root)
                    expect(branch_name is not None,
                           "Could not determine baseline name from branch, please use -b option")
                    self._baseline_cmp_name = os.path.join(self._compiler, branch_name)
                if self._generate and self._baseline_gen_name is None:
                    branch_name = CIME.utils.get_current_branch(repo=self._cime_root)
                    expect(branch_name is not None,
                           "Could not determine baseline name from branch, please use -b option")
                    self._baseline_gen_name = os.path.join(self._compiler, branch_name)
            else:
                if compare:
                    self._compare = True
                    self._baseline_cmp_name = baseline_name
                    if not self._baseline_cmp_name.startswith("%s/" % self._compiler):
                        self._baseline_cmp_name = os.path.join(self._compiler, self._baseline_cmp_name)
                if generate:
                    self._generate = True
                    self._baseline_gen_name = baseline_name
                    if not self._baseline_gen_name.startswith("%s/" % self._compiler):
                        self._baseline_gen_name = os.path.join(self._compiler, self._baseline_gen_name)

            # Compute baseline_root
            self._baseline_root = baseline_root if baseline_root is not None \
                else self._machobj.get_value("CCSM_BASELINE")
            if self._project is not None:
                self._baseline_root = self._baseline_root.replace("$PROJECT", self._project)
            self._baseline_root = os.path.abspath(self._baseline_root)

            if self._compare:
                full_baseline_dir = os.path.join(self._baseline_root, self._baseline_cmp_name)
                expect(os.path.isdir(full_baseline_dir),
                       "Missing baseline comparison directory %s" % full_baseline_dir)
        else:
            self._baseline_root = None

        # This is the only data that multiple threads will simultaneously access.
        # Each test has its own value, and setting/retrieving items from a dict
        # is atomic, so this should be fine to use without a mutex.
        # Since the namelist phase can fail without aborting later phases, we
        # need some extra state to remember tests that had namelist problems.
        # name -> (phase, status, has_namelist_problem)
        self._tests = {}
        for test_name in test_names:
            self._tests[test_name] = (INITIAL_PHASE, TEST_PASS_STATUS, False)

        # Oversubscribe by 1/4
        if proc_pool is None:
            pes = int(self._machobj.get_value("PES_PER_NODE"))
            self._proc_pool = int(pes * 1.25)
        else:
            self._proc_pool = int(proc_pool)

        self._procs_avail = self._proc_pool

        # Setup phases
        self._phases = list(PHASES)
        if no_build:
            self._phases.remove(SHAREDLIB_BUILD_PHASE)
            self._phases.remove(MODEL_BUILD_PHASE)
        if no_run:
            self._phases.remove(RUN_PHASE)
        if not self._compare and not self._generate:
            self._phases.remove(NAMELIST_PHASE)

        if use_existing:
            for test in self._tests:
                test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME)
                statuses = wait_for_tests.parse_test_status_file(test_status_file)[0]
                for phase, status in statuses.iteritems():
                    if phase != INITIAL_PHASE:
                        self._update_test_status(test, phase, TEST_PENDING_STATUS)
                        self._update_test_status(test, phase, status)
        else:
            # None of the test directories should already exist.
            for test in self._tests:
                expect(not os.path.exists(self._get_test_dir(test)),
                       "Cannot create new case in directory '%s', it already exists."
                       " Pick a different test-id" % self._get_test_dir(test))

        # By the end of this constructor, this program should never hard abort;
        # instead, errors will be placed in the TestStatus files for the various
        # test cases.

    ###########################################################################
    def _log_output(self, test, output):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        if not os.path.isdir(test_dir):
            # Note: making this directory could cause create_newcase to fail
            # if this is run before create_newcase.
            os.makedirs(test_dir)
        append_status(output, caseroot=test_dir, sfile="TestStatus.log")

    ###########################################################################
    def _get_case_id(self, test):
    ###########################################################################
        baseline_action_code = ".C" if self._compare else (".G" if self._generate else "")
        return "%s%s.%s" % (test, baseline_action_code, self._test_id)

    ###########################################################################
    def _get_test_dir(self, test):
    ###########################################################################
        return os.path.join(self._test_root, self._get_case_id(test))

    ###########################################################################
    def _get_test_data(self, test):
    ###########################################################################
        # Must be atomic
        return self._tests[test]

    ###########################################################################
    def _is_broken(self, test):
    ###########################################################################
        status = self._get_test_status(test)
        return status not in CONTINUE and status != TEST_PENDING_STATUS

    ###########################################################################
    def _work_remains(self, test):
    ###########################################################################
        test_phase, test_status, _ = self._get_test_data(test)
        return (test_status in CONTINUE or test_status == TEST_PENDING_STATUS) and \
            test_phase != self._phases[-1]

    ###########################################################################
    def _get_test_status(self, test, phase=None):
    ###########################################################################
        curr_phase, curr_status, nl_fail = self._get_test_data(test)
        if phase == NAMELIST_PHASE and nl_fail:
            return NAMELIST_FAIL_STATUS
        elif phase is None or phase == curr_phase:
            return curr_status
        else:
            expect(phase is None or self._phases.index(phase) < self._phases.index(curr_phase),
                   "Tried to see the future")
            # Assume all older phases PASSed
            return TEST_PASS_STATUS

    ###########################################################################
    def _get_test_phase(self, test):
    ###########################################################################
        return self._get_test_data(test)[0]

    ###########################################################################
    def _update_test_status(self, test, phase, status):
    ###########################################################################
        phase_idx = self._phases.index(phase)
        old_phase, old_status, old_nl_fail = self._get_test_data(test)

        if old_phase == phase:
            expect(old_status == TEST_PENDING_STATUS,
                   "Only valid to transition from PENDING to something else, found '%s' for phase '%s'" %
                   (old_status, phase))
            expect(status != TEST_PENDING_STATUS,
                   "Cannot transition from PEND -> PEND")
        else:
            expect(old_status in CONTINUE,
                   "Why did we move on to next phase when prior phase did not pass?")
            expect(status == TEST_PENDING_STATUS,
                   "New phase should be set to pending status")
            expect(self._phases.index(old_phase) == phase_idx - 1,
                   "Skipped phase?")

        # Must be atomic
        self._tests[test] = (phase, status, old_nl_fail)

    ###########################################################################
    def _test_has_nl_problem(self, test):
    ###########################################################################
        curr_phase, curr_status, _ = self._get_test_data(test)
        expect(curr_phase == NAMELIST_PHASE, "Setting namelist status outside of namelist phase?")
        # Must be atomic
        self._tests[test] = (curr_phase, curr_status, True)

    ###########################################################################
    def _shell_cmd_for_phase(self, test, cmd, phase, from_dir=None):
    ###########################################################################
        while True:
            rc, output, errput = run_cmd(cmd, ok_to_fail=True, from_dir=from_dir)
            if rc != 0:
                self._log_output(test,
                                 "%s FAILED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s" %
                                 (phase, test, cmd, output, errput))
                # Temporary hack to get around odd file descriptor use by
                # buildnml scripts.
                if "bad interpreter" in errput:
                    time.sleep(1)
                    continue
                else:
                    break
            else:
                self._log_output(test,
                                 "%s PASSED for test '%s'.\nCommand: %s\nOutput: %s\n\nErrput: %s" %
                                 (phase, test, cmd, output, errput))
                break

        return rc == 0

    ###########################################################################
    def _create_newcase_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)

        _, case_opts, grid, compset, \
            machine, compiler, test_mods = CIME.utils.parse_test_name(test)
        if compiler != self._compiler:
            raise StandardError("Test '%s' has compiler that does"
                                " not match instance compiler '%s'" % (test, self._compiler))

        create_newcase_cmd = "%s --case %s --res %s --mach %s --compiler %s --compset %s" \
                             " --project %s --test" % \
                             (os.path.join(self._cime_root, "scripts", "create_newcase"),
                              test_dir, grid, machine, compiler, compset, self._project)

        if test_mods is not None:
            files = Files()
            (component, modspath) = test_mods.split('/', 1)
            testmods_dir = files.get_value("TESTS_MODS_DIR", {"component": component})

            test_mod_file = os.path.join(testmods_dir, component, modspath)
            if not os.path.exists(test_mod_file):
                self._log_output(test, "Missing testmod file '%s'" % test_mod_file)
                return False
            create_newcase_cmd += " --user-mods-dir %s" % test_mod_file

        if case_opts is not None:
            for case_opt in case_opts:
                if case_opt.startswith('M'):
                    mpilib = case_opt[1:]
                    create_newcase_cmd += " --mpilib %s" % mpilib
                    logger.debug(" MPILIB set to %s" % mpilib)
                if case_opt.startswith('N'):
                    ninst = case_opt[1:]
                    create_newcase_cmd += " --ninst %s" % ninst
                    logger.debug(" NINST set to %s" % ninst)
                pesize = re.match('P([SMLX][12]?)', case_opt)
                if pesize:
                    create_newcase_cmd += " --pecount %s" % pesize.group(1)

        logger.debug("Calling create_newcase: " + create_newcase_cmd)
        return self._shell_cmd_for_phase(test, create_newcase_cmd, CREATE_NEWCASE_PHASE)

    ###########################################################################
    def _xml_phase(self, test):
    ###########################################################################
        test_case = CIME.utils.parse_test_name(test)[0]

        # Create, fill and write an envtest object
        test_dir = self._get_test_dir(test)
        envtest = EnvTest(test_dir)

        # Determine the list of component classes that this coupler/driver knows
        # how to deal with. This list follows the same order as compset longnames.
        files = Files()
        drv_config_file = files.get_value("CONFIG_DRV_FILE")
        drv_comp = Component(drv_config_file)
        component_classes = drv_comp.get_valid_model_components()
        envtest.add_elements_by_group(drv_comp, {}, "env_test.xml")

        envtest.set_value("TESTCASE", test_case)
        envtest.set_value("TEST_TESTID", self._test_id)
        envtest.set_value("CASEBASEID", test)

        test_argv = "-testname %s -testroot %s" % (test, self._test_root)
        if self._generate:
            test_argv += " -generate %s" % self._baseline_gen_name
            envtest.set_value("BASELINE_NAME_GEN", self._baseline_gen_name)
            envtest.set_value("BASEGEN_CASE", os.path.join(self._baseline_gen_name, test))
        if self._compare:
            test_argv += " -compare %s" % self._baseline_cmp_name
            envtest.set_value("BASELINE_NAME_CMP", self._baseline_cmp_name)
            envtest.set_value("BASECMP_CASE", os.path.join(self._baseline_cmp_name, test))

        envtest.set_value("TEST_ARGV", test_argv)
        envtest.set_value("CLEANUP", self._clean)

        if self._generate or self._compare:
            envtest.set_value("BASELINE_ROOT", self._baseline_root)
        envtest.set_value("GENERATE_BASELINE", self._generate)
        envtest.set_value("COMPARE_BASELINE", self._compare)
        envtest.set_value("CCSM_CPRNC", self._machobj.get_value("CCSM_CPRNC", resolved=False))

        # Add the test instructions from config_test to env_test in the case
        config_test = Tests()
        testnode = config_test.get_test_node(test_case)
        envtest.add_test(testnode)

        # Determine the test_case from the test name
        test_case, case_opts = CIME.utils.parse_test_name(test)[:2]

        # Determine case_opts from the test_case
        if case_opts is not None:
            logger.debug("case_opts are %s" % case_opts)
            for opt in case_opts:
                logger.debug("case_opt is %s" % opt)
                if opt == 'D':
                    envtest.set_test_parameter("DEBUG", "TRUE")
                    logger.debug(" DEBUG set to TRUE")

                elif opt == 'E':
                    envtest.set_test_parameter("USE_ESMF_LIB", "TRUE")
                    envtest.set_test_parameter("COMP_INTERFACE", "ESMF")
                    logger.debug(" USE_ESMF_LIB set to TRUE")
                    logger.debug(" COMP_INTERFACE set to ESMF")

                elif opt == 'CG':
                    envtest.set_test_parameter("CALENDAR", "GREGORIAN")
                    logger.debug(" CALENDAR set to %s" % opt)

                elif opt.startswith('L'):
                    match = re.match('L([A-Za-z])([0-9]*)', opt)
                    stop_option = {"y": "nyears", "m": "nmonths", "d": "ndays",
                                   "h": "nhours", "s": "nseconds", "n": "nsteps"}
                    opt = match.group(1)
                    envtest.set_test_parameter("STOP_OPTION", stop_option[opt])
                    opti = match.group(2)
                    envtest.set_test_parameter("STOP_N", opti)
                    logger.debug(" STOP_OPTION set to %s" % stop_option[opt])
                    logger.debug(" STOP_N set to %s" % opti)

                elif opt.startswith('M'):
                    # M option handled by create_newcase
                    continue

                elif opt.startswith('P'):
                    match1 = re.match('P([0-9]+)', opt)
                    match2 = re.match('P([0-9]+)x([0-9]+)', opt)
                    match3 = re.match('P[SMLX][12]?', opt)
                    opti_tasks = None
                    if match1:
                        opti_tasks = match1.group(1)
                        for component_class in component_classes:
                            if component_class == "DRV":
                                component_class = "CPL"
                            string = "NTASKS_" + component_class
                            envtest.set_test_parameter(string, opti_tasks)
                            string = "NTHRDS_" + component_class
                            envtest.set_test_parameter(string, str(1))
                            string = "ROOTPE_" + component_class
                            envtest.set_test_parameter(string, str(0))
                        opti_thrds = 1
                    elif match2:
                        opti_tasks = match2.group(1)
                        opti_thrds = match2.group(2)
                        for component_class in component_classes:
                            if component_class == "DRV":
                                component_class = "CPL"
                            string = "NTASKS_" + component_class
                            envtest.set_test_parameter(string, opti_tasks)
                            string = "NTHRDS_" + component_class
                            envtest.set_test_parameter(string, opti_thrds)
                            string = "ROOTPE_" + component_class
                            envtest.set_test_parameter(string, str(0))
                    elif match3:
                        # handled by create_newcase
                        continue
                    if not match3:
                        expect(opti_tasks is not None, "No match found for PE option %s" % opt)
                        logger.debug(" NTASKS_xxx set to %s" % opti_tasks)
                        logger.debug(" NTHRDS_xxx set to %s" % opti_thrds)
                        logger.debug(" ROOTPE_xxx set to %s 0")

                elif opt.startswith('N'):
                    # handled in create_newcase
                    continue
                elif opt.startswith('IOP'):
                    logger.warn("IOP test option not yet implemented")
                else:
                    expect(False, "Could not parse option '%s'" % opt)

        envtest.write()

        lockedfiles = os.path.join(test_dir, "LockedFiles")
        try:
            os.stat(lockedfiles)
        except OSError:
            os.mkdir(lockedfiles)
        shutil.copy(os.path.join(test_dir, "env_run.xml"),
                    os.path.join(lockedfiles, "env_run.orig.xml"))

        case = Case(test_dir)
        case.set_value("SHAREDLIBROOT",
                       os.path.join(self._test_root, "sharedlibroot.%s" % self._test_id))
        envtest.set_initial_values(case)
        return True

    ###########################################################################
    def _setup_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.setup", SETUP_PHASE, from_dir=test_dir)

    ###########################################################################
    def _nlcomp_phase(self, test):
    ###########################################################################
        test_dir       = self._get_test_dir(test)
        casedoc_dir    = os.path.join(test_dir, "CaseDocs")
        compare_nl     = os.path.join(CIME.utils.get_scripts_root(), "Tools", "compare_namelists")
        simple_compare = os.path.join(CIME.utils.get_scripts_root(), "Tools", "simple_compare")

        if self._compare:
            has_fails = False
            baseline_dir      = os.path.join(self._baseline_root, self._baseline_cmp_name, test)
            baseline_casedocs = os.path.join(baseline_dir, "CaseDocs")

            # Start off by comparing everything in CaseDocs except a few arbitrary files (ugh!)
            # TODO: Namelist files should have consistent suffix
            all_items_to_compare = [item for item in glob.glob("%s/*" % casedoc_dir)
                                    if "README" not in os.path.basename(item)
                                    and not item.endswith("doc")
                                    and not item.endswith("prescribed")
                                    and not os.path.basename(item).startswith(".")] + \
                glob.glob("%s/*user_nl*" % test_dir)
            for item in all_items_to_compare:
                baseline_counterpart = os.path.join(baseline_casedocs
                                                    if os.path.dirname(item).endswith("CaseDocs")
                                                    else baseline_dir,
                                                    os.path.basename(item))
                if not os.path.exists(baseline_counterpart):
                    self._log_output(test, "Missing baseline namelist '%s'" % baseline_counterpart)
                    has_fails = True
                else:
                    if compare_namelists.is_namelist_file(item):
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1" %
                                                (compare_nl, baseline_counterpart, item, test),
                                                ok_to_fail=True)
                    else:
                        rc, output, _ = run_cmd("%s %s %s -c %s 2>&1" %
                                                (simple_compare, baseline_counterpart, item, test),
                                                ok_to_fail=True)

                    if rc != 0:
                        has_fails = True
                        self._log_output(test, output)

            if has_fails:
                self._test_has_nl_problem(test)

        if self._generate:
            baseline_dir      = os.path.join(self._baseline_root, self._baseline_gen_name, test)
            baseline_casedocs = os.path.join(baseline_dir, "CaseDocs")
            if not os.path.isdir(baseline_dir):
                os.makedirs(baseline_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IXOTH | stat.S_IROTH)

            if os.path.isdir(baseline_casedocs):
                shutil.rmtree(baseline_casedocs)

            shutil.copytree(casedoc_dir, baseline_casedocs)
            os.chmod(baseline_casedocs, stat.S_IRWXU | stat.S_IRWXG | stat.S_IXOTH | stat.S_IROTH)
            for item in glob.glob("%s/*" % baseline_casedocs):
                os.chmod(item, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            for item in glob.glob(os.path.join(test_dir, "user_nl*")):
                preexisting_baseline = os.path.join(baseline_dir, os.path.basename(item))
                if (os.path.exists(preexisting_baseline)):
                    os.remove(preexisting_baseline)
                shutil.copy2(item, baseline_dir)
                os.chmod(preexisting_baseline, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

        # Always mark as passed unless we hit exception
        return True

    ###########################################################################
    def _sharedlib_build_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.build --sharedlib-only", SHAREDLIB_BUILD_PHASE, from_dir=test_dir)

    ###########################################################################
    def _model_build_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        return self._shell_cmd_for_phase(test, "./case.build --model-only", MODEL_BUILD_PHASE, from_dir=test_dir)

    ###########################################################################
    def _run_phase(self, test):
    ###########################################################################
        test_dir = self._get_test_dir(test)
        # wallclock is an optional field in the version 2.0 testlist.xml file.
        # Setting the wallclock time close to the expected test time will help
        # queue throughput.
        if test in self._test_xml and "wallclock" in self._test_xml[test]:
            run_cmd("./xmlchange JOB_WALLCLOCK_TIME=%s" % self._test_xml[test]["wallclock"], from_dir=test_dir)

        if self._no_batch:
            cmd = "./case.submit --no-batch"
        else:
            cmd = "./case.submit"

        return self._shell_cmd_for_phase(test, cmd, RUN_PHASE, from_dir=test_dir)

    ###########################################################################
    def _update_test_status_file(self, test):
    ###########################################################################
        # TODO: The run scripts heavily use the TestStatus file, so we write out
        # the phases we have taken care of and then let the run scripts go from
        # there. Eventually, it would be nice to have TestStatus management
        # encapsulated in a single place.
        str_to_write = ""
        made_it_to_phase = self._get_test_phase(test)
        made_it_to_phase_idx = self._phases.index(made_it_to_phase)
        for phase in self._phases[0:made_it_to_phase_idx+1]:
            str_to_write += "%s %s %s\n" % (self._get_test_status(test, phase), test, phase)

        if not self._no_run and not self._is_broken(test) and made_it_to_phase == MODEL_BUILD_PHASE:
            # Ensure PEND state always gets added to TestStatus file if we are
            # about to run test
            str_to_write += "%s %s %s\n" % (TEST_PENDING_STATUS, test, RUN_PHASE)

        test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME)
        with open(test_status_file, "w") as fd:
            fd.write(str_to_write)

    ###########################################################################
    def _run_catch_exceptions(self, test, phase, run):
    ###########################################################################
        try:
            return run(test)
        except (SystemExit, Exception) as e:
            exc_tb = sys.exc_info()[2]
            errput = "Test '%s' failed in phase '%s' with exception '%s'" % (test, phase, str(e))
            self._log_output(test, errput)
            logger.warning("Caught exception: %s" % str(e))
            traceback.print_tb(exc_tb)
            return False

    ###########################################################################
    def _get_procs_needed(self, test, phase, threads_in_flight=None):
    ###########################################################################
        if phase == RUN_PHASE and self._no_batch:
            test_dir = self._get_test_dir(test)
            out = run_cmd("./xmlquery TOTALPES -value", from_dir=test_dir)
            return int(out)
        elif (phase == SHAREDLIB_BUILD_PHASE):
            # Will force serialization of sharedlib builds.
            # TODO - instead of serializing, compute all library configs needed
            # and build them all in parallel.
            for _, _, running_phase in threads_in_flight.values():
                if (running_phase == SHAREDLIB_BUILD_PHASE):
                    return self._proc_pool + 1

            return 1
        elif (phase == MODEL_BUILD_PHASE):
            # Model builds now happen in parallel
            return 4
        else:
            return 1

    ###########################################################################
    def _handle_test_status_file(self, test, test_phase, success):
    ###########################################################################
        #
        # This complexity is due to sharing of TestStatus responsibilities
        #
        try:
            if test_phase != RUN_PHASE and \
                    (not success or test_phase == MODEL_BUILD_PHASE or test_phase == self._phases[-1]):
                self._update_test_status_file(test)

            # If we failed VERY early on in the run phase, it's possible that
            # the CIME scripts never got a chance to set the state.
            elif test_phase == RUN_PHASE and not success:
                test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME)
                statuses = wait_for_tests.parse_test_status_file(test_status_file)[0]
                if RUN_PHASE not in statuses or \
                        (statuses[RUN_PHASE] in [TEST_PASS_STATUS, TEST_PENDING_STATUS]):
                    self._update_test_status_file(test)

        except Exception as e:
            # TODO: What to do here? This failure is very severe because the
            # only way for test results to be communicated is by the TestStatus
            # file.
            logger.critical("VERY BAD! Could not handle TestStatus file '%s': '%s'" %
                            (os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME), str(e)))
            thread.interrupt_main()

    ###########################################################################
    def _wait_for_something_to_finish(self, threads_in_flight):
    ###########################################################################
        expect(len(threads_in_flight) <= self._parallel_jobs, "Oversubscribed?")
        finished_tests = []
        while not finished_tests:
            for test, thread_info in threads_in_flight.iteritems():
                if not thread_info[0].is_alive():
                    finished_tests.append((test, thread_info[1]))

            if not finished_tests:
                time.sleep(0.2)

        for finished_test, procs_needed in finished_tests:
            self._procs_avail += procs_needed
            del threads_in_flight[finished_test]

    ###########################################################################
    def _consumer(self, test, test_phase, phase_method):
    ###########################################################################
        before_time = time.time()
        success = self._run_catch_exceptions(test, test_phase, phase_method)
        elapsed_time = time.time() - before_time
        status = (TEST_PENDING_STATUS if test_phase == RUN_PHASE and not
                  self._no_batch else TEST_PASS_STATUS) if success else TEST_FAIL_STATUS
        if status != TEST_PENDING_STATUS:
            self._update_test_status(test, test_phase, status)
        self._handle_test_status_file(test, test_phase, success)

        status_str = "Finished %s for test %s in %f seconds (%s)" % \
                     (test_phase, test, elapsed_time, status)
        if not success:
            status_str += "    Case dir: %s" % self._get_test_dir(test)
        logger.info(status_str)

        # On batch systems, we want to immediately submit to the queue, because
        # it's very cheap to submit and will get us a better spot in line
        if (success and not self._no_run and not self._no_batch and test_phase == MODEL_BUILD_PHASE):
            logger.info("Starting %s for test %s with %d procs" % (RUN_PHASE, test, 1))
            self._update_test_status(test, RUN_PHASE, TEST_PENDING_STATUS)
            self._consumer(test, RUN_PHASE, self._run_phase)

    ###########################################################################
    def _producer(self):
    ###########################################################################
        threads_in_flight = {} # test-name -> (thread, procs, phase)

        while True:
            work_to_do = False
            num_threads_launched_this_iteration = 0
            for test in self._tests:
                logger.debug("test_name: " + test)
                # If we have no workers available, immediately wait
                if len(threads_in_flight) == self._parallel_jobs:
                    self._wait_for_something_to_finish(threads_in_flight)

                if self._work_remains(test):
                    work_to_do = True
                    if test not in threads_in_flight:
                        test_phase, test_status, _ = self._get_test_data(test)
                        expect(test_status != TEST_PENDING_STATUS, test)
                        next_phase = self._phases[self._phases.index(test_phase) + 1]
                        procs_needed = self._get_procs_needed(test, next_phase, threads_in_flight)

                        if procs_needed <= self._procs_avail:
                            self._procs_avail -= procs_needed

                            # Necessary to print this way when multiple threads printing
                            logger.info("Starting %s for test %s with %d procs" % (next_phase, test, procs_needed))

                            self._update_test_status(test, next_phase, TEST_PENDING_STATUS)
                            new_thread = threading.Thread(target=self._consumer,
                                                          args=(test, next_phase,
                                                                getattr(self, "_%s_phase" % next_phase.lower())))
                            threads_in_flight[test] = (new_thread, procs_needed, next_phase)
                            new_thread.start()
                            num_threads_launched_this_iteration += 1

            if not work_to_do:
                break

            if num_threads_launched_this_iteration == 0:
                # No free resources, wait for something in flight to finish
                self._wait_for_something_to_finish(threads_in_flight)

        for unfinished_thread, _, _ in threads_in_flight.values():
            unfinished_thread.join()

    ###########################################################################
    def _setup_cs_files(self):
    ###########################################################################
        try:
            python_libs_root = CIME.utils.get_python_libs_root()
            template_file = os.path.join(python_libs_root, "cs.status.template")
            template = open(template_file, "r").read()
            template = template.replace("<PATH>",
                                        os.path.join(self._cime_root, "scripts", "Tools")).replace("<TESTID>", self._test_id)

            cs_status_file = os.path.join(self._test_root, "cs.status.%s" % self._test_id)
            with open(cs_status_file, "w") as fd:
                fd.write(template)
            os.chmod(cs_status_file, os.stat(cs_status_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

            template_file = os.path.join(python_libs_root, "cs.submit.template")
            template = open(template_file, "r").read()
            build_cmd = "./*.build" if self._no_build else ":"
            run_cmd = "./*.test" if self._no_batch else "./*.submit"
            template = template.replace("<BUILD_CMD>", build_cmd).replace("<RUN_CMD>", run_cmd).replace("<TESTID>", self._test_id)

            if self._no_build or self._no_run:
                cs_submit_file = os.path.join(self._test_root, "cs.submit.%s" % self._test_id)
                with open(cs_submit_file, "w") as fd:
                    fd.write(template)
                os.chmod(cs_submit_file, os.stat(cs_submit_file).st_mode | stat.S_IXUSR | stat.S_IXGRP)

        except Exception as e:
            logger.warning("FAILED to set up cs files: %s" % str(e))

    ###########################################################################
    def system_test(self):
    ###########################################################################
        """
        Main API for this class. Return True if all tests passed.
        """
        start_time = time.time()

        # Tell user what will be run
        logger.info("RUNNING TESTS:")
        for test in self._tests:
            logger.info("  %s" % test)

        # TODO - documentation
        self._producer()

        expect(threading.active_count() == 1, "Leftover threads?")

        # Setup cs files
        self._setup_cs_files()

        # Return True if all tests passed
        logger.info("At system_test close, state is:")
        rv = True
        for test in self._tests:
            phase, status, nl_fail = self._get_test_data(test)
            logger.debug("phase %s status %s" % (phase, status))
            if status == TEST_PASS_STATUS and phase == RUN_PHASE:
                # Be cautious about telling the user that the test passed. This
                # status should match what they would see on the dashboard. Our
                # self._test_states does not include comparison fail information,
                # so we need to parse test status.
                test_status_file = os.path.join(self._get_test_dir(test), TEST_STATUS_FILENAME)
                status = wait_for_tests.interpret_status_file(test_status_file)[1]

            if status not in [TEST_PASS_STATUS, TEST_PENDING_STATUS]:
                logger.info("%s %s (phase %s)" % (status, test, phase))
                rv = False

            elif nl_fail:
                logger.info("%s %s (but otherwise OK)" % (NAMELIST_FAIL_STATUS, test))
                rv = False

            else:
                logger.info("status=%s test=%s phase=%s" % (status, test, phase))

            logger.info("    Case dir: %s" % self._get_test_dir(test))

        logger.info("system_test took %s seconds" % (time.time() - start_time))

        return rv
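# --- Illustrative sketch (not part of the original source) ---
# _producer/_consumer above implement a thread-per-phase scheduler: the
# producer launches a thread for the next phase of any test that fits in the
# free worker slots, and finished threads are reaped and joined. A minimal,
# runnable reduction of that pattern; the function names and the trivial
# "PASS" phase body are hypothetical placeholders.
import threading
import time

def consumer(name, results):
    results[name] = "PASS"  # stand-in for running a phase and recording status

def producer(tests, max_jobs=2):
    results = {}
    in_flight = {}  # test-name -> thread
    while tests or in_flight:
        # Launch work while worker slots are free
        while tests and len(in_flight) < max_jobs:
            name = tests.pop()
            t = threading.Thread(target=consumer, args=(name, results))
            in_flight[name] = t
            t.start()
        # Reap finished threads, freeing their slots
        for name in [n for n, t in in_flight.items() if not t.is_alive()]:
            in_flight.pop(name).join()
        time.sleep(0.1)  # mirrors the polling sleep in _wait_for_something_to_finish
    return results

# Example: producer(["SMS.f19_g16.A", "ERS.f19_g16.A"]) returns
# {"SMS.f19_g16.A": "PASS", "ERS.f19_g16.A": "PASS"}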