def run_on_machine(self, machine):
###########################################################################
    """
    Run the analysis command for 'machine', locally or through ssh.

    The command comes from self.formulate_command(machine). If self._local
    is set, it is run directly. Otherwise it is wrapped in an ssh invocation
    and its combined output (or the error text, if it failed) is written to
    gather-all-results/<self._commit>/<machine>. A pre-existing results file
    is first renamed by appending '.old' until the name is unused.

    run_cmd_no_fail is asked to raise RuntimeError on failure; that exception
    is re-raised after the results file has been written.
    """
    cmd = self.formulate_command(machine)
    print("Starting analysis on {} with cmd: {}".format(machine, cmd))

    if self._local:
        run_cmd_no_fail(cmd, arg_stdout=None, arg_stderr=None, verbose=True, exc_type=RuntimeError)
    else:
        # Bind 'output' up front: the finally clause below runs for ANY
        # exception (e.g. KeyboardInterrupt), not only RuntimeError, and must
        # not fail with an UnboundLocalError that would mask the real error.
        output = ""
        try:
            ssh_cmd = "ssh -o StrictHostKeyChecking=no {} '{}'".format(machine, cmd)
            output = run_cmd_no_fail(ssh_cmd, exc_type=RuntimeError, combine_output=True)
        except RuntimeError as e:
            # Keep the error text so that it ends up in the results file
            output = str(e)
            raise
        finally:
            result_path = os.path.join("gather-all-results", self._commit, machine)
            if os.path.exists(result_path):
                # Never overwrite previous results: find an unused '.old' name
                old_path = result_path + ".old"
                while os.path.exists(old_path):
                    old_path += ".old"
                print("Warning moving old results to {}".format(old_path))
                os.rename(result_path, old_path)

            with open(result_path, "w") as fd:
                fd.write(output)

    print("Completed analysis on {}".format(machine))
def checkout_git_ref(git_ref, verbose=False, repo=None, dry_run=False):
###############################################################################
    """
    Checks out 'git_ref' in 'repo', and updates submodules.

    With dry_run=True, only prints what would be checked out. Nothing is done
    if the current commit already matches the commit of 'git_ref'. The repo
    is expected to be clean both before and after the checkout.
    """
    if dry_run:
        print("Would checkout {}".format(git_ref))
        return

    head_sha = get_current_commit(repo=repo)
    target_sha = get_current_commit(repo=repo, commit=git_ref)
    if head_sha == target_sha:
        # Already there, nothing to do
        return

    expect(is_repo_clean(repo=repo),
           "If we need to change HEAD, then the repo must be clean before running")
    expect(git_ref is not None, "Missing git-ref")

    checkout_cmd = "git checkout {}".format(git_ref)
    run_cmd_no_fail(checkout_cmd, from_dir=repo)
    update_submodules(repo=repo)
    new_sha = get_current_commit(repo=repo)

    # A dirty repo at this point means checkout/submodule update misbehaved
    expect(is_repo_clean(repo=repo),
           "Something went wrong when checking out git ref '{}'".format(git_ref))

    if not verbose:
        return
    print("Switched to '{}' ({})".format(git_ref, new_sha))
    print_last_commit(repo=repo, git_ref=git_ref)
def run_test(self, test):
###############################################################################
    """
    Run ctest for the build type 'test'; return True if ctest exited with 0.

    When quick-rerun is on and a CMakeCache.txt already exists in the test
    dir, the config step is skipped (and, if requested, only failed tests are
    rerun). Otherwise cmake's cache files are removed from the test dir first.
    """
    banner = "==============================================================================="
    git_head = get_current_head()
    print(banner)
    print("Testing '{}' for test '{}'".format(git_head, self._test_full_names[test]))
    print(banner)

    test_dir = self.get_test_dir(test)
    cmake_config = self.generate_cmake_config(self._tests_cmake_args[test], for_ctest=True)
    ctest_config = self.generate_ctest_config(cmake_config, [], test)

    cache_file = pathlib.Path("{}/CMakeCache.txt".format(test_dir))
    if self._quick_rerun and cache_file.is_file():
        # Keep the build dir as is and skip the config step.
        # Note: make will still rerun cmake if some cmake file has changed
        ctest_config += "-DSKIP_CONFIG_STEP=TRUE "
        if self._quick_rerun_failed:
            ctest_config += "--rerun-failed "
    else:
        # The same directory may have been used to build the baselines.
        # Building here is fine, but cmake's cache and internal files from
        # that build (CMakeCache.txt and the CMakeFiles folder) MUST go.
        run_cmd_no_fail("rm -rf CMake*", from_dir=test_dir, dry_run=self._dry_run)

    ctest_result = run_cmd(ctest_config, from_dir=test_dir,
                           arg_stdout=None, arg_stderr=None,
                           verbose=True, dry_run=self._dry_run)
    return ctest_result[0] == 0
def run_test(self, test_cmd):
###############################################################################
    """
    Run 'test_cmd' self._num_runs times, logging everything to a .perf.log file.

    If self._cd is set, the command is split into directory and executable,
    and the executable is run from that directory. Returns a tuple with the
    median of the times extracted from each run's output, and the thread
    count extracted from the last run's output.
    """
    test_path = None
    test_exe = test_cmd
    if self._cd:
        head, test_exe = os.path.split(test_cmd)
        # An empty directory part means "run from the current dir"
        test_path = head if head else None

    self.machine_specific_init(self._scaling_exp.threads)
    self.test_specific_init(test_exe, self._scaling_exp.threads)
    cmd = self.formulate_cmd(test_exe)

    # Log file is named after the bare executable (no dir, no arguments)
    exe_name = os.path.basename(test_exe).split(" ")[0]
    log_name = "{}.perf.log".format(exe_name)
    be_verbose = (not self._plot_friendly or self._verbose)

    results = []
    with open(log_name, "w", encoding="utf-8") as fd:
        fd.write(cmd + "\n\n")
        fd.write("ENV: \n{}\n\n".format(run_cmd_no_fail("env")))
        for _ in range(self._num_runs):
            output = run_cmd_no_fail(cmd, from_dir=test_path, verbose=be_verbose)
            fd.write(output + "\n\n")
            results.append(self.get_time(output))
            threads = self.get_threads(output)

    return median(results), threads
def generate_baselines(self, test):
###############################################################################
    """
    Configure, build and generate the baselines for the build type 'test'.

    Returns True on success, False if either the configure step or the
    build/baseline step exited with a non-zero status. On success, everything
    in the baseline dir except the 'data' subfolder is removed.
    """
    banner = "==============================================================================="
    test_dir = self.get_baseline_dir(test)

    cmake_config = self.generate_cmake_config(self._tests_cmake_args[test])
    cmake_config += " -DSCREAM_BASELINES_ONLY=ON"

    print(banner)
    print("Generating baseline for test {} with config '{}'".format(self._test_full_names[test], cmake_config))
    print(banner)

    # A failure here must not crash the script, or we would not get a
    # dashboard report. Instead, we just report the failure to the caller, so
    # that there is no baseline file to compare against.
    config_cmd = "{} {}".format(cmake_config, self._root_dir)
    stat, _, err = run_cmd(config_cmd, from_dir=test_dir, verbose=True, dry_run=self._dry_run)
    if stat != 0:
        print ("WARNING: Failed to configure baselines:\n{}".format(err))
        return False

    build_cmd = "make -j{} && make -j{} baseline".format(self._compile_res_count[test],
                                                         self._testing_res_count[test])
    if self._parallel:
        # Pin this build to its own range of cores
        first_core, last_core = self.get_taskset_id(test)
        build_cmd = "taskset -c {}-{} sh -c '{}'".format(first_core, last_core, build_cmd)

    stat, _, err = run_cmd(build_cmd, from_dir=test_dir, verbose=True, dry_run=self._dry_run)
    if stat != 0:
        print("WARNING: Failed to create baselines:\n{}".format(err))
        return False

    # Clean up the directory, by removing everything but the 'data' subfolder
    run_cmd_no_fail(r"find -maxdepth 1 -not -name data ! -path . -exec rm -rf {} \;",
                    from_dir=test_dir, verbose=True, dry_run=self._dry_run)
    return True
def baselines_are_expired (self, expected_baseline_sha):
###############################################################################
    """
    Return True if the baselines in self._baseline_dir must be regenerated.

    Baselines are expired if any of the following holds:
      1) the file listing the builds with a baseline does not exist
      2) one of the tests in self._tests is not listed in that file
      3) there is no file in baseline_dir containing the sha of the baselines
      4) the baselines sha does not match 'expected_baseline_sha'
    """
    # Sanity check
    expect(self._baseline_dir is not None, "Error! This routine should only be called when testing against pre-existing baselines.")

    # The file specifying what baselines were built during the last baselines
    # generation must be there
    if not self._baseline_names_file.exists():
        return True

    # It might happen that we generate baselines for all build types, then
    # later on we manually generate baselines for only one build type. The
    # other baselines will still be there, but may be expired. Therefore, we
    # check the baseline names file, to see what baselines were built last time.
    # The content is split into whitespace-separated names, so that a test is
    # matched as a whole word and not as a substring of another test's name.
    names_text = run_cmd_no_fail("cat {}".format(self._baseline_names_file.resolve()))
    valid_baselines = names_text.split()
    if any(test not in valid_baselines for test in self._tests):
        return True

    # No sha file => baselines expired
    if not self._baseline_sha_file.exists():
        return True

    # Different sha => baselines expired
    baseline_sha = run_cmd_no_fail("cat {}".format(self._baseline_sha_file))
    return expected_baseline_sha != baseline_sha
def run_test(self, test):
###############################################################################
    """
    Run ctest for the build type 'test'; return True if ctest exited with 0.

    Before running, cmake's cache files are removed from the test dir, since
    the same dir may have been used to build the baselines.
    """
    banner = "==============================================================================="
    git_head = get_current_head()
    print(banner)
    print("Testing '{}' for test '{}'".format(git_head, self._test_full_names[test]))
    print(banner)

    test_dir = self.get_test_dir(test)
    cmake_config = self.generate_cmake_config(self._tests_cmake_args[test], for_ctest=True)
    ctest_config = self.generate_ctest_config(cmake_config, [], test)

    # This directory might have been used also to build the model to generate
    # baselines. Although it's ok to build in the same dir, we MUST make sure
    # to erase cmake's cache and internal files from the previous build
    # (CMakeCache.txt and CMakeFiles folder).
    # The dry-run flag is forwarded here too: a dry run must not delete files.
    run_cmd_no_fail("rm -rf CMake*", from_dir=test_dir, dry_run=self._dry_run)

    stat = run_cmd(ctest_config, from_dir=test_dir,
                   arg_stdout=None, arg_stderr=None,
                   verbose=True, dry_run=self._dry_run)[0]
    return stat == 0
def merge_git_ref(git_ref, repo=None, verbose=False, dry_run=False):
###############################################################################
    """
    Merge given git ref into the current branch, and updates submodules.

    Nothing is done if the common ancestor with 'git_ref' is 'git_ref' itself.
    With dry_run=True, the merge command is only printed. Otherwise the repo
    is expected to be clean both before and after the merge.
    """
    # Even though it can allow some extra corner cases (dirty repo, but ahead
    # of git_ref), this check is mostly for debugging purposes, as it will
    # inform that no merge occurred.
    # NOTE(review): get_common_ancestor is still evaluated in the current
    # directory; if it accepts a 'repo' argument, it should receive it too.
    ancestor = get_common_ancestor(git_ref)
    if ancestor == get_current_commit(repo=repo, commit=git_ref):
        if verbose:
            print("Merge of '{}' not necessary. Current HEAD is already ahead.".format(git_ref))
        return

    merge_cmd = "git merge {0} -m 'Automatic merge of {0}'".format(git_ref)
    if dry_run:
        print("Would run: {}".format(merge_cmd))
        return

    expect(is_repo_clean(repo=repo),
           "Cannot merge ref '{}'. The repo is not clean.".format(git_ref))
    run_cmd_no_fail(merge_cmd, from_dir=repo)
    update_submodules(repo=repo)
    expect(is_repo_clean(repo=repo),
           "Something went wrong while performing the merge of '{}'".format(git_ref))

    if verbose:
        print("git ref {} successfully merged.".format(git_ref))
        # Report the last commit of the repo we merged into, not of the cwd
        print_last_commit(repo=repo)
def generate_all_baselines(self):
###############################################################################
    """
    Generate the baselines for all tests in self._tests at self._baseline_ref.

    Fresh baseline dirs are created (one per test), the baseline ref is
    checked out, and the baselines are generated through a process pool. On
    success, the sha used and the names of the builds are stored in the
    baseline sha/names files, and the original head is checked out again.
    Returns True if all baselines were generated. With fast-fail on, returns
    False as soon as a failure is detected.
    """
    banner = "###############################################################################"
    git_head_ref = get_current_head()

    print(banner)
    print("Generating baselines for ref {}".format(self._baseline_ref))
    print(banner)

    # First, create (from scratch) one build directory per test
    for test in self._tests:
        bld_dir = self.get_baseline_dir(test)
        if bld_dir.exists():
            shutil.rmtree(str(bld_dir))
        bld_dir.mkdir(parents=True)

    checkout_git_ref(self._baseline_ref, verbose=True)

    num_workers = 1
    if self._parallel:
        num_workers = len(self._tests)

    success = True
    with threading3.ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_test = {}
        for test in self._tests:
            future_to_test[executor.submit(self.generate_baselines, test)] = test

        for future in threading3.as_completed(future_to_test):
            test = future_to_test[future]
            success &= future.result()
            if self._fast_fail and not success:
                print('Generation of baselines for build {} failed'.format(self._test_full_names[test]))
                return False

    if success:
        # Store the sha used for baselines generation
        sha = get_current_commit(commit=self._baseline_ref)
        run_cmd_no_fail("echo '{}' > {}".format(sha, self._baseline_sha_file))

        # Store the name of the builds for which we created a baseline
        names = "".join(" {}".format(test) for test in self._tests)
        run_cmd_no_fail("echo '{}' > {}".format(names, self._baseline_names_file))

    checkout_git_ref(git_head_ref, verbose=True)
    return success
def build(self):
###############################################################################
    """
    Run cmake and make, storing commands and their output in build.perf.log.
    """
    cmake_cmd = "cmake {} ..".format(self._cmake_options)
    make_cmd = "make -j8 VERBOSE=1"

    # Each entry: (command, separator written after the command's output)
    steps = ((cmake_cmd, "\n\n"), (make_cmd, "\n"))
    with open("build.perf.log", "w") as fd:
        for step_cmd, trailer in steps:
            fd.write(step_cmd + "\n")
            fd.write(run_cmd_no_fail(step_cmd, combine_output=True) + trailer)
def import_variables_horiz_remap(self):
###########################################################################
    """
    Import self._ivars from self._ifile after remapping them with ncremap.

    ncremap (with map file self._mfile) writes to a temp file, from which
    the variables are then imported via import_variables_no_remap. The temp
    file is removed at the end, whether or not the steps above succeeded.
    """
    # Use a temp file, cause ncremap adds a lot of auxiliary fields
    temp_file = pathlib.Path("./pncf_tmp.nc").resolve().absolute()

    cmd = " ncremap -m {} -i {} -o {} -v {}".\
          format(self._mfile,self._ifile,temp_file,",".join(self._ivars))

    try:
        run_cmd_no_fail(cmd)

        # Import only desired vars from temp file to the actual nc file
        self.import_variables_no_remap(temp_file)
    finally:
        # Clean up temp file, also if something above failed, so that a stale
        # (possibly partial) file is never left behind
        if temp_file.exists():
            temp_file.unlink()
def build(self):
###############################################################################
    """
    Run cmake and make, storing commands and their output in build.perf.log.

    If self._verbose is set, the cmake command and current dir are printed.
    """
    cmake_cmd = "cmake {} ..".format(self._cmake_options)
    make_cmd = "make -j16 VERBOSE=1"

    if self._verbose:
        msg = "In dir {}, building with cmake command: {}\nOutput will be stored in build.perf.log"
        print(msg.format(os.getcwd(), cmake_cmd))

    # Each entry: (command, separator written after the command's output)
    steps = ((cmake_cmd, "\n\n"), (make_cmd, "\n"))
    with open("build.perf.log", "w", encoding="utf-8") as fd:
        for step_cmd, trailer in steps:
            fd.write(step_cmd + "\n")
            fd.write(run_cmd_no_fail(step_cmd, combine_output=True) + trailer)
def git_refs_difference(cmp_ref, head="HEAD", repo=None):
###############################################################################
    """
    Return how many commits 'head' is behind and ahead of 'cmp_ref'.

    The result is the pair (behind, ahead): 'behind' is the number of commits
    in cmp_ref that are not in head, 'ahead' is the number of commits in head
    that are not in cmp_ref.

    If SCREAM_FAKE_GIT_HEAD is in the environment, git is not queried:
    'behind' is 0, and 'ahead' is 0 if cmp_ref == head, otherwise it is the
    value of SCREAM_FAKE_AHEAD (which must then be set).
    """
    env = os.environ
    if "SCREAM_FAKE_GIT_HEAD" in env:
        expect("SCREAM_FAKE_AHEAD" in env,
               "git_refs_difference cannot be used with SCREAM_FAKE_GIT_HEAD and without SCREAM_FAKE_AHEAD")
        fake_ahead = 0 if cmp_ref == head else int(env["SCREAM_FAKE_AHEAD"])
        return 0, fake_ahead

    rev_list_cmd = "git rev-list --left-right --count {}...{}".format(cmp_ref, head)
    counts = run_cmd_no_fail(rev_list_cmd, from_dir=repo).split()
    expect(len(counts) == 2,
           "Error! Something went wrong when running {}".format(rev_list_cmd))

    behind = int(counts[0])
    ahead = int(counts[1])
    return behind, ahead
def cleanup_repo(orig_branch, orig_commit, repo=None, dry_run=False):
###############################################################################
    """
    Bring the repo back to 'orig_branch' (or 'orig_commit') in a clean state.

    Unless dry_run is set, all modifications (tracked or untracked, except
    the ctest-build dir) are discarded if the repo is not clean. If the
    commit at entry differs from 'orig_commit', a hard reset to 'orig_commit'
    is also performed, followed by a submodules update.
    """
    entry_commit = get_current_commit(repo=repo)
    for_real = not dry_run

    # Is this a pointless check? Maybe.
    if for_real and not is_repo_clean(repo=repo):
        # Discard any modifications to the repo (either tracked or untracked),
        # but keep the ctest-build directory
        run_cmd_no_fail("git clean -df --exclude=ctest-build", from_dir=repo)
        top_dir = get_git_toplevel_dir(repo=repo)
        run_cmd_no_fail("git checkout -- {}".format(top_dir), from_dir=repo)

    target_ref = orig_branch or orig_commit
    checkout_git_ref(target_ref, repo=repo, dry_run=dry_run)

    # This *can* happen. test_all_scream can merge origin/master into the
    # current branch. Checking out orig_branch doesn't do anything if we were
    # on a branch (not detached head mode), since the branch tip moved with
    # the master merge. In that case, what we really need is a hard reset to
    # the original commit.
    # NOTE: if you reset the branch, don't forget to re-update the modules!!
    if for_real and entry_commit != orig_commit:
        run_cmd_no_fail("git reset --hard {}".format(orig_commit), from_dir=repo)
        update_submodules(repo=repo)
def generate_all_baselines(self, git_baseline_head, git_head):
###############################################################################
    """
    Generate baselines for all tests in self._tests at 'git_baseline_head'.

    If the baseline ref is not "HEAD", it is checked out first (the repo must
    be clean), and 'git_head' is checked out again at the end. Returns True
    if all baselines were generated. With fast-fail on, returns False as soon
    as a failure is detected (without switching back to 'git_head').
    """
    print("Generating baselines for ref {}".format(git_baseline_head))

    must_switch = git_baseline_head != "HEAD"
    if must_switch:
        expect(is_repo_clean(),
               "If baseline commit is not HEAD, then the repo must be clean before running")
        run_cmd_no_fail("git checkout {}".format(git_baseline_head))
        print(" Switched to {} ({})".format(git_baseline_head, get_current_commit()))

    # Build dirs get cleaned up only if we moved away from HEAD
    cleanup = must_switch

    num_workers = 1
    if self._parallel:
        num_workers = len(self._tests)

    success = True
    with threading3.ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_test = {}
        for test in self._tests:
            future_to_test[executor.submit(self.generate_baselines, test, cleanup)] = test

        for future in threading3.as_completed(future_to_test):
            test = future_to_test[future]
            success &= future.result()
            if self._fast_fail and not success:
                print('Generation of baselines for build {} failed'.format(self._test_full_names[test]))
                return False

    if must_switch:
        run_cmd_no_fail("git checkout {}".format(git_head))
        print(" Switched back to {} ({})".format(git_head, get_current_commit()))

    return success
def run_test(self, exename):
###############################################################################
    """
    Run './exename' self._num_runs times, logging to <exename>.perf.log.

    The command line is made of the optional NUMA_PREFIX env var, the
    executable, and the scaling experiment values (threads excluded).
    Returns a tuple with the median of the times extracted from each run's
    output, and the thread count extracted from the last run's output.
    """
    self.machine_specific_init(self._scaling_exp.threads)
    self.test_specific_init(exename, self._scaling_exp.threads)

    numa_prefix = os.environ.get("NUMA_PREFIX")
    prefix = "" if numa_prefix is None else "{} ".format(numa_prefix)
    exe_args = " ".join(str(item) for item in self._scaling_exp.values(incl_threads=False))
    cmd = "{}./{} {}".format(prefix, exename, exe_args)

    be_verbose = not self._plot_friendly
    results = []
    with open("{}.perf.log".format(exename), "w") as fd:
        fd.write(cmd + "\n\n")
        fd.write("ENV: \n{}\n\n".format(run_cmd_no_fail("env")))
        for _ in range(self._num_runs):
            output = run_cmd_no_fail(cmd, verbose=be_verbose)
            fd.write(output + "\n\n")
            results.append(self.get_time(output))
            threads = self.get_threads(output)

    return median(results), threads
def print_last_commit(git_ref=None, repo=None, dry_run=False):
###############################################################################
    """
    Prints a one-liner of the last commit on 'git_ref'.

    If 'git_ref' is None, the current head of 'repo' is used. In dry-run
    mode, or when SCREAM_FAKE_GIT_HEAD is in the environment, git is not
    queried and only the ref name is printed.
    """
    if dry_run:
        print("Last commit on ref '{}'".format(git_ref))
        return

    if "SCREAM_FAKE_GIT_HEAD" in os.environ:
        print("Last commit on ref '{}'".format(os.environ["SCREAM_FAKE_GIT_HEAD"]))
        return

    if git_ref is None:
        git_ref = get_current_head(repo)

    log_cmd = "git log --oneline -1 {}".format(git_ref)
    last_commit = run_cmd_no_fail(log_cmd, from_dir=repo)
    print("Last commit on ref '{}': {}".format(git_ref, last_commit))
def get_mach_testing_resources(machine):
###############################################################################
    """
    Return the number of resources available for testing on 'machine'.

    The number of host cores is used to parallelize compilation, while the
    number of devices is used to parallelize testing. On CPU machines the two
    usually coincide; on GPU machines they differ (compile on CPU, run on
    GPU). One difference is that, on CPU machines, hyperthreading is allowed
    for compilation but not for testing, to minimize fragmentation of jobs
    across cores.

    On CUDA machines, this is the number of lines printed by 'nvidia-smi -L';
    otherwise, it is whatever get_available_cpu_count returns.
    """
    if not is_cuda_machine(machine):
        return get_available_cpu_count()

    gpu_count = run_cmd_no_fail("nvidia-smi -L | wc -l")
    return int(gpu_count)
def import_variables(self):
###########################################################################
    """
    Import the variables listed in self._ivars from self._ifile to self._ofile.

    Nothing is done if self._ivars is empty. The input file must exist and
    have 'ncol' and 'lev' dimensions, and the two files must have the same
    number of levels. If the number of columns differs, a horizontal remap is
    performed during the import. Finally, ncpdq is run on the output file to
    rearrange the dimensions.
    """
    if len(self._ivars) == 0:
        return

    expect (self._ifile.exists(),
            "Error! Import file '{}' does not exist.".format(self._ifile))

    # Read the sizes of both files. The try/finally blocks guarantee that
    # both handles get closed also if one of the checks below fails.
    ds_out = self.get_database(self._ofile,'a')
    try:
        ds_in = self.get_database(self._ifile,'r')
        try:
            expect ('ncol' in ds_in.dimensions,
                    "Error! 'ncol' not found in input file dimensions'")
            expect ('lev' in ds_in.dimensions,
                    "Error! 'lev' not found in input file dimensions'")

            ncol_out = ds_out.dimensions['ncol'].size
            nlev_out = ds_out.dimensions['lev'].size
            ncol_in = ds_in.dimensions['ncol'].size
            nlev_in = ds_in.dimensions['lev'].size
        finally:
            ds_in.close()
    finally:
        ds_out.close()

    expect (nlev_in==nlev_out,
            "Error! Vertical remapping unavailable, due to ncremap assumption that level idx strides slower than column idx.")

    if ncol_in==ncol_out:
        self.import_variables_no_remap(self._ifile)
    else:
        self.import_variables_horiz_remap()

    # To protect against the possibility that the input file stored vars with
    # a layout different from scream (e.g., T(time,lev,ncol) instead of
    # T(time,ncol,lev)), we run ncpdq to rearrange (if need be) the dimensions
    run_cmd_no_fail ("ncpdq -a ncol,lev -O {} {}".format(self._ofile,self._ofile))
    run_cmd_no_fail ("ncpdq -a ncol,ilev -O {} {}".format(self._ofile,self._ofile))
def generate_baselines(self, test, cleanup):
###############################################################################
    """
    Configure, build and generate the baselines for the build type 'test'.

    The work is done in ctest-build/<full test name>. Returns True on
    success, False if either the configure step or the build/baseline step
    exited with a non-zero status. If 'cleanup' is set, on success every
    entry of the build dir whose name does not match 'data' is removed.
    """
    name = self._test_full_names[test]
    test_dir = "ctest-build/{}".format(name)
    cmake_config = self.generate_cmake_config(self._tests_cmake_args[test])

    print("Generating baseline for build type {} with config '{}'".format(name, cmake_config))

    # A failure here must not crash the script, or we would not get a
    # dashboard report. Instead, we just report the failure to the caller, so
    # that there is no baseline file to compare against.
    config_cmd = "{} {}".format(cmake_config, self._src_dir)
    stat, _, err = run_cmd(config_cmd, from_dir=test_dir, verbose=True)
    if stat != 0:
        print("WARNING: Failed to configure baselines:\n{}".format(err))
        return False

    build_cmd = "make -j{} && make baseline".format(self._proc_count)
    if self._parallel:
        # Pin this build to its own range of cores
        first_core, last_core = self.get_taskset_id(test)
        build_cmd = "taskset -c {}-{} sh -c '{}'".format(first_core, last_core, build_cmd)

    stat, _, err = run_cmd(build_cmd, from_dir=test_dir, verbose=True)
    if stat != 0:
        print("WARNING: Failed to create baselines:\n{}".format(err))
        return False

    if cleanup:
        run_cmd_no_fail("ls | grep -v data | xargs rm -rf ", from_dir=test_dir)

    return True
def setup_mach_env(machine):
###############################################################################
    """
    Update os.environ with the environment resulting from the env setup
    commands of 'machine'.

    The machine must be supported. If the machine has no env setup commands,
    nothing is done. Otherwise, the commands are run in a subprocess, the
    resulting environment is printed with 'env', and each NAME=VALUE line of
    that output is stored in os.environ.
    """
    expect(is_machine_supported(machine),
           "Error! Machine {} is not currently supported by scream testing system.".format(machine))

    env_setup = get_mach_env_setup_command(machine)

    # Do something only if this machine has env specs
    if not env_setup:
        return

    # Running the env command only modifies the env in the subprocess.
    # But we can get the whole env string after running the env_setup
    # command, and update the current env with that.
    curr_env = run_cmd_no_fail("{{ {}; }} > /dev/null && env | sort".format(";".join(env_setup)))

    # Split by line. We are assuming that each env variable is *exactly* on one line
    for item in curr_env.split("\n"):
        # On fedora systems, the environment contains the annoying entry (on 2 lines)
        #
        # BASH_FUNC_module()=() {  eval `/usr/bin/modulecmd bash $*`
        # }
        # which breaks the assumption that each env var is on one line.
        # On some systems, this variable seems to have a different name,
        # and there can potentially be other BASH_FUNC_blah variables.
        # To get around this, discard lines that either do not contain '=',
        # or that contain BASH_FUNC_.
        if "BASH_FUNC_" in item or "=" not in item:
            continue

        # maxsplit=1: split ONLY at the first '=', so that a value containing
        # '=' is kept whole (maxsplit=2 would cut it at its first '=')
        var_name, var_value = item.split("=", 1)
        os.environ[var_name] = var_value
def update_submodules(repo=None):
###############################################################################
    """
    Recursively init and update the git submodules, running git from 'repo'.
    """
    submodule_cmd = "git submodule update --init --recursive"
    run_cmd_no_fail(submodule_cmd, from_dir=repo)
def __init__(self, cxx_compiler=None, f90_compiler=None, c_compiler=None,
             submit=False, parallel=False, fast_fail=False,
             baseline_ref=None, baseline_dir=None, machine=None, no_tests=False, keep_tree=False,
             custom_cmake_opts=(), custom_env_vars=(), preserve_env=False, tests=(),
             integration_test="JENKINS_HOME" in os.environ, local=False, root_dir=None, work_dir=None,
             quick_rerun=False,quick_rerun_failed=False,dry_run=False,
             make_parallel_level=0, ctest_parallel_level=0):
###########################################################################
    """
    Store the testing options, validate them, and deduce what was not given.

    In order: stores all arguments; picks the machine (from the argument,
    from CIME_MACHINE, or "local" if 'local' is set); deduces the ctest
    parallel level (argument, then CTEST_PARALLEL_LEVEL, then machine
    default); sets up the machine env unless 'preserve_env' is set; computes
    root and work dirs; selects the tests to run; changes dir to the root
    dir; deduces compilers; computes baseline ref/dir and whether baselines
    must be generated; computes the per-test compile/testing job counts.

    Unless 'keep_tree' is set (or 'dry_run' is set), the repo must be clean.
    """
    self._cxx_compiler = cxx_compiler
    self._f90_compiler = f90_compiler
    self._c_compiler = c_compiler
    self._submit = submit
    self._parallel = parallel
    self._fast_fail = fast_fail
    self._baseline_ref = baseline_ref
    self._machine = machine
    self._local = local
    self._perform_tests = not no_tests
    self._keep_tree = keep_tree
    self._baseline_dir = baseline_dir
    self._custom_cmake_opts = custom_cmake_opts
    self._custom_env_vars = custom_env_vars
    self._preserve_env = preserve_env
    self._tests = tests
    self._root_dir = root_dir
    self._work_dir = work_dir
    self._integration_test = integration_test
    self._quick_rerun = quick_rerun
    self._quick_rerun_failed = quick_rerun_failed
    self._dry_run = dry_run
    self._must_generate_baselines = False

    # Rerunning only failed tests implies a quick rerun
    if self._quick_rerun_failed:
        self._quick_rerun = True

    ############################################
    #  Sanity checks and helper structs setup  #
    ############################################

    # Probe machine if none was specified
    if self._machine is None:
        # We could potentially integrate more with CIME here to do actual
        # nodename probing.
        if "CIME_MACHINE" in os.environ and is_machine_supported(os.environ["CIME_MACHINE"]):
            self._machine = os.environ["CIME_MACHINE"]
        else:
            expect(self._local,
                   "test-all-scream requires either the machine arg (-m $machine) or the -l flag,"
                   "which makes it lookf for machine specs in '~/.cime/scream_mach_specs.py'.")
            self._machine = "local"
    else:
        expect (not self._local, "Specifying a machine while passing '-l,--local' is ambiguous.")

    ##################################################
    #   Deduce how many testing resources per test   #
    ##################################################

    if ctest_parallel_level > 0:
        ctest_max_jobs = ctest_parallel_level
        print("Note: honoring requested value for ctest parallel level: {}".format(ctest_max_jobs))
    elif "CTEST_PARALLEL_LEVEL" in os.environ:
        ctest_max_jobs = int(os.environ["CTEST_PARALLEL_LEVEL"])
        print("Note: honoring environment value for ctest parallel level: {}".format(ctest_max_jobs))
    else:
        ctest_max_jobs = get_mach_testing_resources(self._machine)
        print("Note: no value passed for --ctest-parallel-level. Using the default for this machine: {}".format(ctest_max_jobs))

    # Unless the user claims to know what he/she is doing, we setup the env.
    if not self._preserve_env:
        # Setup the env on this machine
        setup_mach_env(self._machine, ctest_j=ctest_max_jobs)

    # Compute root dir
    if not self._root_dir:
        self._root_dir = pathlib.Path(__file__).resolve().parent.parent
    else:
        self._root_dir = pathlib.Path(self._root_dir).resolve()
        # 'parts' is a tuple attribute (not a method). For a path ending in
        # components/scream, its last two entries are ('components','scream').
        expect(self._root_dir.is_dir() and self._root_dir.parts[-2:] == ('components', 'scream'),
               "Bad root-dir '{}', should be: $scream_repo/components/scream".format(self._root_dir))

    if self._work_dir is not None:
        expect(pathlib.Path(self._work_dir).absolute().is_dir(),
               "Error! Work directory '{}' does not exist.".format(self._work_dir))
    else:
        self._work_dir = self._root_dir.absolute().joinpath("ctest-build")

    # Compare the two as absolute paths: either one may be a str or a Path
    # at this point, and a str never compares equal to a Path.
    expect (not self._baseline_dir or
            pathlib.Path(self._work_dir).absolute() != pathlib.Path(self._baseline_dir).absolute(),
            "Error! For your safety, do NOT use '{}' to store baselines. Move them to a different directory (even a subdirectory of that works).".format(self._work_dir))

    expect(not (self._baseline_ref and self._baseline_dir),
           "Makes no sense to specify a baseline generation commit if using pre-existing baselines ")

    self._tests_cmake_args = {
        "dbg" : [("CMAKE_BUILD_TYPE", "Debug"),
                 ("EKAT_DEFAULT_BFB", "True")],
        "sp"  : [("CMAKE_BUILD_TYPE", "Debug"),
                 ("SCREAM_DOUBLE_PRECISION", "False"),
                 ("EKAT_DEFAULT_BFB", "True")],
        "fpe" : [("CMAKE_BUILD_TYPE", "Debug"),
                 ("SCREAM_PACK_SIZE", "1"),
                 ("SCREAM_SMALL_PACK_SIZE", "1"),
                 ("EKAT_DEFAULT_BFB", "True")],
        "opt" : [("CMAKE_BUILD_TYPE", "Release")],
        "valg" : [("CMAKE_BUILD_TYPE", "Debug"),
                  ("EKAT_ENABLE_VALGRIND", "True")],
    }

    self._test_full_names = OrderedDict([
        ("dbg" , "full_debug"),
        ("sp"  , "full_sp_debug"),
        ("fpe" , "debug_nopack_fpe"),
        ("opt" , "release"),
        ("valg" , "valgrind"),
    ])

    if not self._tests:
        # default to all test types except do not do fpe on CUDA
        self._tests = list(self._test_full_names.keys())
        self._tests.remove("valg") # don't want this on by default
        if is_cuda_machine(self._machine):
            self._tests.remove("fpe")
    else:
        for t in self._tests:
            expect(t in self._test_full_names,
                   "Requested test '{}' is not supported by test-all-scream, please choose from: {}".\
                   format(t, ", ".join(self._test_full_names.keys())))

    os.chdir(str(self._root_dir)) # needed, or else every git command will need repo=root_dir
    expect(get_current_commit(), "Root dir: {}, does not appear to be a git repo".format(self._root_dir))

    self._original_branch = get_current_branch()
    self._original_commit = get_current_commit()

    print_last_commit(git_ref=self._original_branch, dry_run=self._dry_run)

    ############################################
    #    Deduce compilers if needed/possible   #
    ############################################

    if self._cxx_compiler is None:
        self._cxx_compiler = get_mach_cxx_compiler(self._machine)
    if self._f90_compiler is None:
        self._f90_compiler = get_mach_f90_compiler(self._machine)
    if self._c_compiler is None:
        self._c_compiler = get_mach_c_compiler(self._machine)

    if not self._dry_run:
        # Replace each compiler with whatever 'which' prints for it
        self._f90_compiler = run_cmd_no_fail("which {}".format(self._f90_compiler))
        self._cxx_compiler = run_cmd_no_fail("which {}".format(self._cxx_compiler))
        self._c_compiler = run_cmd_no_fail("which {}".format(self._c_compiler))

    ###################################
    #      Compute baseline info      #
    ###################################

    default_baselines_root_dir = pathlib.Path(self._work_dir,"baselines")
    if self._baseline_dir is None:
        if self._baseline_ref is None:
            # Compute baseline ref
            if self._keep_tree:
                self._baseline_ref = "HEAD"
            elif self._integration_test:
                # Make sure our copy of origin/master is up-to-date (at least at the time of this script's execution)
                git_fetch_remote("origin")
                self._baseline_ref = "origin/master"
                merge_git_ref(git_ref="origin/master", verbose=True, dry_run=self._dry_run)
            else:
                self._baseline_ref = get_common_ancestor("origin/master")
                # Prefer a symbolic ref if possible
                if self._baseline_ref is None or self._baseline_ref == get_current_commit(commit="origin/master"):
                    self._baseline_ref = "origin/master"

        # No pre-existing baselines were provided, so we must create them
        self._must_generate_baselines = True

        self._baseline_dir = pathlib.Path(default_baselines_root_dir).absolute()
    else:
        # We treat the "AUTO" string as a request for automatic baseline dir.
        if self._baseline_dir == "AUTO":
            self._baseline_dir = get_mach_baseline_root_dir(self._machine)

        self._baseline_dir = pathlib.Path(self._baseline_dir).absolute()

        # Make sure the baseline root directory exists
        expect(self._baseline_dir.is_dir(), "Baseline_dir {} is not a dir".format(self._baseline_dir))

        if self._integration_test:
            self._baseline_ref = "origin/master"
            merge_git_ref(git_ref=self._baseline_ref, verbose=True, dry_run=self._dry_run)
        else:
            for test in self._tests:
                test_baseline_dir = self.get_preexisting_baseline(test)
                expect(test_baseline_dir.is_dir(), "Missing baseline {}".format(test_baseline_dir))

    # Name of the file used to store/check the git sha of the repo used to generate baselines,
    # and name of the file used to store/check the builds for which baselines are available
    # Store it once to avoid typos-like bugs
    self._baseline_sha_file = pathlib.Path(self._baseline_dir, "baseline_git_sha")
    self._baseline_names_file = pathlib.Path(self._baseline_dir, "baseline_names")

    if self._integration_test:
        master_sha = get_current_commit(commit=self._baseline_ref)
        if not self.baselines_are_present():
            print ("Some baselines were not found. Rebuilding them.")
            self._must_generate_baselines = True
        elif self.baselines_are_expired(expected_baseline_sha=master_sha):
            print ("Baselines expired. Rebuilding them.")
            self._must_generate_baselines = True
        else:
            print ("Baselines found and not expired. Skipping baselines generation.")

    if self._must_generate_baselines:
        print("Using commit {} to generate baselines".format(self._baseline_ref))

    # By default, every test can use all the testing resources
    self._testing_res_count = {name : ctest_max_jobs for name in self._test_full_names}

    # Deduce how many compilation resources per test
    if make_parallel_level > 0:
        make_max_jobs = make_parallel_level
        print("Note: honoring requested value for make parallel level: {}".format(make_max_jobs))
    else:
        make_max_jobs = get_mach_compilation_resources(self._machine)
        print("Note: no value passed for --make-parallel-level. Using the default for this machine: {}".format(make_max_jobs))

    # By default, every test can use all the compilation resources
    self._compile_res_count = {name : make_max_jobs for name in self._test_full_names}

    if self._parallel:
        # We need to be aware that other builds may be running too.
        # (Do not oversubscribe the machine)
        make_count, make_remainder = divmod(make_max_jobs, len(self._tests))
        ctest_count, ctest_remainder = divmod(ctest_max_jobs, len(self._tests))

        # In case we have more items in self._tests than cores/gpus (unlikely)
        make_count = max(make_count, 1)
        ctest_count = max(ctest_count, 1)

        # The first 'remainder' tests get one extra job each
        for idx, test in enumerate(self._tests):
            self._compile_res_count[test] = make_count + 1 if idx < make_remainder else make_count
            self._testing_res_count[test] = ctest_count + 1 if idx < ctest_remainder else ctest_count
            print("test {} can use {} jobs to compile, and {} jobs for testing".format(test,self._compile_res_count[test],self._testing_res_count[test]))

    if self._keep_tree:
        expect(not self._integration_test, "Should not be doing keep-tree with integration testing")
        print("WARNING! You have uncommitted changes in your repo.",
              " The PASS/FAIL status may depend on these changes",
              " so if you want to keep them, don't forget to create a commit.",sep="\n")
        # NOTE(review): self._baseline_dir was set to a Path above in every
        # branch, so it can no longer be None here and this check never
        # triggers. Confirm whether it should test the 'baseline_dir' argument.
        if self._baseline_dir is None:
            # Make sure the baseline ref is HEAD
            expect(self._baseline_ref == "HEAD",
                   "The option --keep-tree is only available when testing against pre-built baselines "
                   "(--baseline-dir) or HEAD (-b HEAD)")
    else:
        expect(self._dry_run or is_repo_clean(),
               "Repo must be clean before running. If testing against HEAD or pre-built baselines, "
               "you can pass `--keep-tree` to allow non-clean repo.")
def __init__(self, cxx_compiler=None, f90_compiler=None, c_compiler=None,
             submit=False, parallel=False, fast_fail=False,
             baseline_ref=None, baseline_dir=None, machine=None,
             no_tests=False, config_only=False, keep_tree=False,
             custom_cmake_opts=(), custom_env_vars=(), preserve_env=False,
             tests=(), integration_test=False, local=False, root_dir=None,
             work_dir=None, quick_rerun=False, quick_rerun_failed=False,
             dry_run=False, make_parallel_level=0, ctest_parallel_level=0,
             update_expired_baselines=False, extra_verbose=False,
             limit_test_regex=None):
    ###########################################################################
    """
    Store the user options, validate them, and set up everything the test
    driver needs: machine, test list, root/work directories, compilation
    and testing resources, environment, baseline location, and compilers.

    Side effects visible in this method: changes the current working
    directory to the root dir, may create the default work directory,
    creates the per-test baseline directories, may merge 'origin/master'
    into the current checkout (integration tests), and may set up the
    machine environment.

    Arguments:
      cxx_compiler, f90_compiler, c_compiler:
          Compilers to use. When None, the machine default is queried.
          Unless dry_run is set, each one is resolved through `which`.
      submit, fast_fail, custom_cmake_opts, custom_env_vars,
      extra_verbose, limit_test_regex:
          Stored as-is on the instance; not otherwise used here.
      parallel:
          When true, the available make/ctest jobs are divided evenly
          among the requested test types.
      baseline_ref:
          Git ref to build baselines from (may be None).
      baseline_dir:
          Directory with pre-built baselines, or the string "AUTO" to
          request the machine's default baseline directory. When both
          baseline_ref and baseline_dir are None, "AUTO" is used.
      machine:
          Machine name. The value "local" is equivalent to passing
          local=True and machine=None. When None, the SCREAM_MACHINE
          environment variable is used if it names a supported machine.
      no_tests:
          Stored inverted as self._perform_tests.
      config_only:
          Cannot be combined with quick_rerun.
      keep_tree:
          Allow a non-clean repo. Only accepted when the baseline ref is
          HEAD (or unset, if a baseline dir is given), and never together
          with integration_test.
      preserve_env:
          When true, the machine environment setup is skipped.
      tests:
          Short names of the test types to run. When empty, defaults to
          all known types except "valg", "cov", "cmc" (and "fpe" on CUDA
          machines).
      integration_test:
          Forces baseline_ref to "origin/master", merges it, and forces
          update_expired_baselines to True.
      local:
          Use local machine specs; ambiguous (rejected) if a machine is
          also given.
      root_dir:
          Path to $scream_repo/components/scream. When not given, it is
          derived from the location of this file.
      work_dir:
          Existing directory where builds/tests happen. When None,
          "<root_dir>/ctest-build" is used and created if missing.
      quick_rerun, quick_rerun_failed:
          Stored; quick_rerun_failed implies quick_rerun.
      dry_run:
          Stored; also skips the clean-repo check and compiler `which`
          resolution, and is forwarded to git helpers.
      make_parallel_level, ctest_parallel_level:
          When > 0, override the machine defaults (must not exceed them).
          For ctest, the CTEST_PARALLEL_LEVEL environment variable is
          used as a fallback when no positive value is passed.
      update_expired_baselines:
          When true, expired baselines are checked for after the
          presence check.

    Raises:
      Whatever `expect` raises when one of the sanity checks fails.
    """
    # When using scripts-tests, we can't pass "-l" to test-all-scream,
    # but we can pass "-m local". So if machine="local", reset things
    # as if local=True and machine=None
    if machine == "local":
        local = True
        machine = None

    self._cxx_compiler = cxx_compiler
    self._f90_compiler = f90_compiler
    self._c_compiler = c_compiler
    self._submit = submit
    self._parallel = parallel
    self._fast_fail = fast_fail
    self._baseline_ref = baseline_ref
    self._machine = machine
    self._local = local
    self._perform_tests = not no_tests
    self._config_only = config_only
    self._keep_tree = keep_tree
    self._baseline_dir = baseline_dir
    self._custom_cmake_opts = custom_cmake_opts
    self._custom_env_vars = custom_env_vars
    self._preserve_env = preserve_env
    self._tests = tests
    self._root_dir = root_dir
    self._work_dir = None if work_dir is None else Path(work_dir)
    self._integration_test = integration_test
    self._quick_rerun = quick_rerun
    self._quick_rerun_failed = quick_rerun_failed
    self._dry_run = dry_run
    self._tests_needing_baselines = []
    self._update_expired_baselines = update_expired_baselines
    self._extra_verbose = extra_verbose
    self._limit_test_regex = limit_test_regex

    # Short test name -> full test name
    self._test_full_names = OrderedDict([
        ("dbg", "full_debug"),
        ("sp", "full_sp_debug"),
        ("fpe", "debug_nopack_fpe"),
        ("opt", "release"),
        ("valg", "valgrind"),
        ("cmc", "cuda_mem_check"),
        ("cov", "coverage"),
    ])

    # Not all builds are meant to perform comparisons against pre-built baselines
    self._test_uses_baselines = OrderedDict([
        ("dbg", True),
        ("sp", True),
        ("fpe", False),
        ("opt", True),
        ("valg", False),
        ("cmc", False),
        ("cov", False),
    ])

    # Short test name -> list of (cmake variable, value) pairs
    self._tests_cmake_args = {
        "dbg": [("CMAKE_BUILD_TYPE", "Debug"),
                ("EKAT_DEFAULT_BFB", "True")],
        "sp": [("CMAKE_BUILD_TYPE", "Debug"),
               ("SCREAM_DOUBLE_PRECISION", "False"),
               ("EKAT_DEFAULT_BFB", "True")],
        "fpe": [("CMAKE_BUILD_TYPE", "Debug"),
                ("SCREAM_PACK_SIZE", "1"),
                ("SCREAM_SMALL_PACK_SIZE", "1"),
                ("SCREAM_ENABLE_BASELINE_TESTS", "False"),
                ("EKAT_DEFAULT_BFB", "True")],
        "opt": [("CMAKE_BUILD_TYPE", "Release")],
        "valg": [("CMAKE_BUILD_TYPE", "Debug"),
                 ("SCREAM_TEST_PROFILE", "SHORT"),
                 ("SCREAM_ENABLE_BASELINE_TESTS", "False"),
                 ("EKAT_ENABLE_VALGRIND", "True")],
        "cmc": [("CMAKE_BUILD_TYPE", "Debug"),
                ("SCREAM_TEST_PROFILE", "SHORT"),
                ("SCREAM_ENABLE_BASELINE_TESTS", "False"),
                ("EKAT_ENABLE_CUDA_MEMCHECK", "True")],
        "cov": [("CMAKE_BUILD_TYPE", "Debug"),
                ("SCREAM_ENABLE_BASELINE_TESTS", "False"),
                ("EKAT_ENABLE_COVERAGE", "True")],
    }

    # Rerunning only the failed tests is a special case of a quick rerun
    if self._quick_rerun_failed:
        self._quick_rerun = True

    ############################################
    #  Sanity checks and helper structs setup  #
    ############################################

    # Quick rerun skips config phase, and config-only runs only config. You can't ask for both...
    expect(not (self._quick_rerun and self._config_only),
           "Makes no sense to ask for --quick-rerun and --config-only at the same time")

    # Probe machine if none was specified
    if self._machine is None:
        # We could potentially integrate more with CIME here to do actual
        # nodename probing.
        if "SCREAM_MACHINE" in os.environ and is_machine_supported(os.environ["SCREAM_MACHINE"]):
            self._machine = os.environ["SCREAM_MACHINE"]
        else:
            expect(self._local,
                   "test-all-scream requires either the machine arg (-m $machine) or the -l flag,"
                   "which makes it look for machine specs in '~/.cime/scream_mach_specs.py'.")
            self._machine = "local"
    else:
        expect(not self._local,
               "Specifying a machine while passing '-l,--local' is ambiguous.")

    if not self._tests:
        # default to all test types except do not do fpe on CUDA
        self._tests = list(self._test_full_names.keys())
        self._tests.remove("valg")  # don't want this on by default
        self._tests.remove("cov")   # don't want this on by default
        self._tests.remove("cmc")   # don't want this on by default
        if is_cuda_machine(self._machine):
            self._tests.remove("fpe")
    else:
        for t in self._tests:
            expect(t in self._test_full_names,
                   "Requested test '{}' is not supported by test-all-scream, please choose from: {}"
                   .format(t, ", ".join(self._test_full_names.keys())))

    # Compute root dir (where repo is) and work dir (where build/test will happen)
    if not self._root_dir:
        self._root_dir = Path(__file__).resolve().parent.parent
    else:
        self._root_dir = Path(self._root_dir).resolve()
        # NOTE: Path.parts is a tuple property (not a method), and the last
        # two components of ".../components/scream" are, in order,
        # ('components', 'scream').
        expect(self._root_dir.is_dir() and self._root_dir.parts[-2:] == ('components', 'scream'),
               "Bad root-dir '{}', should be: $scream_repo/components/scream".format(self._root_dir))

    if self._work_dir is not None:
        self._work_dir = Path(self._work_dir).absolute()
        expect(self._work_dir.is_dir(),
               "Error! Work directory '{}' does not exist.".format(self._work_dir))
    else:
        self._work_dir = self._root_dir.absolute().joinpath("ctest-build")
        self._work_dir.mkdir(exist_ok=True)

    os.chdir(str(self._root_dir))  # needed, or else every git command will need repo=root_dir
    expect(get_current_commit(),
           "Root dir: {}, does not appear to be a git repo".format(self._root_dir))

    # Print some info on the branch
    self._original_branch = get_current_branch()
    self._original_commit = get_current_commit()

    print_last_commit(git_ref=self._original_branch, dry_run=self._dry_run)

    ###################################
    #  Compilation/testing resources  #
    ###################################

    # Deduce how many compilation resources per test
    make_max_jobs = get_mach_compilation_resources()
    if make_parallel_level > 0:
        expect(make_parallel_level <= make_max_jobs,
               "Requested make_parallel_level {} is more than max available {}"
               .format(make_parallel_level, make_max_jobs))
        make_max_jobs = make_parallel_level
        print("Note: honoring requested value for make parallel level: {}".format(make_max_jobs))
    else:
        print("Note: no value passed for --make-parallel-level. Using the default for this machine: {}"
              .format(make_max_jobs))

    # Deduce how many testing resources per test. Priority: explicit
    # argument, then CTEST_PARALLEL_LEVEL env var, then machine default.
    ctest_max_jobs = get_mach_testing_resources(self._machine)
    if ctest_parallel_level > 0:
        expect(ctest_parallel_level <= ctest_max_jobs,
               "Requested ctest_parallel_level {} is more than max available {}"
               .format(ctest_parallel_level, ctest_max_jobs))
        ctest_max_jobs = ctest_parallel_level
        print("Note: honoring requested value for ctest parallel level: {}".format(ctest_max_jobs))
    elif "CTEST_PARALLEL_LEVEL" in os.environ:
        env_val = int(os.environ["CTEST_PARALLEL_LEVEL"])
        expect(env_val <= ctest_max_jobs,
               "CTEST_PARALLEL_LEVEL env {} is more than max available {}".format(env_val, ctest_max_jobs))
        ctest_max_jobs = env_val
        print("Note: honoring environment value for ctest parallel level: {}".format(ctest_max_jobs))
    else:
        print("Note: no value passed for --ctest-parallel-level. Using the default for this machine: {}"
              .format(ctest_max_jobs))

    self._ctest_max_jobs = ctest_max_jobs

    # By default (serial test types), every test can use all the resources
    self._testing_res_count = dict(zip(self._tests, [ctest_max_jobs] * len(self._tests)))
    self._compile_res_count = dict(zip(self._tests, [make_max_jobs] * len(self._tests)))

    if self._parallel:
        # We need to be aware that other builds may be running too.
        # (Do not oversubscribe the machine)
        log_per_phys = logical_cores_per_physical_core()

        # Avoid splitting physical cores across test types
        make_jobs_per_test = ((make_max_jobs // len(self._tests)) // log_per_phys) * log_per_phys
        if is_cuda_machine(self._machine):
            ctest_jobs_per_test = ctest_max_jobs // len(self._tests)
        else:
            ctest_jobs_per_test = ((ctest_max_jobs // len(self._tests)) // log_per_phys) * log_per_phys

        # The current system of selecting cores explicitly with taskset will not work
        # if we try to oversubscribe. We would need to implement some kind of wrap-around
        # mechanism
        if make_jobs_per_test == 0 or ctest_jobs_per_test == 0:
            expect(False,
                   "test-all-scream does not currently support oversubscription. "
                   "Either run fewer test types or turn off parallel testing")

        self._testing_res_count = dict(zip(self._tests, [ctest_jobs_per_test] * len(self._tests)))
        self._compile_res_count = dict(zip(self._tests, [make_jobs_per_test] * len(self._tests)))

        for test in self._tests:
            print("Test {} can use {} jobs to compile, and {} jobs for test"
                  .format(test, self._compile_res_count[test], self._testing_res_count[test]))

    # Unless the user claims to know what he/she is doing, we setup the env.
    # Need to happen before compiler probing
    if not self._preserve_env:
        # Setup the env on this machine
        setup_mach_env(self._machine, ctest_j=ctest_max_jobs)

    ###################################
    #      Compute baseline info      #
    ###################################

    expect(not self._baseline_dir or self._work_dir != self._baseline_dir,
           "Error! For your safety, do NOT use '{}' to store baselines. Move them to a different directory (even a subdirectory if that works)."
           .format(self._work_dir))

    # If no baseline ref/dir was provided, use default master baseline dir for this machine
    # NOTE: if user specifies baseline ref, baseline dir will be set later to a path within work dir
    if self._baseline_dir is None and self._baseline_ref is None:
        self._baseline_dir = "AUTO"
        print("No '--baseline-dir XYZ' nor '-b XYZ' provided. Testing against default baselines dir for this machine.")

    # If -k was used, make sure it's allowed
    if self._keep_tree:
        expect(not self._integration_test,
               "Should not be doing keep-tree with integration testing")
        print("WARNING! You have uncommitted changes in your repo.",
              " The PASS/FAIL status may depend on these changes",
              " so if you want to keep them, don't forget to create a commit.",
              sep="\n")
        if self._baseline_dir is None:
            # Make sure the baseline ref is HEAD
            expect(self._baseline_ref == "HEAD",
                   "The option --keep-tree is only available when testing against pre-built baselines "
                   "(--baseline-dir) or HEAD (-b HEAD)")
        else:
            # Make sure the baseline ref is unset (or HEAD)
            expect(self._baseline_ref is None or self._baseline_ref == "HEAD",
                   "The option --keep-tree is only available when testing against pre-built baselines "
                   "(--baseline-dir) or HEAD (-b HEAD)")
    else:
        expect(self._dry_run or is_repo_clean(),
               "Repo must be clean before running. If testing against HEAD or pre-built baselines, "
               "you can pass `--keep-tree` to allow non-clean repo.")

    # For integration test, enforce baseline_ref==origin/master, and proceed to merge origin/master
    if self._integration_test:
        expect(self._baseline_ref is None or self._baseline_ref == "origin/master",
               "Error! Integration tests cannot be done against an arbitrary baseline ref.")

        # Set baseline ref and merge it
        self._baseline_ref = "origin/master"
        merge_git_ref(git_ref=self._baseline_ref, verbose=True, dry_run=self._dry_run)

        # Always update expired baselines if this is an integration test
        self._update_expired_baselines = True

    # By now, we should have at least one between baseline_dir and baseline_ref set (possibly both)
    default_baselines_root_dir = self._work_dir / "baselines"
    if self._baseline_dir is None:
        # Use default baseline dir, and create it if necessary
        self._baseline_dir = Path(default_baselines_root_dir).absolute()
        self.create_tests_dirs(self._baseline_dir, True)  # Wipe out previous baselines
    else:
        if self._baseline_dir == "AUTO":
            expect(self._baseline_ref is None or self._baseline_ref == 'origin/master',
                   "Do not specify `-b XYZ` when using `--baseline-dir AUTO`. The AUTO baseline dir should be used for the master baselines only.\n"
                   " `-b XYZ` needs to probably build baselines for ref XYZ. However, no baselines will be built if the dir already contains baselines.\n")
            # We treat the "AUTO" string as a request for automatic baseline dir.
            auto_dir = get_mach_baseline_root_dir(self._machine)
            self._baseline_dir = Path(auto_dir) if auto_dir else default_baselines_root_dir
            if "SCREAM_FAKE_AUTO" in os.environ:
                self._baseline_dir = self._baseline_dir / "fake"
        else:
            self._baseline_dir = Path(self._baseline_dir).absolute()

        # Make sure the baseline folders exist (but do not purge content if they exist)
        self.create_tests_dirs(self._baseline_dir, False)

    print("Checking baselines directory: {}".format(self._baseline_dir))
    self.baselines_are_present()
    if self._update_expired_baselines:
        self.baselines_are_expired()

    ############################################
    #    Deduce compilers if needed/possible   #
    ############################################

    if self._cxx_compiler is None:
        self._cxx_compiler = get_mach_cxx_compiler(self._machine)
    if self._f90_compiler is None:
        self._f90_compiler = get_mach_f90_compiler(self._machine)
    if self._c_compiler is None:
        self._c_compiler = get_mach_c_compiler(self._machine)

    # Replace each compiler name with the output of `which <name>`
    if not self._dry_run:
        self._f90_compiler = run_cmd_no_fail("which {}".format(self._f90_compiler))
        self._cxx_compiler = run_cmd_no_fail("which {}".format(self._cxx_compiler))
        self._c_compiler = run_cmd_no_fail("which {}".format(self._c_compiler))
def generate_baselines(self, test, commit):
    ###############################################################################
    """
    Configure and build the baselines for test type 'test' inside its
    baseline directory, then remove everything from that directory except
    the 'data' subfolder.

    A failed configure or build is reported with a warning and a False
    return value rather than an exception. On success, 'commit' is recorded
    through set_baseline_file_sha. Returns True on success, False otherwise.
    """
    expect(self._test_uses_baselines[test],
           "Something is off. generate_baseline should have not be called for test {}".format(test))

    baseline_test_dir = self.get_test_dir(self._baseline_dir, test)

    configure_cmd = (self.generate_cmake_config(self._tests_cmake_args[test])
                     + " -DSCREAM_BASELINES_ONLY=ON"
                     + " -DSCREAM_TEST_DATA_DIR={}/data".format(baseline_test_dir))

    banner = "==============================================================================="
    print(banner)
    print("Generating baseline for test {} with config '{}'".format(
        self._test_full_names[test], configure_cmd))
    print(banner)

    # Only flipped to True once both the configure and the build step succeed
    generated = False

    try:
        # We cannot just crash if we fail to generate baselines, since we would
        # not get a dashboard report if we did that. Instead, just ensure there is
        # no baseline file to compare against if there's a problem.
        status, _, errput = run_cmd("{} {}".format(configure_cmd, self._root_dir),
                                    from_dir=baseline_test_dir,
                                    verbose=True,
                                    dry_run=self._dry_run)
        if status != 0:
            print("WARNING: Failed to configure baselines:\n{}".format(errput))
        else:
            build_cmd = "make -j{} && make -j{} baseline".format(
                self._compile_res_count[test], self._testing_res_count[test])
            if self._parallel:
                # Wrap the build command with taskset, using this test's range
                first, last = self.get_taskset_range(test)
                build_cmd = "taskset -c {}-{} sh -c '{}'".format(first, last, build_cmd)

            status, _, errput = run_cmd(build_cmd,
                                        from_dir=baseline_test_dir,
                                        verbose=True,
                                        dry_run=self._dry_run)
            if status == 0:
                generated = True
            else:
                print("WARNING: Failed to create baselines:\n{}".format(errput))
    finally:
        # Clean up the directory, by removing everything but the 'data' subfolder. This must
        # happen unconditionally or else subsequent runs could be corrupted
        run_cmd_no_fail(r"find -maxdepth 1 -not -name data ! -path . -exec rm -rf {} \;",
                        from_dir=baseline_test_dir,
                        verbose=True,
                        dry_run=self._dry_run)

    if generated:
        # Store the sha used for baselines generation
        self.set_baseline_file_sha(test, commit)

    return generated