def determine_java_executable():
    """Will return the path of the java executable that will be used by Spark's tests or `None`"""

    # Any changes in the way that Spark's build detects java must be reflected
    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
    # the `java` executable on the path
    java_home = os.environ.get("JAVA_HOME")

    # check if there is an executable at $JAVA_HOME/bin/java
    java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
    # if the java_exe wasn't set, check for a `java` version on the $PATH
    return java_exe if java_exe else which("java")

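# Note (not part of the original script): the functions in this section rely on a which()
# helper defined elsewhere in Spark's dev/ support modules. A minimal sketch of the behavior
# they assume -- accept an absolute path or a bare program name and return the resolved
# executable path, or None -- which shutil.which from the standard library also provides:
import shutil


def which(program):
    """Return the path of `program` if it is executable (directly or via $PATH), else None."""
    return shutil.which(program)
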
def run_sparkr_tests():
    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")

    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
    else:
        print("Ignoring SparkR tests as R was not found in PATH")

def run_individual_python_test(test_name, pyspark_python):
    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
    LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    per_test_output = tempfile.TemporaryFile()
    retcode = subprocess.Popen(
        [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
        stderr=per_test_output, stdout=per_test_output, env=env).wait()
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        with FAILURE_REPORTING_LOCK:
            with open(LOG_FILE, 'ab') as log_file:
                per_test_output.seek(0)
                log_file.writelines(per_test_output.readlines())
            per_test_output.seek(0)
            for line in per_test_output:
                if not re.match('[0-9]+', line):
                    print(line, end='')
            per_test_output.close()
        print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

def get_default_python_executables():
    python_execs = [x for x in ["python2.7", "python3.4", "pypy"] if which(x)]
    if "python2.7" not in python_execs:
        LOGGER.warning("Not testing against `python2.7` because it could not be found; falling"
                       " back to `python` instead")
        python_execs.insert(0, "python")
    return python_execs

def kill_zinc_on_port(zinc_port):
    """ Kill the Zinc process running on the given port, if one exists. """
    cmd = "%s -P |grep %s | grep LISTEN | awk '{ print $2; }' | xargs kill"
    lsof_exe = which("lsof")
    subprocess.check_call(
        cmd % (lsof_exe if lsof_exe else "/usr/sbin/lsof", zinc_port), shell=True)

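# Illustrative note (not part of the original script): for zinc_port = 3030 the command
# template above expands to the shell pipeline
#   lsof -P | grep 3030 | grep LISTEN | awk '{ print $2; }' | xargs kill
# i.e. list open sockets, keep the LISTEN line for the given port, pull out the PID column,
# and pass it to kill. Hypothetical usage, assuming a Zinc server is running on that port:
# kill_zinc_on_port(3030)
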
def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })
    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

def get_default_python_executables():
    python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)]
    if "python2.6" not in python_execs:
        print("WARNING: Not testing against `python2.6` because it could not be found; falling"
              " back to `python` instead")
        python_execs.insert(0, "python")
    return python_execs

def run_sparkr_style_checks():
    set_title_and_block("Running R style checks", "BLOCK_R_STYLE")

    if which("R"):
        # R style check should be executed after `install-dev.sh`.
        # Since warnings about `no visible global function definition` appear
        # without the installation. SEE ALSO: SPARK-9121.
        run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")])
    else:
        print("Ignoring SparkR style check as R was not found in PATH")

def build_spark_documentation():
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    os.environ["PRODUCTION"] = "1 jekyll build"

    os.chdir(os.path.join(SPARK_HOME, "docs"))

    jekyll_bin = which("jekyll")

    if not jekyll_bin:
        print("[error] Cannot find a version of `jekyll` on the system; please",
              " install one and retry to build documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    else:
        run_cmd([jekyll_bin, "build"])

    os.chdir(SPARK_HOME)

def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)})
    LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        per_test_output.close()
        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

def run_individual_python_test(test_name, pyspark_python):
    env = {"SPARK_TESTING": "1", "PYSPARK_PYTHON": which(pyspark_python)}
    print(" Running test: %s ..." % test_name, end="")
    start_time = time.time()
    with open(LOG_FILE, "a") as log_file:
        retcode = subprocess.call(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=log_file,
            stdout=log_file,
            env=env)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        with open(LOG_FILE, "r") as log_file:
            for line in log_file:
                if not re.match("[0-9]+", line):
                    print(line, end="")
        print_red("\nHad test failures in %s; see logs." % test_name)
        exit(-1)
    else:
        print("ok (%is)" % duration)

def run_python_tests(test_modules, parallelism, with_coverage=False):
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")

    if with_coverage:
        # Coverage makes the PySpark tests flaky due to heavy parallelism.
        # When we run PySpark tests with coverage, it uses 4 for now as
        # workaround.
        parallelism = 4
        script = "run-tests-with-coverage"
    else:
        script = "run-tests"

    command = [os.path.join(SPARK_HOME, "python", script)]
    if test_modules != [modules.root]:
        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
    command.append("--parallelism=%i" % parallelism)
    if "GITHUB_ACTIONS" in os.environ:
        # See SPARK-33565. Python 3.9 was temporarily removed as its default Python executables
        # to test because of Jenkins environment issue. Once Jenkins has Python 3.9 to test,
        # we should remove this change back and add python3.9 into python/run-tests.py script.
        command.append("--python-executable=%s" % ','.join(
            x for x in ["python3.9", "pypy3"] if which(x)))
    run_cmd(command)

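# Illustrative example (module names are hypothetical): for two changed modules and
# parallelism=8, outside GitHub Actions, run_python_tests builds a command roughly like
#   $SPARK_HOME/python/run-tests --modules=pyspark-core,pyspark-sql --parallelism=8
# With with_coverage=True the script becomes run-tests-with-coverage and parallelism is
# forced down to 4 before the command is executed via run_cmd.
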
def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode("utf-8", "replace")
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # Here expects skipped test output from unittest when verbosity level is
            # 2 (or --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode("utf-8", "replace"), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                assert SKIPPED_TESTS is not None
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

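# For reference (illustrative, not from the script): the skipped-test filter above matches
# verbose unittest output of the form
#   test_something (pyspark.sql.tests.SomeTests) ... skipped 'reason for skipping'
# which unittest emits at verbosity level 2, so SKIPPED_TESTS ends up holding the raw
# skipped lines keyed by (python executable, test name).
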
def run_individual_python_test(target_dir, test_name, pyspark_python, failed_tests_deque):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    # This line must come before the try block, file shouldn't be closed before 'worker' is done
    per_test_output = tempfile.TemporaryFile()
    try:
        process = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            # bufsize must be 0 (unbuffered), 1 (line buffered) doesn't seem to work
            stderr=subprocess.STDOUT, stdout=subprocess.PIPE, env=env,
            bufsize=0, universal_newlines=True)

        def consume_log(output):
            for line in process.stdout:
                print("({}) {} - {}".format(pyspark_python, test_name, line), end='')
                # The temporary file is opened in binary mode, so encode before writing.
                output.write(line.encode())

        worker = Thread(target=consume_log, args=(per_test_output,))
        worker.start()
        # This is essential as we need to consume the stdout pipe
        retcode = process.wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(-1)
    duration = time.time() - start_time
    worker.join()  # Wait on the thread that consumed the output

    # If it failed, append output to LOG_FILE, add to failed tests and carry on
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            failed_tests_deque.append((test_name, pyspark_python))
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # Here expects skipped test output from unittest when verbosity level is
            # 2 (or --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info(
                "Finished test(%s): %s (%is)",
                pyspark_python, test_name, duration)

def run_individual_python_test(test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })
    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark"), test_name],
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # Here expects skipped test output from unittest when verbosity level is
            # 2 (or --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)
    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()

def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir

    # Also override the JVM's temp directory by setting driver and executor options.
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    LOGGER.info("Starting test(%s): %s", pyspark_python, test_name)
    start_time = time.time()
    try:
        per_test_output = tempfile.TemporaryFile()
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(),
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
        shutil.rmtree(tmp_dir, ignore_errors=True)
    except:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode()
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except:
            LOGGER.exception("Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # Here expects skipped test output from unittest when verbosity level is
            # 2 (or --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search(r'test_.* \(pyspark\..*\) ... skipped ', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info(
                "Finished test(%s): %s (%is)",
                pyspark_python, test_name, duration)

def run_individual_python_test(target_dir, test_name, pyspark_python):
    env = dict(os.environ)
    env.update({
        'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
        'SPARK_TESTING': '1',
        'SPARK_PREPEND_CLASSES': '1',
        'PYSPARK_PYTHON': which(pyspark_python),
        'PYSPARK_DRIVER_PYTHON': which(pyspark_python),
        # Preserve legacy nested timezone behavior for pyarrow>=2, remove after SPARK-32285
        'PYARROW_IGNORE_TIMEZONE': '1',
    })

    # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is
    # recognized by the tempfile module to override the default system temp directory.
    tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    while os.path.isdir(tmp_dir):
        tmp_dir = os.path.join(target_dir, str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    env["TMPDIR"] = tmp_dir
    metastore_dir = os.path.join(tmp_dir, str(uuid.uuid4()))
    while os.path.isdir(metastore_dir):
        metastore_dir = os.path.join(metastore_dir, str(uuid.uuid4()))
    os.mkdir(metastore_dir)

    # Also override the JVM's temp directory by setting driver and executor options.
    java_options = "-Djava.io.tmpdir={0}".format(tmp_dir)
    java_options = java_options + " -Dio.netty.tryReflectionSetAccessible=true -Xss4M"
    spark_args = [
        "--conf", "spark.driver.extraJavaOptions='{0}'".format(java_options),
        "--conf", "spark.executor.extraJavaOptions='{0}'".format(java_options),
        "--conf", "spark.sql.warehouse.dir='{0}'".format(metastore_dir),
        "pyspark-shell"
    ]
    env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args)

    output_prefix = get_valid_filename(pyspark_python + "__" + test_name + "__").lstrip("_")
    per_test_output = tempfile.NamedTemporaryFile(prefix=output_prefix, suffix=".log")
    LOGGER.info(
        "Starting test(%s): %s (temp output: %s)", pyspark_python, test_name, per_test_output.name)
    start_time = time.time()
    try:
        retcode = subprocess.Popen(
            [os.path.join(SPARK_HOME, "bin/pyspark")] + test_name.split(),
            stderr=per_test_output, stdout=per_test_output, env=env).wait()
        # There exists a race condition in Python and it causes flakiness in MacOS
        # https://github.com/python/cpython/issues/73885
        if platform.system() == "Darwin":
            os.system("rm -rf " + tmp_dir)
        else:
            shutil.rmtree(tmp_dir, ignore_errors=True)
    except BaseException:
        LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python)
        # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
        # this code is invoked from a thread other than the main thread.
        os._exit(1)
    duration = time.time() - start_time
    # Exit on the first failure.
    if retcode != 0:
        try:
            with FAILURE_REPORTING_LOCK:
                with open(LOG_FILE, 'ab') as log_file:
                    per_test_output.seek(0)
                    log_file.writelines(per_test_output)
                per_test_output.seek(0)
                for line in per_test_output:
                    decoded_line = line.decode("utf-8", "replace")
                    if not re.match('[0-9]+', decoded_line):
                        print(decoded_line, end='')
                per_test_output.close()
        except BaseException:
            LOGGER.exception(
                "Got an exception while trying to print failed test output")
        finally:
            print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python))
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
    else:
        skipped_counts = 0
        try:
            per_test_output.seek(0)
            # Here expects skipped test output from unittest when verbosity level is
            # 2 (or --verbose option is enabled).
            decoded_lines = map(lambda line: line.decode("utf-8", "replace"), iter(per_test_output))
            skipped_tests = list(filter(
                lambda line: re.search(r'test_.* \(pyspark\..*\) ... (skip|SKIP)', line),
                decoded_lines))
            skipped_counts = len(skipped_tests)
            if skipped_counts > 0:
                key = (pyspark_python, test_name)
                assert SKIPPED_TESTS is not None
                SKIPPED_TESTS[key] = skipped_tests
            per_test_output.close()
        except BaseException:
            import traceback
            print_red("\nGot an exception while trying to store "
                      "skipped test output:\n%s" % traceback.format_exc())
            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
            # this code is invoked from a thread other than the main thread.
            os._exit(-1)
        if skipped_counts != 0:
            LOGGER.info(
                "Finished test(%s): %s (%is) ... %s tests were skipped",
                pyspark_python, test_name, duration, skipped_counts)
        else:
            LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)

def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # Install SparkR
    should_only_test_modules = opts.modules is not None
    test_modules = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in opts.modules.split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]

    if not should_only_test_modules or modules.sparkr in test_modules:
        # If tests modules are specified, we will not run R linter.
        # SparkR needs the manual SparkR installation.
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        scala_version = os.environ.get("AMPLAB_JENKINS_BUILD_SCALA_PROFILE")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
        #  /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally or GitHub Actions.
        build_tool = "sbt"
        scala_version = os.environ.get("SCALA_PROFILE")
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        if "GITHUB_ACTIONS" in os.environ:
            test_env = "github_actions"
        else:
            test_env = "local"

    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) + \
        get_scala_profiles(scala_version)

    print("[info] Using build tool", build_tool, "with profiles",
          *(extra_profiles + ["under environment", test_env]))

    changed_modules = []
    changed_files = []
    included_tags = []
    excluded_tags = []
    if should_only_test_modules:
        # We're likely in the forked repository
        is_apache_spark_ref = os.environ.get("APACHE_SPARK_REF", "") != ""
        # We're likely in the main repo build.
        is_github_prev_sha = os.environ.get("GITHUB_PREV_SHA", "") != ""
        # Otherwise, we're in either periodic job in Github Actions or somewhere else.

        # If we're running the tests in GitHub Actions, attempt to detect and test
        # only the affected modules.
        if test_env == "github_actions" and (is_apache_spark_ref or is_github_prev_sha):
            if is_apache_spark_ref:
                changed_files = identify_changed_files_from_git_commits(
                    "HEAD", target_ref=os.environ["APACHE_SPARK_REF"])
            elif is_github_prev_sha:
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])

            modules_to_test = determine_modules_to_test(
                determine_modules_for_files(changed_files), deduplicated=False)

            if modules.root not in modules_to_test:
                # If root module is not found, only test the intersected modules.
                # If root module is found, just run the modules as specified initially.
                test_modules = list(set(modules_to_test).intersection(test_modules))

            changed_modules = test_modules
            if len(changed_modules) == 0:
                print("[info] There are no modules to test, exiting without testing.")
                return
    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
    # detect modules to test.
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        test_modules = determine_modules_to_test(changed_modules)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    # If there is no changed module found, tests all.
    if not changed_modules:
        changed_modules = [modules.root]
    if not test_modules:
        test_modules = determine_modules_to_test(changed_modules)

    if opts.excluded_tags:
        excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
    if opts.included_tags:
        included_tags.extend([t.strip() for t in opts.included_tags.split(",")])

    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    if scala_version is not None:
        # If not set, assume this is default and doesn't need to change.
        switch_scala_version(scala_version)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        # license checks
        run_apache_rat_checks()

        # style checks
        if not changed_files or any(f.endswith(".scala")
                                    or f.endswith("scalastyle-config.xml")
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith(".java")
                                    or f.endswith("checkstyle.xml")
                                    or f.endswith("checkstyle-suppressions.xml")
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith("lint-python")
                                    or f.endswith("tox.ini")
                                    or f.endswith(".py")
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith(".R")
                                    or f.endswith("lint-r")
                                    or f.endswith(".lintr")
                                    for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get the Bundler gem installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        if not os.environ.get("SKIP_MIMA"):
            detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()

def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()

def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
        #  /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

    changed_modules = None
    changed_files = None
    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
    included_tags = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
        # Directly uses test_modules as changed modules to apply tags and environments
        # as if all specified test modules are changed.
        changed_modules = test_modules
        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
        excluded_tags = []
        if str_excluded_tags:
            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
        included_tags = []
        if str_included_tags:
            included_tags = [t.strip() for t in str_included_tags.split(",")]
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        test_modules = determine_modules_to_test(changed_modules)

        # license checks
        run_apache_rat_checks()

        # style checks
        if not changed_files or any(f.endswith(".scala")
                                    or f.endswith("scalastyle-config.xml")
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith(".java")
                                    or f.endswith("checkstyle.xml")
                                    or f.endswith("checkstyle-suppressions.xml")
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith("lint-python")
                                    or f.endswith("tox.ini")
                                    or f.endswith(".py")
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith(".R")
                                    or f.endswith("lint-r")
                                    or f.endswith(".lintr")
                                    for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
