def run_java_style_checks(build_profiles):
    """Run the Java Checkstyle checks under SBT with the given build profiles."""
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    # Checkstyle must run under SBT with the same profiles used for building,
    # since the previous build output is reused by Checkstyle and affects its
    # results. See SPARK-27130.
    profile_str = " ".join(build_profiles)
    print("[info] Checking Java style using SBT with these profiles: ", profile_str)
    checkstyle_script = os.path.join(SPARK_HOME, "dev", "sbt-checkstyle")
    run_cmd([checkstyle_script, profile_str])
def run_sparkr_tests():
    """Run the SparkR test suite, or skip with a message when R is unavailable."""
    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")
    if not which("R"):
        print("Ignoring SparkR tests as R was not found in PATH")
        return
    run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
    """
    Given a git commit and target ref, use the set of files changed in the diff in order to
    determine which modules' tests should be run.

    >>> [x.name for x in determine_modules_for_files( \
            identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
    ['graphx']
    >>> 'root' in [x.name for x in determine_modules_for_files( \
         identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
    True
    """
    # Exactly one of target_branch / target_ref must be supplied.
    if target_branch is None and target_ref is None:
        raise AttributeError("must specify either target_branch or target_ref")
    if target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        # Fetch the branch so that `git diff` can resolve it locally.
        run_cmd(['git', 'fetch', 'origin', str(target_branch + ':' + target_branch)])
        diff_target = target_branch
    else:
        diff_target = target_ref
    raw_output = subprocess.check_output(
        ['git', 'diff', '--name-only', patch_sha, diff_target],
        universal_newlines=True)
    # Drop the empty entries produced by the trailing newline.
    return [name for name in raw_output.split('\n') if name]
def run_python_tests(test_modules):
    """Run the PySpark unit tests for the selected modules via python/run-tests."""
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
    cmd = [os.path.join(SPARK_HOME, "python", "run-tests")]
    # Only restrict the runner when we are not testing everything ('root').
    if test_modules != [modules.root]:
        cmd.append("--modules=%s" % ','.join(m.name for m in test_modules))
    run_cmd(cmd)
def detect_binary_inop_with_mima(hadoop_version):
    """Check for binary incompatibilities with MiMa using the hadoop profile flags."""
    flags = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    profile_str = " ".join(flags)
    print("[info] Detecting binary incompatibilities with MiMa using SBT with these profiles: ",
          profile_str)
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima"), profile_str])
def exec_maven(mvn_args=()):
    """Call Maven in the current directory with the sequence of extra arguments
    ``mvn_args``, configuring a zinc incremental-compile server port first.

    ``mvn_args`` may be any sequence (the default is an empty tuple).
    """
    zinc_port = get_zinc_port()
    # Expose the chosen zinc port both via the environment and a -D flag.
    os.environ["ZINC_PORT"] = "%s" % zinc_port
    zinc_flag = "-DzincPort=%s" % zinc_port
    flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag]
    # list(...) fix: the default value is a tuple, and `list + tuple` raises
    # TypeError; converting accepts any argument sequence.
    run_cmd(flags + list(mvn_args))
def run_sparkr_style_checks():
    """Run the R linter (dev/lint-r), or skip with a message when R is unavailable."""
    set_title_and_block("Running R style checks", "BLOCK_R_STYLE")
    if not which("R"):
        print("Ignoring SparkR style check as R was not found in PATH")
        return
    # The R style check must run after `install-dev.sh`; otherwise warnings
    # about `no visible global function definition` appear. SEE ALSO: SPARK-9121.
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")])
def build_spark_documentation():
    """Build the Spark docs with jekyll; exits the process if jekyll is missing."""
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    # NOTE(review): this value looks odd ("1 jekyll build") but is preserved
    # exactly as the original scripts set it — confirm intent before changing.
    os.environ["PRODUCTION"] = "1 jekyll build"
    os.chdir(os.path.join(SPARK_HOME, "docs"))
    jekyll_bin = which("jekyll")
    if jekyll_bin:
        run_cmd([jekyll_bin, "build"])
    else:
        print("[error] Cannot find a version of `jekyll` on the system; please",
              " install one and retry to build documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    os.chdir(SPARK_HOME)
def run_pr_checks(pr_tests, ghprb_actual_commit, sha1):
    """
    Executes a set of pull request checks to ease development and report issues with various
    components such as style, linting, dependencies, compatibilities, etc.
    @return a list of messages to post back to Github
    """
    # Remember the current HEAD so each check can be run from a clean state.
    current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip()
    pr_results = []
    for pr_test in pr_tests:
        script = os.path.join(SPARK_HOME, 'dev', 'tests', pr_test + '.sh')
        output = run_cmd(['bash', script, ghprb_actual_commit, sha1],
                         return_output=True).rstrip()
        pr_results.append(output)
        # A check may move HEAD; restore the PR commit before the next one.
        run_cmd(['git', 'checkout', '-f', current_pr_head])
    return pr_results
def run_python_tests(test_modules, parallelism, with_coverage=False):
    """Run PySpark unit tests, optionally collecting and publishing coverage."""
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
    if with_coverage:
        # Coverage makes the PySpark tests flaky due to heavy parallelism.
        # When we run PySpark tests with coverage, it uses 4 for now as
        # workaround.
        parallelism = 4
        script = "run-tests-with-coverage"
    else:
        script = "run-tests"
    cmd = [os.path.join(SPARK_HOME, "python", script)]
    # Only restrict modules when not running everything ('root').
    if test_modules != [modules.root]:
        cmd.append("--modules=%s" % ','.join(m.name for m in test_modules))
    cmd.append("--parallelism=%i" % parallelism)
    run_cmd(cmd)
    if with_coverage:
        post_python_tests_results()
def main():
    """Entry point for the CI run: clean state, run license/style checks,
    build Spark, then run the Scala/Python/R test suites."""
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)
    os.chdir(SPARK_HOME)
    # Clean leftover work dirs and locally published Spark artifacts.
    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])
    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)
    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")
    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get(
            "PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        test_env = "local"
    print("[info] Using build tool", build_tool,
          "with Hadoop profile", hadoop_version,
          "under environment", test_env)
    changed_modules = None
    changed_files = None
    # On pull-request-builder runs, diff against the target branch so only
    # the affected modules are tested.
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))
    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)
    test_modules = determine_modules_to_test(changed_modules)
    # license checks
    run_apache_rat_checks()
    # style checks
    if not changed_files or any(
            f.endswith(".scala") or f.endswith("scalastyle-config.xml")
            for f in changed_files):
        build_profiles = get_hadoop_profiles(
            hadoop_version) + modules.root.build_profile_flags
        run_scala_style_checks(build_profiles)
    should_run_java_style_checks = False
    if not changed_files or any(
            f.endswith(".java") or f.endswith("checkstyle.xml")
            or f.endswith("checkstyle-suppressions.xml")
            for f in changed_files):
        # Run SBT Checkstyle after the build to prevent a side-effect to the build.
        should_run_java_style_checks = True
    if not changed_files or any(
            f.endswith("lint-python") or f.endswith("tox.ini") or f.endswith(".py")
            for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(
            f.endswith(".R") or f.endswith("lint-r") or f.endswith(".lintr")
            for f in changed_files):
        run_sparkr_style_checks()
    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #     build_spark_documentation()
    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()
    # spark build
    build_apache_spark(build_tool, hadoop_version)
    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version, should_run_java_style_checks)
    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)
    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(modules_with_python_tests,
                         opts.parallelism,
                         with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
def main():
    """Entry point for the CI run (Hadoop/Hive-profile era): clean state,
    optionally narrow to user-specified modules, run checks, build Spark,
    and run the test suites."""
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)
    os.chdir(SPARK_HOME)
    # Clean leftover work dirs and locally published Spark artifacts.
    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])
    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)
    # Install SparkR
    should_only_test_modules = opts.modules is not None
    test_modules = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in opts.modules.split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
    if not should_only_test_modules or modules.sparkr in test_modules:
        # If tests modules are specified, we will not run R linter.
        # SparkR needs the manual SparkR installation.
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")
    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally or GitHub Actions.
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        if "GITHUB_ACTIONS" in os.environ:
            test_env = "github_actions"
        else:
            test_env = "local"
    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)
    changed_modules = []
    changed_files = []
    included_tags = []
    excluded_tags = []
    if should_only_test_modules:
        # If we're running the tests in GitHub Actions, attempt to detect and test
        # only the affected modules.
        if test_env == "github_actions":
            if os.environ["GITHUB_INPUT_BRANCH"] != "":
                # Dispatched request
                # Note that it assumes GitHub Actions has already merged
                # the given `GITHUB_INPUT_BRANCH` branch.
                changed_files = identify_changed_files_from_git_commits(
                    "HEAD", target_branch=os.environ["GITHUB_SHA"])
            elif os.environ["GITHUB_BASE_REF"] != "":
                # Pull requests
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"])
            else:
                # Build for each commit.
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])
            modules_to_test = determine_modules_to_test(
                determine_modules_for_files(changed_files), deduplicated=False)
            if modules.root not in modules_to_test:
                # If root module is not found, only test the intersected modules.
                # If root module is found, just run the modules as specified initially.
                test_modules = list(set(modules_to_test).intersection(test_modules))
        changed_modules = test_modules
        if len(changed_modules) == 0:
            print("[info] There are no modules to test, exiting without testing.")
            return
    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
    # detect modules to test.
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD",
                                                                target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        test_modules = determine_modules_to_test(changed_modules)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    # If there is no changed module found, tests all.
    if not changed_modules:
        changed_modules = [modules.root]
    if not test_modules:
        test_modules = determine_modules_to_test(changed_modules)
    if opts.excluded_tags:
        excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
    if opts.included_tags:
        included_tags.extend([t.strip() for t in opts.included_tags.split(",")])
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))
    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)
    should_run_java_style_checks = False
    if not should_only_test_modules:
        # license checks
        run_apache_rat_checks()
        # style checks
        if not changed_files or any(f.endswith(".scala")
                                    or f.endswith("scalastyle-config.xml")
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith(".java")
                                    or f.endswith("checkstyle.xml")
                                    or f.endswith("checkstyle-suppressions.xml")
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith("lint-python")
                                    or f.endswith("tox.ini")
                                    or f.endswith(".py")
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith(".R")
                                    or f.endswith("lint-r")
                                    or f.endswith(".lintr")
                                    for f in changed_files):
            run_sparkr_style_checks()
        # determine if docs were changed and if we're inside the amplab environment
        # note - the below commented out until *all* Jenkins workers can get the Bundler gem installed
        # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
        #     build_spark_documentation()
    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()
    # spark build
    build_apache_spark(build_tool, extra_profiles)
    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
def detect_binary_inop_with_mima(hadoop_version):
    """Run dev/mima with the build profile flags for the given hadoop version."""
    profile_flags = get_hadoop_profiles(
        hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    cmd = [os.path.join(SPARK_HOME, "dev", "mima")]
    cmd.extend(profile_flags)
    run_cmd(cmd)
def run_python_packaging_tests():
    """Exercise pip-installability of PySpark via dev/run-pip-tests."""
    set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
    pip_test_script = os.path.join(SPARK_HOME, "dev", "run-pip-tests")
    run_cmd([pip_test_script])
def exec_maven(mvn_args=()):
    """Call Maven in the current directory with the sequence of extra
    arguments ``mvn_args`` (any sequence; the default is an empty tuple)."""
    # list(...) fix: the default value is a tuple, and `list + tuple` raises
    # TypeError; converting accepts any argument sequence.
    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + list(mvn_args))
def run_python_style_checks():
    """Run the Python linter (dev/lint-python)."""
    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
    lint_script = os.path.join(SPARK_HOME, "dev", "lint-python")
    run_cmd([lint_script])
def run_scala_style_checks():
    """Run the Scala style checker (dev/lint-scala)."""
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    lint_script = os.path.join(SPARK_HOME, "dev", "lint-scala")
    run_cmd([lint_script])
def main():
    """Entry point for the PySpark test runner: parse options, build a
    priority queue of (python_exec, test_goal) tasks, and drain it with a
    pool of worker threads."""
    opts = parse_opts()
    if (opts.verbose):
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start each run with a fresh log file.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])
    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        if "COVERAGE_PROCESS_START" in os.environ:
            # Make sure if coverage is installed.
            run_cmd([python_exec, "-c", "import coverage"])
        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT,
            universal_newlines=True).strip())
        for module in modules_to_test:
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    # These large suites get priority 0 so they start first
                    # and don't dominate the tail of the run.
                    if test_goal in ('pyspark.streaming.tests', 'pyspark.mllib.tests',
                                     'pyspark.tests', 'pyspark.sql.tests'):
                        priority = 0
                    else:
                        priority = 100
                    task_queue.put((priority, (python_exec, test_goal)))

    def process_queue(task_queue):
        # Worker loop: drain tasks until the queue is empty, always marking
        # tasks done so task_queue.join() can complete.
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        # Daemon threads so an interrupt can exit without joining workers.
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
def run_java_style_checks():
    """Run the Java linter (dev/lint-java)."""
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    lint_script = os.path.join(SPARK_HOME, "dev", "lint-java")
    run_cmd([lint_script])
def run_apache_rat_checks():
    """Run the Apache RAT license checks (dev/check-license)."""
    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
    license_script = os.path.join(SPARK_HOME, "dev", "check-license")
    run_cmd([license_script])
def detect_binary_inop_with_mima():
    """Run the MiMa binary compatibility checker (dev/mima)."""
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    mima_script = os.path.join(SPARK_HOME, "dev", "mima")
    run_cmd([mima_script])
def run_scala_style_checks(build_profiles):
    """Run the Scala style checker (dev/lint-scala) under SBT with the given profiles."""
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    profile_str = " ".join(build_profiles)
    print("[info] Checking Scala style using SBT with these profiles: ", profile_str)
    lint_script = os.path.join(SPARK_HOME, "dev", "lint-scala")
    run_cmd([lint_script, profile_str])
def post_python_tests_results():
    """Publish the generated PySpark coverage HTML reports to the
    'spark-test/pyspark-coverage-site' GitHub repository (gh-pages branch).

    Requires the SPARK_TEST_KEY environment variable for push access;
    exits the process if it is missing.
    """
    if "SPARK_TEST_KEY" not in os.environ:
        print(
            "[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
            "PySpark coverage results.")
        sys.exit(1)
    spark_test_key = os.environ.get("SPARK_TEST_KEY")
    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
    # 1. Clone PySpark coverage site.
    run_cmd([
        "git",
        "clone",
        "https://*****:*****@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key
    ])
    # 2. Remove existing HTMLs.
    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
    # 3. Copy generated coverage HTMLs.
    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
        shutil.copy(f, "pyspark-coverage-site/")
    os.chdir("pyspark-coverage-site")
    try:
        # 4. Check out to a temporary branch.
        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
        # 5. Add all the files.
        run_cmd(["git", "add", "-A"])
        # 6. Commit current HTMLs.
        run_cmd([
            "git",
            "commit",
            "-am",
            "Coverage report at latest commit in Apache Spark",
            '--author="Apache Spark Test Account <*****@*****.**>"'
        ])
        # 7. Delete the old branch.
        run_cmd(["git", "branch", "-D", "gh-pages"])
        # 8. Rename the temporary branch to master.
        run_cmd(["git", "branch", "-m", "gh-pages"])
        # 9. Finally, force update to our repository.
        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
    finally:
        os.chdir("..")
        # 10. Remove the cloned repository.
        shutil.rmtree("pyspark-coverage-site")
def main():
    """Entry point for the CI run (hadoop2.3 era): clean state, run
    license/style checks, build Spark, then run the test suites."""
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)
    os.chdir(SPARK_HOME)
    # Clean leftover work dirs and locally published Spark artifacts.
    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])
    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)
    java_version = determine_java_version(java_exe)
    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")
    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Can't install SparkR as R is was not found in PATH")
    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get(
            "PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3")
        test_env = "local"
    print("[info] Using build tool", build_tool,
          "with Hadoop profile", hadoop_version,
          "under environment", test_env)
    changed_modules = None
    changed_files = None
    # On pull-request-builder runs, diff against the target branch so only
    # affected modules are tested.
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))
    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)
    test_modules = determine_modules_to_test(changed_modules)
    # license checks
    run_apache_rat_checks()
    # style checks
    if not changed_files or any(f.endswith(".scala") for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java") for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()
    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #     build_spark_documentation()
    # spark build
    build_apache_spark(build_tool, hadoop_version)
    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatiblity tests only supported in sbt for now
        detect_binary_inop_with_mima()
    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)
    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
def detect_binary_inop_with_mima(hadoop_version):
    """Run dev/mima with the build profile flags for the given hadoop version."""
    profile_flags = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    cmd = [os.path.join(SPARK_HOME, "dev", "mima")]
    cmd.extend(profile_flags)
    run_cmd(cmd)
def post_python_tests_results():
    """Publish the generated PySpark coverage HTML reports to the
    'spark-test/pyspark-coverage-site' GitHub repository (gh-pages branch).

    Requires the SPARK_TEST_KEY environment variable for push access;
    exits the process if it is missing. Note: this variant leaves the
    cloned repository on disk after pushing.
    """
    if "SPARK_TEST_KEY" not in os.environ:
        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
              "PySpark coverage results.")
        sys.exit(1)
    spark_test_key = os.environ.get("SPARK_TEST_KEY")
    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
    # 1. Clone PySpark coverage site.
    run_cmd([
        "git",
        "clone",
        "https://*****:*****@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
    # 2. Remove existing HTMLs.
    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
    # 3. Copy generated coverage HTMLs.
    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
        shutil.copy(f, "pyspark-coverage-site/")
    os.chdir("pyspark-coverage-site")
    try:
        # 4. Check out to a temporary branch.
        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
        # 5. Add all the files.
        run_cmd(["git", "add", "-A"])
        # 6. Commit current HTMLs.
        run_cmd([
            "git",
            "commit",
            "-am",
            "Coverage report at latest commit in Apache Spark",
            '--author="Apache Spark Test Account <*****@*****.**>"'])
        # 7. Delete the old branch.
        run_cmd(["git", "branch", "-D", "gh-pages"])
        # 8. Rename the temporary branch to master.
        run_cmd(["git", "branch", "-m", "gh-pages"])
        # 9. Finally, force update to our repository.
        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
    finally:
        os.chdir("..")
def run_build_tests():
    """Run the dependency/build tests (dev/test-dependencies.sh)."""
    set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS")
    run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")])
    # Removed a dead trailing `pass` statement — it was a no-op after the
    # run_cmd call and served no purpose.
def main():
    """Entry point for the CI run (hadoop2.6 era): clean state, run
    license/style checks, build Spark, then run the test suites."""
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)
    os.chdir(SPARK_HOME)
    # Clean leftover work dirs and locally published Spark artifacts.
    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])
    java_exe = determine_java_executable()
    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)
    java_version = determine_java_version(java_exe)
    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")
    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")
    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"
    print("[info] Using build tool", build_tool,
          "with Hadoop profile", hadoop_version,
          "under environment", test_env)
    changed_modules = None
    changed_files = None
    # On pull-request-builder runs, diff against the target branch so only
    # affected modules are tested.
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD",
                                                                target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))
    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)
    test_modules = determine_modules_to_test(changed_modules)
    # license checks
    run_apache_rat_checks()
    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()
    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #     build_spark_documentation()
    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()
    # spark build
    build_apache_spark(build_tool, hadoop_version)
    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)
    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)
    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
def main():
    """Entry point: parse options, enumerate (executable, test-goal) tasks, and run
    them concurrently on a pool of worker threads, reporting total wall-clock time.
    """
    opts = parse_opts()
    log_level = logging.DEBUG if opts.verbose else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    # Start from a clean log so this run's output isn't mixed with a prior run's.
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)

    python_execs = opts.python_executables.split(',')

    # Resolve requested module names; bail out immediately on an unknown one.
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name not in python_modules:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
        modules_to_test.append(python_modules[module_name])

    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])

    # Suites known to be slow run first (priority 0) so they overlap with the rest.
    slow_goals = ('pyspark.streaming.tests', 'pyspark.mllib.tests',
                  'pyspark.tests', 'pyspark.sql.tests')

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        if "COVERAGE_PROCESS_START" in os.environ:
            # Make sure if coverage is installed.
            run_cmd([python_exec, "-c", "import coverage"])

        python_implementation = subprocess_check_output(
            [python_exec, "-c",
             "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug(
            "%s version is: %s", python_exec,
            subprocess_check_output([python_exec, "--version"],
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True).strip())

        for module in modules_to_test:
            # Skip modules that opted out of this interpreter implementation.
            if python_implementation in module.blacklisted_python_implementations:
                continue
            for test_goal in module.python_test_goals:
                priority = 0 if test_goal in slow_goals else 100
                task_queue.put((priority, (python_exec, test_goal)))

    def process_queue(task_queue):
        # Worker loop: drain tasks until the queue is empty, always marking
        # each task done so task_queue.join() can complete.
        while True:
            try:
                _, (exec_path, goal) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(goal, exec_path)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True  # don't block interpreter exit on stuck workers
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
def run_build_tests():
    """Run the dependency/build checks (dev/test-dependencies.sh) under their own
    Jenkins result block."""
    set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS")
    script = os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")
    run_cmd([script])