Example #1
def run_java_style_checks(build_profiles):
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    # The same profiles used for building are also used to run Checkstyle via SBT; otherwise
    # the previous build state appears to be reused, which affects the Checkstyle run. See SPARK-27130.
    profiles = " ".join(build_profiles)
    print("[info] Checking Java style using SBT with these profiles: ", profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "sbt-checkstyle"), profiles])
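All of these examples lean on helpers from Spark's dev/sparktestsupport package (SPARK_HOME, ERROR_CODES, run_cmd, set_title_and_block, which, ...) that the snippets do not show. Purely as a reading aid, a minimal run_cmd stand-in might look like the sketch below; the details are assumptions, not Spark's actual implementation.

import os
import subprocess
import sys

def run_cmd(cmd, return_output=False):
    # Hypothetical stand-in: run a command list, optionally capture its stdout,
    # and exit with the current block's error code if the command fails
    # (mirroring the CURRENT_BLOCK convention used in these examples).
    if not isinstance(cmd, list):
        cmd = cmd.split()
    try:
        if return_output:
            return subprocess.check_output(cmd, universal_newlines=True)
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))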
Example #2
def run_sparkr_tests():
    set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")

    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
    else:
        print("Ignoring SparkR tests as R was not found in PATH")
Example #3
def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
    """
    Given a git commit and a target branch or ref, return the list of files changed in the
    diff; this list is used to determine which modules' tests should be run.

    >>> [x.name for x in determine_modules_for_files( \
            identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
    ['graphx']
    >>> 'root' in [x.name for x in determine_modules_for_files( \
         identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
    True
    """
    if target_branch is None and target_ref is None:
        raise AttributeError("must specify either target_branch or target_ref")
    elif target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        diff_target = target_branch
        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
    else:
        diff_target = target_ref
    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
                                         universal_newlines=True)
    # Remove any empty strings
    return [f for f in raw_output.split('\n') if f]
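For readers who want the underlying git calls in isolation, a rough standalone equivalent of calling the function above with a target branch; the branch and commit values are placeholders, not values from a real run.

import subprocess

# Placeholder values; real callers pass Jenkins/GitHub environment values instead.
patch_sha, target_branch = "HEAD", "master"

# Mirror of the fetch + diff performed by identify_changed_files_from_git_commits.
subprocess.check_call(["git", "fetch", "origin", target_branch + ":" + target_branch])
raw = subprocess.check_output(["git", "diff", "--name-only", patch_sha, target_branch],
                              universal_newlines=True)
changed_files = [f for f in raw.split("\n") if f]
print(changed_files)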
Example #4
def run_python_tests(test_modules):
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")

    command = [os.path.join(SPARK_HOME, "python", "run-tests")]
    if test_modules != [modules.root]:
        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
    run_cmd(command)
Example #5
def detect_binary_inop_with_mima(hadoop_version):
    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    profiles = " ".join(build_profiles)
    print("[info] Detecting binary incompatibilities with MiMa using SBT with these profiles: ",
          profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima"), profiles])
Example #6
def exec_maven(mvn_args=()):
    """Will call Maven in the current directory with the list of mvn_args passed
    in and returns the subprocess for any further processing"""

    zinc_port = get_zinc_port()
    os.environ["ZINC_PORT"] = "%s" % zinc_port
    zinc_flag = "-DzincPort=%s" % zinc_port
    flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag]
    run_cmd(flags + mvn_args)
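get_zinc_port is not shown in these examples; a plausible stand-in that simply picks a per-run port for the Zinc incremental-compile server. The port range is an assumption.

import random

def get_zinc_port():
    # Hypothetical: choose a random port so concurrent builds on one machine
    # don't fight over the same Zinc server.
    return random.randrange(3030, 4030)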
Example #7
def run_sparkr_style_checks():
    set_title_and_block("Running R style checks", "BLOCK_R_STYLE")

    if which("R"):
        # The R style check should be executed after `install-dev.sh`, since warnings about
        # `no visible global function definition` appear without the installation.
        # See also SPARK-9121.
        run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")])
    else:
        print("Ignoring SparkR style check as R was not found in PATH")
Example #8
def build_spark_documentation():
    set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
    os.environ["PRODUCTION"] = "1 jekyll build"

    os.chdir(os.path.join(SPARK_HOME, "docs"))

    jekyll_bin = which("jekyll")

    if not jekyll_bin:
        print("[error] Cannot find a version of `jekyll` on the system; please",
              " install one and retry to build documentation.")
        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
    else:
        run_cmd([jekyll_bin, "build"])

    os.chdir(SPARK_HOME)
Example #9
def run_pr_checks(pr_tests, ghprb_actual_commit, sha1):
    """
    Executes a set of pull request checks to ease development and report issues with various
    components such as style, linting, dependencies, and compatibility.
    @return a list of messages to post back to GitHub
    """
    # Ensure we save off the current HEAD to revert to
    current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip()
    pr_results = list()

    for pr_test in pr_tests:
        test_name = pr_test + '.sh'
        pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name),
                                   ghprb_actual_commit, sha1],
                                  return_output=True).rstrip())
        # Ensure, after each test, that we're back on the current PR
        run_cmd(['git', 'checkout', '-f', current_pr_head])
    return pr_results
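A hedged usage sketch for the function above; the test names and SHAs below are placeholders rather than values taken from a real Jenkins run.

# Each name maps to a dev/tests/<name>.sh script; the SHAs identify the PR merge
# commit and the PR head. All values here are illustrative only.
messages = run_pr_checks(
    pr_tests=["pr_merge_ability", "pr_public_classes"],
    ghprb_actual_commit="0123456789abcdef",
    sha1="fedcba9876543210")
for message in messages:
    print(message)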
Example #10
def run_python_tests(test_modules, parallelism, with_coverage=False):
    set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")

    if with_coverage:
        # Coverage makes the PySpark tests flaky due to heavy parallelism.
        # As a workaround, cap the parallelism at 4 when running PySpark tests with coverage.
        parallelism = 4
        script = "run-tests-with-coverage"
    else:
        script = "run-tests"
    command = [os.path.join(SPARK_HOME, "python", script)]
    if test_modules != [modules.root]:
        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
    command.append("--parallelism=%i" % parallelism)
    run_cmd(command)

    if with_coverage:
        post_python_tests_results()
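An illustrative call into the function above; the module object is assumed to come from dev/sparktestsupport/modules.py and the values are placeholders.

# With coverage enabled, parallelism is capped at 4 regardless of the value passed in.
run_python_tests([modules.pyspark_sql], parallelism=8, with_coverage=True)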
Example #11
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute path
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE",
                                        "hadoop2.7")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get(
            "PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile",
          hadoop_version, "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules, because
    # environment variables should not be set for unchanged modules even when running the
    # 'root' module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(
            f.endswith(".scala") or f.endswith("scalastyle-config.xml")
            for f in changed_files):
        build_profiles = get_hadoop_profiles(
            hadoop_version) + modules.root.build_profile_flags
        run_scala_style_checks(build_profiles)
    should_run_java_style_checks = False
    if not changed_files or any(
            f.endswith(".java") or f.endswith("checkstyle.xml")
            or f.endswith("checkstyle-suppressions.xml")
            for f in changed_files):
        # Run SBT Checkstyle after the build to avoid side effects on the build.
        should_run_java_style_checks = True
    if not changed_files or any(
            f.endswith("lint-python") or f.endswith("tox.ini")
            or f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(
            f.endswith(".R") or f.endswith("lint-r") or f.endswith(".lintr")
            for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        # PySpark tests with a coverage report are only run in one specific Jenkins job:
        # the Spark master build with SBT.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(modules_with_python_tests,
                         opts.parallelism,
                         with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
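The repeated `not changed_files or any(f.endswith(...))` gates in the style-check section of main() above can be read as one small predicate; a hedged restatement for clarity, not code that exists in Spark.

def should_run_check(changed_files, suffixes):
    # Run a style check when we don't know what changed, or when any changed
    # file ends with one of the given suffixes (mirrors the gating in main()).
    return not changed_files or any(
        f.endswith(suffix) for f in changed_files for suffix in suffixes)

# e.g. should_run_check(changed_files, (".scala", "scalastyle-config.xml"))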
Example #12
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute path
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # Install SparkR
    should_only_test_modules = opts.modules is not None
    test_modules = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in opts.modules.split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]

    if not should_only_test_modules or modules.sparkr in test_modules:
        # If test modules are specified, the R linter is not run, but SparkR
        # tests still require this manual SparkR installation.
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally or on GitHub Actions.
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        if "GITHUB_ACTIONS" in os.environ:
            test_env = "github_actions"
        else:
            test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

    changed_modules = []
    changed_files = []
    included_tags = []
    excluded_tags = []
    if should_only_test_modules:
        # If we're running the tests in GitHub Actions, attempt to detect and test
        # only the affected modules.
        if test_env == "github_actions":
            if os.environ["GITHUB_INPUT_BRANCH"] != "":
                # Dispatched request
                # Note that it assumes GitHub Actions has already merged
                # the given `GITHUB_INPUT_BRANCH` branch.
                changed_files = identify_changed_files_from_git_commits(
                    "HEAD", target_branch=os.environ["GITHUB_SHA"])
            elif os.environ["GITHUB_BASE_REF"] != "":
                # Pull requests
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"])
            else:
                # Build for each commit.
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])

            modules_to_test = determine_modules_to_test(
                determine_modules_for_files(changed_files), deduplicated=False)

            if modules.root not in modules_to_test:
                # If root module is not found, only test the intersected modules.
                # If root module is found, just run the modules as specified initially.
                test_modules = list(set(modules_to_test).intersection(test_modules))

        changed_modules = test_modules
        if len(changed_modules) == 0:
            print("[info] There are no modules to test, exiting without testing.")
            return

    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
    # detect modules to test.
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        test_modules = determine_modules_to_test(changed_modules)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    # If no changed modules were found, test everything.
    if not changed_modules:
        changed_modules = [modules.root]
    if not test_modules:
        test_modules = determine_modules_to_test(changed_modules)

    if opts.excluded_tags:
        excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
    if opts.included_tags:
        included_tags.extend([t.strip() for t in opts.included_tags.split(",")])

    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules, because
    # environment variables should not be set for unchanged modules even when running the
    # 'root' module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        # license checks
        run_apache_rat_checks()

        # style checks
        if not changed_files or any(f.endswith(".scala")
                                    or f.endswith("scalastyle-config.xml")
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith(".java")
                                    or f.endswith("checkstyle.xml")
                                    or f.endswith("checkstyle-suppressions.xml")
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to avoid side effects on the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith("lint-python")
                                    or f.endswith("tox.ini")
                                    or f.endswith(".py")
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith(".R")
                                    or f.endswith("lint-r")
                                    or f.endswith(".lintr")
                                    for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get the Bundler gem installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # PySpark tests with a coverage report are only run in one specific Jenkins job:
        # the Spark master build with SBT.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
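Both versions of main() rely on determine_modules_for_files / determine_modules_to_test from sparktestsupport, which are not shown here. A toy illustration of the dependency-closure idea behind selecting test modules, with invented module objects (this is not Spark's real module graph):

class Module(object):
    def __init__(self, name):
        self.name = name
        self.dependent_modules = []  # modules that depend on this one

core, sql, mllib = Module("core"), Module("sql"), Module("mllib")
core.dependent_modules = [sql, mllib]
sql.dependent_modules = [mllib]

def toy_modules_to_test(changed):
    # Test every changed module plus everything that transitively depends on it.
    selected, stack = set(), list(changed)
    while stack:
        m = stack.pop()
        if m not in selected:
            selected.add(m)
            stack.extend(m.dependent_modules)
    return selected

print(sorted(m.name for m in toy_modules_to_test([core])))  # ['core', 'mllib', 'sql']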
Example #13
def detect_binary_inop_with_mima(hadoop_version):
    build_profiles = get_hadoop_profiles(
        hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa",
                        "BLOCK_MIMA")
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")] + build_profiles)
Example #14
def run_python_packaging_tests():
    set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS")
    command = [os.path.join(SPARK_HOME, "dev", "run-pip-tests")]
    run_cmd(command)
Example #15
def exec_maven(mvn_args=()):
    """Will call Maven in the current directory with the list of mvn_args passed
    in and returns the subprocess for any further processing"""

    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
Example #16
def run_python_style_checks():
    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
Example #17
def run_scala_style_checks():
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
Example #18
def run_python_packaging_tests():
    set_title_and_block("Running PySpark packaging tests",
                        "BLOCK_PYSPARK_PIP_TESTS")
    command = [os.path.join(SPARK_HOME, "dev", "run-pip-tests")]
    run_cmd(command)
Example #19
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s", python_execs)
    LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        if "COVERAGE_PROCESS_START" in os.environ:
            # Make sure coverage is installed.
            run_cmd([python_exec, "-c", "import coverage"])
        python_implementation = subprocess_check_output(
            [python_exec, "-c", "import platform; print(platform.python_implementation())"],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation)
        LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output(
            [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip())
        for module in modules_to_test:
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    if test_goal in ('pyspark.streaming.tests', 'pyspark.mllib.tests',
                                     'pyspark.tests', 'pyspark.sql.tests'):
                        priority = 0
                    else:
                        priority = 100
                    task_queue.put((priority, (python_exec, test_goal)))

    def process_queue(task_queue):
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue,))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
Example #20
def run_python_style_checks():
    set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
Example #21
def run_java_style_checks():
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-java")])
Example #22
def run_scala_style_checks():
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
Example #23
def run_apache_rat_checks():
    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
    run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
Example #24
def detect_binary_inop_with_mima():
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
Example #25
def run_apache_rat_checks():
    set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
    run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
Example #26
def run_scala_style_checks(build_profiles):
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    profiles = " ".join(build_profiles)
    print("[info] Checking Scala style using SBT with these profiles: ", profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala"), profiles])
Example #27
def run_java_style_checks():
    set_title_and_block("Running Java style checks", "BLOCK_JAVA_STYLE")
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-java")])
Example #28
def post_python_tests_results():
    if "SPARK_TEST_KEY" not in os.environ:
        print(
            "[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
            "PySpark coverage results.")
        sys.exit(1)
    spark_test_key = os.environ.get("SPARK_TEST_KEY")
    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
    # 1. Clone PySpark coverage site.
    run_cmd([
        "git", "clone",
        "https://*****:*****@github.com/spark-test/pyspark-coverage-site.git"
        % spark_test_key
    ])
    # 2. Remove existing HTMLs.
    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
    # 3. Copy generated coverage HTMLs.
    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
        shutil.copy(f, "pyspark-coverage-site/")
    os.chdir("pyspark-coverage-site")
    try:
        # 4. Check out to a temporary branch.
        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
        # 5. Add all the files.
        run_cmd(["git", "add", "-A"])
        # 6. Commit current HTMLs.
        run_cmd([
            "git", "commit", "-am",
            "Coverage report at latest commit in Apache Spark",
            '--author="Apache Spark Test Account <*****@*****.**>"'
        ])
        # 7. Delete the old branch.
        run_cmd(["git", "branch", "-D", "gh-pages"])
        # 8. Rename the temporary branch to master.
        run_cmd(["git", "branch", "-m", "gh-pages"])
        # 9. Finally, force update to our repository.
        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
    finally:
        os.chdir("..")
        # 10. Remove the cloned repository.
        shutil.rmtree("pyspark-coverage-site")
Example #29
def detect_binary_inop_with_mima():
    set_title_and_block("Detecting binary incompatibilities with MiMa",
                        "BLOCK_MIMA")
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
Example #30
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute path
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Can't install SparkR as R is was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE",
                                        "hadoop2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get(
            "PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile",
          hadoop_version, "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules, because
    # environment variables should not be set for unchanged modules even when running the
    # 'root' module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala") for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java") for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima()

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Example #31
def detect_binary_inop_with_mima(hadoop_version):
    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
    set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
    run_cmd([os.path.join(SPARK_HOME, "dev", "mima")] + build_profiles)
Example #32
def post_python_tests_results():
    if "SPARK_TEST_KEY" not in os.environ:
        print("[error] 'SPARK_TEST_KEY' environment variable was not set. Unable to post "
              "PySpark coverage results.")
        sys.exit(1)
    spark_test_key = os.environ.get("SPARK_TEST_KEY")
    # The steps below upload HTMLs to 'github.com/spark-test/pyspark-coverage-site'.
    # 1. Clone PySpark coverage site.
    run_cmd([
        "git",
        "clone",
        "https://*****:*****@github.com/spark-test/pyspark-coverage-site.git" % spark_test_key])
    # 2. Remove existing HTMLs.
    run_cmd(["rm", "-fr"] + glob.glob("pyspark-coverage-site/*"))
    # 3. Copy generated coverage HTMLs.
    for f in glob.glob("%s/python/test_coverage/htmlcov/*" % SPARK_HOME):
        shutil.copy(f, "pyspark-coverage-site/")
    os.chdir("pyspark-coverage-site")
    try:
        # 4. Check out to a temporary branch.
        run_cmd(["git", "symbolic-ref", "HEAD", "refs/heads/latest_branch"])
        # 5. Add all the files.
        run_cmd(["git", "add", "-A"])
        # 6. Commit current HTMLs.
        run_cmd([
            "git",
            "commit",
            "-am",
            "Coverage report at latest commit in Apache Spark",
            '--author="Apache Spark Test Account <*****@*****.**>"'])
        # 7. Delete the old branch.
        run_cmd(["git", "branch", "-D", "gh-pages"])
        # 8. Rename the temporary branch to master.
        run_cmd(["git", "branch", "-m", "gh-pages"])
        # 9. Finally, force update to our repository.
        run_cmd(["git", "push", "-f", "origin", "gh-pages"])
    finally:
        os.chdir("..")
Example #33
def run_build_tests():
    set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS")
    run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")])
Example #34
def run_scala_style_checks(build_profiles):
    set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
    profiles = " ".join(build_profiles)
    print("[info] Checking Scala style using SBT with these profiles: ",
          profiles)
    run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala"), profiles])
Example #35
def main():
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute path
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers, set up variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules, because
    # environment variables should not be set for unchanged modules even when running the
    # 'root' module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks
    if not changed_files or any(f.endswith(".scala")
                                or f.endswith("scalastyle-config.xml")
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".java")
                                or f.endswith("checkstyle.xml")
                                or f.endswith("checkstyle-suppressions.xml")
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Example #36
def exec_maven(mvn_args=()):
    """Will call Maven in the current directory with the list of mvn_args passed
    in and returns the subprocess for any further processing"""

    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
Example #37
def main():
    opts = parse_opts()
    if opts.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(stream=sys.stdout,
                        level=log_level,
                        format="%(message)s")
    LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE)
    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)
    python_execs = opts.python_executables.split(',')
    modules_to_test = []
    for module_name in opts.modules.split(','):
        if module_name in python_modules:
            modules_to_test.append(python_modules[module_name])
        else:
            print("Error: unrecognized module '%s'. Supported modules: %s" %
                  (module_name, ", ".join(python_modules)))
            sys.exit(-1)
    LOGGER.info("Will test against the following Python executables: %s",
                python_execs)
    LOGGER.info("Will test the following Python modules: %s",
                [x.name for x in modules_to_test])

    task_queue = Queue.PriorityQueue()
    for python_exec in python_execs:
        if "COVERAGE_PROCESS_START" in os.environ:
            # Make sure coverage is installed.
            run_cmd([python_exec, "-c", "import coverage"])
        python_implementation = subprocess_check_output(
            [
                python_exec, "-c",
                "import platform; print(platform.python_implementation())"
            ],
            universal_newlines=True).strip()
        LOGGER.debug("%s python_implementation is %s", python_exec,
                     python_implementation)
        LOGGER.debug(
            "%s version is: %s", python_exec,
            subprocess_check_output([python_exec, "--version"],
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True).strip())
        for module in modules_to_test:
            if python_implementation not in module.blacklisted_python_implementations:
                for test_goal in module.python_test_goals:
                    if test_goal in ('pyspark.streaming.tests',
                                     'pyspark.mllib.tests', 'pyspark.tests',
                                     'pyspark.sql.tests'):
                        priority = 0
                    else:
                        priority = 100
                    task_queue.put((priority, (python_exec, test_goal)))

    def process_queue(task_queue):
        while True:
            try:
                (priority, (python_exec, test_goal)) = task_queue.get_nowait()
            except Queue.Empty:
                break
            try:
                run_individual_python_test(test_goal, python_exec)
            finally:
                task_queue.task_done()

    start_time = time.time()
    for _ in range(opts.parallelism):
        worker = Thread(target=process_queue, args=(task_queue, ))
        worker.daemon = True
        worker.start()
    try:
        task_queue.join()
    except (KeyboardInterrupt, SystemExit):
        print_red("Exiting due to interrupt")
        sys.exit(-1)
    total_duration = time.time() - start_time
    LOGGER.info("Tests passed in %i seconds", total_duration)
Example #38
def run_build_tests():
    set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS")
    run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")])