Ejemplo n.º 1
0
def build_apache_spark(build_tool, hadoop_version):
    """Build Spark against Hive v1.2.1 with the requested build tool.

    `build_tool` set to "maven" selects the Maven build; every other value
    falls through to the sbt build. `hadoop_version` is handed unchanged to
    whichever builder is chosen.
    """
    set_title_and_block("Building Spark", "BLOCK_BUILD")

    # Run rm_r on `lib_managed` before the build starts.
    rm_r("lib_managed")

    # Only the selected builder name is looked up, exactly like an if/else.
    builder = build_spark_maven if build_tool == "maven" else build_spark_sbt
    builder(hadoop_version)
Ejemplo n.º 2
0
def build_apache_spark(build_tool, extra_profiles):
    """Build Spark with the given extra profiles and the requested build tool.

    `build_tool` set to "maven" selects the Maven build; every other value
    falls through to the sbt build. `extra_profiles` is handed unchanged to
    whichever builder is chosen.
    """
    set_title_and_block("Building Spark", "BLOCK_BUILD")

    # Run rm_r on `lib_managed` before the build starts.
    rm_r("lib_managed")

    # Only the selected builder name is looked up, exactly like an if/else.
    builder = build_spark_maven if build_tool == "maven" else build_spark_sbt
    builder(extra_profiles)
Ejemplo n.º 3
0
def build_apache_spark(build_tool, hadoop_version):
    """Build Spark against Hive v1.2.1 with the requested build tool.

    `build_tool` set to "maven" selects the Maven build; every other value
    falls through to the sbt build. `hadoop_version` is handed unchanged to
    whichever builder is chosen.
    """
    set_title_and_block("Building Spark", "BLOCK_BUILD")

    # Run rm_r on `lib_managed` before the build starts.
    rm_r("lib_managed")

    # Only the selected builder name is looked up, exactly like an if/else.
    builder = build_spark_maven if build_tool == "maven" else build_spark_sbt
    builder(hadoop_version)
Ejemplo n.º 4
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java, install SparkR when `R` is found, choose the build tool and Hadoop
    profile from environment variables, determine the changed modules, run
    license and style checks, build Spark, run the MiMa check and assembly
    build (sbt only), then run the Scala, Python and R test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found.
    """
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # The result is not used in this function; the call is kept so any failure
    # while probing the Java version still surfaces here.
    # NOTE(review): confirm whether this probe is still needed.
    determine_java_version(java_exe)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine.
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py3k/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        # Nothing detected (or not a pull request build): treat everything as changed.
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks - each linter runs when no file list is available or when a
    # file relevant to that linter changed.
    if not changed_files or any(f.endswith((".scala", "scalastyle-config.xml"))
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith((".java", "checkstyle.xml",
                                            "checkstyle-suppressions.xml"))
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Ejemplo n.º 5
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java, resolve the modules named in `opts.modules` (if any), install SparkR
    when needed, choose build tool and Scala/Hadoop/Hive profiles from the
    environment, determine changed and to-be-tested modules, run license and
    style checks (skipped when specific modules were requested), build Spark,
    run MiMa and the assembly build (sbt only), then run the Scala, Python and
    R test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found. Returns early, without
    building, when specific modules were requested but none remain to test.
    """
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # When opts.modules is set, restrict the run to the modules it names
    # (comma-separated, matched against the `name` of each known module).
    should_only_test_modules = opts.modules is not None
    test_modules = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in opts.modules.split(",")]
        test_modules = [
            m for m in modules.all_modules if m.name in str_test_modules
        ]

    # Install SparkR
    if not should_only_test_modules or modules.sparkr in test_modules:
        # If tests modules are specified, we will not run R linter.
        # SparkR needs the manual SparkR installation.
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        scala_version = os.environ.get("AMPLAB_JENKINS_BUILD_SCALA_PROFILE")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE",
                                        "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE",
                                      "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py36/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally or GitHub Actions.
        build_tool = "sbt"
        scala_version = os.environ.get("SCALA_PROFILE")
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        if "GITHUB_ACTIONS" in os.environ:
            test_env = "github_actions"
        else:
            test_env = "local"

    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) + \
        get_scala_profiles(scala_version)

    print("[info] Using build tool", build_tool, "with profiles",
          *(extra_profiles + ["under environment", test_env]))

    changed_modules = []
    changed_files = []
    included_tags = []
    excluded_tags = []
    if should_only_test_modules:
        # We're likely in the forked repository
        is_apache_spark_ref = os.environ.get("APACHE_SPARK_REF", "") != ""
        # We're likely in the main repo build.
        is_github_prev_sha = os.environ.get("GITHUB_PREV_SHA", "") != ""
        # Otherwise, we're in either periodic job in Github Actions or somewhere else.

        # If we're running the tests in GitHub Actions, attempt to detect and test
        # only the affected modules.
        if test_env == "github_actions" and (is_apache_spark_ref
                                             or is_github_prev_sha):
            if is_apache_spark_ref:
                changed_files = identify_changed_files_from_git_commits(
                    "HEAD", target_ref=os.environ["APACHE_SPARK_REF"])
            elif is_github_prev_sha:
                changed_files = identify_changed_files_from_git_commits(
                    os.environ["GITHUB_SHA"],
                    target_ref=os.environ["GITHUB_PREV_SHA"])

            modules_to_test = determine_modules_to_test(
                determine_modules_for_files(changed_files), deduplicated=False)

            if modules.root not in modules_to_test:
                # If root module is not found, only test the intersected modules.
                # If root module is found, just run the modules as specified initially.
                test_modules = list(
                    set(modules_to_test).intersection(test_modules))

        changed_modules = test_modules
        if not changed_modules:
            print(
                "[info] There are no modules to test, exiting without testing."
            )
            return

    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
    # detect modules to test.
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        test_modules = determine_modules_to_test(changed_modules)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    # If there is no changed module found, tests all.
    if not changed_modules:
        changed_modules = [modules.root]
    if not test_modules:
        test_modules = determine_modules_to_test(changed_modules)

    # Tags from the command-line options are added on top of any computed above.
    if opts.excluded_tags:
        excluded_tags.extend(
            [t.strip() for t in opts.excluded_tags.split(",")])
    if opts.included_tags:
        included_tags.extend(
            [t.strip() for t in opts.included_tags.split(",")])

    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    if scala_version is not None:
        # If not set, assume this is default and doesn't need to change.
        switch_scala_version(scala_version)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        # license checks
        run_apache_rat_checks()

        # style checks - each linter runs when no file list is available or
        # when a file relevant to that linter changed.
        if not changed_files or any(
                f.endswith((".scala", "scalastyle-config.xml"))
                for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(
                f.endswith((".java", "checkstyle.xml",
                            "checkstyle-suppressions.xml"))
                for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(
                f.endswith(("lint-python", "tox.ini", ".py"))
                for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(
                f.endswith((".R", "lint-r", ".lintr"))
                for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get the Bundler gem installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests
           for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        if not os.environ.get("SKIP_MIMA"):
            detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags,
                    included_tags)

    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(modules_with_python_tests,
                         opts.parallelism,
                         with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Ejemplo n.º 6
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java and warn when its minor version is below 8, install SparkR when `R`
    is found, choose the build tool and Hadoop profile from environment
    variables, determine the changed modules, run license and style checks,
    build Spark, run the MiMa check and assembly build (sbt only), then run
    the Scala, Python and R test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found.
    """
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine.
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py3k/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)
    if not changed_modules:
        # Nothing detected (or not a pull request build): treat everything as changed.
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks - each linter runs when no file list is available or when a
    # file relevant to that linter changed.
    if not changed_files or any(f.endswith((".scala", "scalastyle-config.xml"))
                                for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith((".java", "checkstyle.xml",
                                            "checkstyle-suppressions.xml"))
                                for f in changed_files):
        # run_java_style_checks()
        pass
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()
    if not changed_files or any(f.endswith(".R") for f in changed_files):
        run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules):
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(hadoop_version)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(hadoop_version)

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Ejemplo n.º 7
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java and warn when its minor version is below 8, choose the build tool and
    Hadoop profile, determine the changed modules, run license and style
    checks, build Spark, run the MiMa check, then run the Scala, Python and R
    test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found.
    """
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    # os.environ only accepts string values; str() is a no-op for a str and
    # avoids a TypeError if the error code is stored as a number.
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine.
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py3k/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = "hadoop2.3"
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
    if not changed_modules:
        # Nothing detected (or not a pull request build): treat everything as changed.
        changed_modules = [modules.root]
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks - each linter runs when no file list is available or when a
    # file with the matching extension changed.
    if not changed_files or any(f.endswith(".scala") for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    detect_binary_inop_with_mima()

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests, opts.parallelism)
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Ejemplo n.º 8
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java and warn when its minor version is below 8, choose the build tool and
    Hadoop profile, determine the changed modules, run license and style
    checks, build Spark, run the MiMa check, then run the Scala, Python and R
    test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found.
    """
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print(
            "[error] Cannot determine your home directory as an absolute path;",
            " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    # os.environ only accepts string values; str() is a no-op for a str and
    # avoids a TypeError if the error code is stored as a number.
    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    java_version = determine_java_version(java_exe)

    if java_version.minor < 8:
        print("[warn] Java 8 tests will not run because JDK version is < 1.8.")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE",
                                        "hadoop2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine.
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py3k/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = "hadoop2.3"
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile",
          hadoop_version, "under environment", test_env)

    changed_modules = None
    changed_files = None
    if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits(
            "HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
    if not changed_modules:
        # Nothing detected (or not a pull request build): treat everything as changed.
        changed_modules = [modules.root]
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    test_modules = determine_modules_to_test(changed_modules)

    # license checks
    run_apache_rat_checks()

    # style checks - each linter runs when no file list is available or when a
    # file with the matching extension changed.
    if not changed_files or any(f.endswith(".scala") for f in changed_files):
        run_scala_style_checks()
    if not changed_files or any(f.endswith(".py") for f in changed_files):
        run_python_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    # spark build
    build_apache_spark(build_tool, hadoop_version)

    # backwards compatibility checks
    detect_binary_inop_with_mima()

    # run the test suites
    run_scala_tests(build_tool, hadoop_version, test_modules)

    modules_with_python_tests = [
        m for m in test_modules if m.python_test_goals
    ]
    if modules_with_python_tests:
        run_python_tests(modules_with_python_tests)
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()
Ejemplo n.º 9
0
def main():
    """Run the check, build and test pipeline for this checkout.

    In order: validate USER_HOME, change into SPARK_HOME, run rm_r on the
    `work` directory and the org.apache.spark entries under ~/.ivy2, locate
    Java, install SparkR when `R` is found, choose the build tool and
    Hadoop/Hive profiles from the environment, determine changed and
    to-be-tested modules (optionally restricted by TEST_ONLY_MODULES), run
    license and style checks (skipped when TEST_ONLY_MODULES is set), build
    Spark, run MiMa and the assembly build (sbt only), then run the Scala,
    Python and R test suites.

    Exits with status 1 when USER_HOME is empty or not an absolute path, and
    with status 2 when no `java` executable is found.
    """
    opts = parse_opts()
    # Ensure the user home directory (HOME) is valid and is an absolute directory
    if not USER_HOME or not os.path.isabs(USER_HOME):
        print("[error] Cannot determine your home directory as an absolute path;",
              " ensure the $HOME environment variable is set properly.")
        sys.exit(1)

    os.chdir(SPARK_HOME)

    rm_r(os.path.join(SPARK_HOME, "work"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
    rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))

    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])

    java_exe = determine_java_executable()

    if not java_exe:
        print("[error] Cannot find a version of `java` on the system; please",
              " install one and retry.")
        sys.exit(2)

    # install SparkR
    if which("R"):
        run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
    else:
        print("Cannot install SparkR as R was not found in PATH")

    if os.environ.get("AMPLAB_JENKINS"):
        # if we're on the Amplab Jenkins build servers setup variables
        # to reflect the environment settings
        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop3.2")
        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
        test_env = "amplab_jenkins"
        # add path for Python3 in Jenkins if we're calling from a Jenkins machine
        # TODO(sknapp):  after all builds are ported to the ubuntu workers, change this to be:
        # /home/jenkins/anaconda2/envs/py36/bin
        # os.environ.get("PATH") is None when PATH is unset, and concatenating
        # None to a str raises TypeError, so only append an existing value.
        jenkins_python_bin = "/home/anaconda/envs/py36/bin"
        current_path = os.environ.get("PATH")
        if current_path is not None:
            os.environ["PATH"] = jenkins_python_bin + ":" + current_path
        else:
            os.environ["PATH"] = jenkins_python_bin
    else:
        # else we're running locally and can use local settings
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
        test_env = "local"

    print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
          "and Hive profile", hive_version, "under environment", test_env)
    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

    changed_modules = None
    changed_files = None
    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
    included_tags = []
    if should_only_test_modules:
        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
        test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
        # Directly uses test_modules as changed modules to apply tags and environments
        # as if all specified test modules are changed.
        changed_modules = test_modules
        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
        excluded_tags = []
        if str_excluded_tags:
            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
        # included_tags is already an empty list at this point.
        if str_included_tags:
            included_tags = [t.strip() for t in str_included_tags.split(",")]
    elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
        target_branch = os.environ["ghprbTargetBranch"]
        changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
        changed_modules = determine_modules_for_files(changed_files)
        excluded_tags = determine_tags_to_exclude(changed_modules)

    if not changed_modules:
        # Nothing detected or requested: treat everything as changed.
        changed_modules = [modules.root]
        excluded_tags = []
    print("[info] Found the following changed modules:",
          ", ".join(x.name for x in changed_modules))

    # setup environment variables
    # note - the 'root' module doesn't collect environment variables for all modules. Because the
    # environment variables should not be set if a module is not changed, even if running the 'root'
    # module. So here we should use changed_modules rather than test_modules.
    test_environ = {}
    for m in changed_modules:
        test_environ.update(m.environ)
    setup_test_environ(test_environ)

    should_run_java_style_checks = False
    if not should_only_test_modules:
        test_modules = determine_modules_to_test(changed_modules)

        # license checks
        run_apache_rat_checks()

        # style checks - each linter runs when no file list is available or
        # when a file relevant to that linter changed.
        if not changed_files or any(f.endswith((".scala", "scalastyle-config.xml"))
                                    for f in changed_files):
            run_scala_style_checks(extra_profiles)
        if not changed_files or any(f.endswith((".java", "checkstyle.xml",
                                                "checkstyle-suppressions.xml"))
                                    for f in changed_files):
            # Run SBT Checkstyle after the build to prevent a side-effect to the build.
            should_run_java_style_checks = True
        if not changed_files or any(f.endswith(("lint-python", "tox.ini", ".py"))
                                    for f in changed_files):
            run_python_style_checks()
        if not changed_files or any(f.endswith((".R", "lint-r", ".lintr"))
                                    for f in changed_files):
            run_sparkr_style_checks()

    # determine if docs were changed and if we're inside the amplab environment
    # note - the below commented out until *all* Jenkins workers can get `jekyll` installed
    # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
    #    build_spark_documentation()

    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
        run_build_tests()

    # spark build
    build_apache_spark(build_tool, extra_profiles)

    # backwards compatibility checks
    if build_tool == "sbt":
        # Note: compatibility tests only supported in sbt for now
        detect_binary_inop_with_mima(extra_profiles)
        # Since we did not build assembly/package before running dev/mima, we need to
        # do it here because the tests still rely on it; see SPARK-13294 for details.
        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

    # run the test suites
    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
    if modules_with_python_tests:
        # We only run PySpark tests with coverage report in one specific job with
        # Spark master with SBT in Jenkins.
        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
        run_python_tests(
            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
        run_python_packaging_tests()
    if any(m.should_run_r_tests for m in test_modules):
        run_sparkr_tests()