Example 1
def get_zoo_bigdl_classpath_on_driver():
    from bigdl.util.engine import get_bigdl_classpath
    from zoo.util.engine import get_analytics_zoo_classpath
    bigdl_classpath = get_bigdl_classpath()
    assert bigdl_classpath, "Cannot find BigDL classpath, please check your installation"
    zoo_classpath = get_analytics_zoo_classpath()
    assert zoo_classpath, "Cannot find Analytics-Zoo classpath, please check your installation"
    return zoo_classpath, bigdl_classpath
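
A brief usage sketch for the helper above (a minimal sketch, assuming both BigDL and Analytics Zoo were installed via pip; the variable names are illustrative):

zoo_jar, bigdl_jar = get_zoo_bigdl_classpath_on_driver()
# Join the two jar paths into a single classpath string, e.g. for
# spark.driver.extraClassPath or a --driver-class-path argument.
driver_class_path = ":".join([zoo_jar, bigdl_jar])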
Example 2
        def _yarn_opt(jars):
            from zoo.util.engine import get_analytics_zoo_classpath
            command = " --archives {}#python_env --num-executors {} " \
                      " --executor-cores {} --executor-memory {}".\
                format(penv_archive, num_executor, executor_cores, executor_memory)
            path_to_zoo_jar = get_analytics_zoo_classpath()

            if extra_python_lib:
                command = command + " --py-files {} ".format(extra_python_lib)

            if jars:
                command = command + " --jars {},{} ".format(
                    jars, path_to_zoo_jar)
            elif path_to_zoo_jar:
                command = command + " --jars {} ".format(path_to_zoo_jar)

            if path_to_zoo_jar:
                command = command + " --conf spark.driver.extraClassPath={} ".\
                    format(get_analytics_zoo_classpath())
            return command
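
A minimal sketch of how a builder like this could be driven. In the real code _yarn_opt is nested inside an enclosing function and closes over its arguments; the values below are hypothetical stand-ins for those closure variables, assuming the function were lifted to module level:

import os

# Hypothetical values for the variables the nested function closes over.
penv_archive = "hdfs:///user/me/python_env.tar.gz"
num_executor, executor_cores, executor_memory = 2, 4, "10g"
extra_python_lib = None

submit_args = _yarn_opt(jars=None) + " pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = submit_args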
Example 3
 def copy_zoo_jar(self):
     jar_path = get_analytics_zoo_classpath()
     if jar_path:
         self.try_copy_bigdl_jar()
     else:
         """
         not install by pip, so run prepare_env here            
         """
         jar_paths = glob.glob(
             os.path.abspath(__file__ + "/../../../../dist/lib/*.jar"))
         assert len(jar_paths) > 0, "No zoo jar is found"
         assert len(jar_paths) == 1, \
             "Expecting exactly one zoo jar but found %d" % len(jar_paths)
         jar_path = jar_paths[0]
     shutil.copyfile(jar_path, self.zoo_jar)
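
A hedged sketch of how this method might be called; the owning class name and the destination path below are illustrative, and self.zoo_jar is assumed to hold the file the jar should be copied to:

packer = ZooJarPacker()                   # hypothetical owner of copy_zoo_jar
packer.zoo_jar = "/tmp/zoo_conf/zoo.jar"  # illustrative destination path
packer.copy_zoo_jar()                     # locates the Analytics Zoo jar and copies it there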
Example 4
 def _get_zoo_jar_name_on_driver(self):
     from zoo.util.engine import get_analytics_zoo_classpath
     zoo_classpath = get_analytics_zoo_classpath()
     assert zoo_classpath, "Cannot find Analytics-Zoo classpath"
     return zoo_classpath.split("/")[-1]
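
A short sketch of one way the returned file name could be used; runner is assumed to be an instance of the class defining this method, and /opt/work is an arbitrary illustrative prefix:

import os

jar_name = runner._get_zoo_jar_name_on_driver()
# Rebuild the path of the same jar under a different prefix,
# e.g. inside a container on the executors.
remote_jar = os.path.join("/opt/work", jar_name)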
Example 5
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="10g",
                              driver_memory="1g",
                              driver_cores=4,
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip
        from zoo.util.engine import get_analytics_zoo_classpath
        from bigdl.util.engine import get_bigdl_classpath

        if 'PYSPARK_PYTHON' not in os.environ:
            os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master address defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip
            }
            # The scripts installed from pip don't have execute permission,
            # so grant it first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_master_pro.pid, 0)
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            start_worker_pro = subprocess.Popen(
                "{}/sbin/start-worker.sh {}".format(zoo_standalone_home,
                                                    master),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_worker_pro.pid, 0)
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = " --master " + master
        submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                    " --executor-cores {} --executor-memory {}"\
            .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
        if extra_python_lib:
            submit_args = submit_args + " --py-files {}".format(
                extra_python_lib)
        if jars:
            submit_args = submit_args + " --jars {}".format(jars)
        submit_args = submit_args + " pyspark-shell"
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

        zoo_bigdl_jar_path = ":".join(
            [get_analytics_zoo_classpath(),
             get_bigdl_classpath()])
        spark_conf = init_spark_conf(conf) \
            .set("spark.driver.cores", driver_cores) \
            .set("spark.driver.memory", driver_memory) \
            .set("spark.executor.instances", num_executors) \
            .set("spark.executor.cores", executor_cores) \
            .set("spark.cores.max", num_executors * executor_cores) \
            .set("spark.executorEnv.PYTHONHOME",
                 "/".join(self._detect_python_location().split("/")[:-2]))
        if extra_executor_memory_for_ray:
            spark_conf.set("spark.executor.memoryOverhead",
                           extra_executor_memory_for_ray)
        if spark_conf.contains("spark.executor.extraClassPath"):
            spark_conf.set(
                "spark.executor.extraClassPath",
                "{}:{}".format(zoo_bigdl_jar_path,
                               spark_conf.get("spark.executor.extraClassPath")))
        else:
            spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

        sc = init_nncontext(spark_conf,
                            redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        return sc
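
A minimal driving sketch for the method above (the SparkRunner constructor arguments and the resource sizes are assumptions; when master is None the method first starts a local standalone master and worker):

runner = SparkRunner(spark_log_level="WARN",
                     redirect_spark_log=True)   # constructor signature assumed
sc = runner.init_spark_standalone(num_executors=2,
                                  executor_cores=4,
                                  executor_memory="10g")
print(sc.master)   # e.g. spark://<node-ip>:7077 when the local cluster was started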