import sys

from bigdl.util.engine import get_bigdl_classpath


class Configuration(object):
    __bigdl_jars = [get_bigdl_classpath()]

    @staticmethod
    def add_extra_jars(jars):
        """
        Add extra jars to classpath

        :param jars: a string or a list of strings as jar paths
        """
        import six
        if isinstance(jars, six.string_types):
            jars = [jars]
        Configuration.__bigdl_jars += jars

    @staticmethod
    def add_extra_python_modules(packages):
        """
        Add extra python modules to sys.path

        :param packages: a string or a list of strings as python package paths
        """
        import six
        if isinstance(packages, six.string_types):
            packages = [packages]
        for package in packages:
            sys.path.insert(0, package)

    @staticmethod
    def get_bigdl_jars():
        return Configuration.__bigdl_jars
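# Usage sketch (illustrative, not from the source): register an extra jar and a
# local Python package before the JVM starts. The paths below are hypothetical
# placeholders.
Configuration.add_extra_jars("/path/to/my-extra.jar")
Configuration.add_extra_python_modules(["/path/to/my_package"])
print(Configuration.get_bigdl_jars())  # the BigDL jar plus the extra jar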
from pyspark import SparkConf
from bigdl.util.engine import get_bigdl_classpath


def create_spark_conf():
    # get_bigdl_conf, is_spark_below_2_2 and extend_spark_driver_cp are helper
    # functions defined alongside this one in BigDL's util module.
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    # Only extend the driver classpath on Spark 2.2+.
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())
    return sparkConf
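# Minimal usage sketch (assuming a local pyspark installation; the app name and
# master URL are illustrative choices, not from the source):
from pyspark import SparkContext

conf = create_spark_conf().setAppName("bigdl-demo").setMaster("local[4]")
sc = SparkContext.getOrCreate(conf=conf)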
def get_zoo_bigdl_classpath_on_driver():
    from bigdl.util.engine import get_bigdl_classpath
    from zoo.util.engine import get_analytics_zoo_classpath
    bigdl_classpath = get_bigdl_classpath()
    assert bigdl_classpath, "Cannot find BigDL classpath, please check your installation"
    zoo_classpath = get_analytics_zoo_classpath()
    assert zoo_classpath, "Cannot find Analytics-Zoo classpath, please check your installation"
    return zoo_classpath, bigdl_classpath
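# A sketch of how the returned pair can be joined into a single driver
# classpath entry (an assumption for illustration, not from the source):
def zoo_bigdl_driver_cp():
    zoo_cp, bigdl_cp = get_zoo_bigdl_classpath_on_driver()
    return ":".join([zoo_cp, bigdl_cp])  # e.g. for spark.driver.extraClassPath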
import shutil


def try_copy_bigdl_jar(self):
    try:
        from bigdl.util.engine import get_bigdl_classpath
        shutil.copyfile(get_bigdl_classpath(), self.bigdl_jar)
    except Exception:
        print("WARNING: if you are running Cluster Serving using pip, you have "
              "misconfigured the bigdl python package; otherwise, ignore this WARNING.")
import shutil


def try_copy_bigdl_jar(self):
    try:
        from bigdl.util.engine import get_bigdl_classpath
        bigdl_jar_src = get_bigdl_classpath()
        if bigdl_jar_src == "":
            raise Exception("BigDL jar not discovered.")
        shutil.copyfile(bigdl_jar_src, self.bigdl_jar)
        print("BigDL jar copied from ", bigdl_jar_src)
    except Exception:
        print("WARNING: if you are running Cluster Serving using pip, you have "
              "misconfigured the bigdl python package; otherwise, ignore this WARNING.")
import os

from pyspark import SparkConf
from bigdl.util.engine import get_bigdl_classpath


def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # Add the content of PYSPARK_FILES to spark.submit.pyFiles.
    # This is a workaround for current Spark on K8s.
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
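# Usage sketch for the K8s workaround above (the archive path is a hypothetical
# placeholder): export PYSPARK_FILES before building the conf so it is folded
# into spark.submit.pyFiles.
os.environ["PYSPARK_FILES"] = "/opt/work/python-deps.zip"
conf = create_spark_conf()
print(conf.get("spark.submit.pyFiles"))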
def _get_bigdl_jar_name_on_driver(self):
    from bigdl.util.engine import get_bigdl_classpath
    bigdl_classpath = get_bigdl_classpath()
    assert bigdl_classpath, "Cannot find bigdl classpath"
    return bigdl_classpath.split("/")[-1]
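# Portability note (a sketch, not from the source): splitting on "/" assumes a
# POSIX-style path; os.path.basename yields the same jar file name portably.
import os
from bigdl.util.engine import get_bigdl_classpath

def bigdl_jar_name():
    return os.path.basename(get_bigdl_classpath())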
def init_spark_standalone(self, num_executors, executor_cores, executor_memory="10g",
                          driver_memory="1g", driver_cores=4, master=None,
                          extra_executor_memory_for_ray=None, extra_python_lib=None,
                          conf=None, jars=None):
    import os
    import subprocess
    import pyspark
    from zoo.util.utils import get_node_ip
    from zoo.util.engine import get_analytics_zoo_classpath
    from bigdl.util.engine import get_bigdl_classpath

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
    if not master:
        pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
        zoo_standalone_home = os.path.abspath(__file__ + "/../../share/bin/standalone")
        node_ip = get_node_ip()
        SparkRunner.standalone_env = {
            "SPARK_HOME": pyspark_home,
            "ZOO_STANDALONE_HOME": zoo_standalone_home,
            # If this is not set, the master defaults to the hostname rather than the ip.
            "SPARK_MASTER_HOST": node_ip}
        # The scripts installed from pip don't have execution permission
        # and need to be given permission first.
        pro = subprocess.Popen(["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
        os.waitpid(pro.pid, 0)
        # Start master
        start_master_pro = subprocess.Popen(
            "{}/sbin/start-master.sh".format(zoo_standalone_home),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_master_pro.pid, 0)
        master = "spark://{}:7077".format(node_ip)  # 7077 is the default port
        # Start worker
        start_worker_pro = subprocess.Popen(
            "{}/sbin/start-worker.sh {}".format(zoo_standalone_home, master),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_worker_pro.pid, 0)
    else:
        # A Spark standalone cluster has already been started by the user.
        assert master.startswith("spark://"), \
            "Please input a valid master address for your Spark standalone cluster: " \
            "spark://master:port"

    # Start pyspark-shell
    submit_args = " --master " + master
    submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                " --executor-cores {} --executor-memory {}" \
        .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
    if extra_python_lib:
        submit_args = submit_args + " --py-files {}".format(extra_python_lib)
    if jars:
        submit_args = submit_args + " --jars {}".format(jars)
    submit_args = submit_args + " pyspark-shell"
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

    zoo_bigdl_jar_path = ":".join([get_analytics_zoo_classpath(), get_bigdl_classpath()])
    spark_conf = init_spark_conf(conf) \
        .set("spark.driver.cores", driver_cores) \
        .set("spark.driver.memory", driver_memory) \
        .set("spark.executor.instances", num_executors) \
        .set("spark.executor.cores", executor_cores) \
        .set("spark.cores.max", num_executors * executor_cores) \
        .set("spark.executorEnv.PYTHONHOME",
             "/".join(self._detect_python_location().split("/")[:-2]))
    if extra_executor_memory_for_ray:
        spark_conf.set("spark.executor.memoryOverhead", extra_executor_memory_for_ray)
    if spark_conf.contains("spark.executor.extraClassPath"):
        # Prepend the Zoo and BigDL jars to the executor classpath already set
        # on spark_conf.
        spark_conf.set("spark.executor.extraClassPath",
                       "{}:{}".format(zoo_bigdl_jar_path,
                                      spark_conf.get("spark.executor.extraClassPath")))
    else:
        spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

    sc = init_nncontext(spark_conf, redirect_spark_log=self.redirect_spark_log)
    sc.setLogLevel(self.spark_log_level)
    return sc
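# Usage sketch (assumptions: SparkRunner is the enclosing class, as `self`
# implies, and its constructor takes no required arguments; the resource sizes
# below are illustrative, not from the source):
runner = SparkRunner()
sc = runner.init_spark_standalone(num_executors=2, executor_cores=4,
                                  executor_memory="4g")
print(sc.master)  # spark://<node_ip>:7077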