def __init__(self, hosts=None, processes_per_node=1, env=None):
    """
    Resolve the hosts to run on and store the per-node settings.

    :param hosts: None to run on the driver node only, the string "all" to run on
           every node that hosts a Spark executor, or a list of host addresses
           (assumed to contain no duplicates).
    :param processes_per_node: Stored unchanged on the instance. Default is 1.
    :param env: Stored on the instance; an empty dict is used when it is falsy.
    """
    driver_ip = get_node_ip()
    if hosts is None:
        # Single node: only the driver.
        resolved_hosts = [driver_ip]
    elif hosts == "all":
        # All executor nodes in the cluster: run one barrier task per core and
        # de-duplicate the ips the tasks report.
        from bigdl.util.common import get_node_and_core_number
        from zoo.orca import OrcaContext

        def get_ip(iter):
            yield get_node_ip()

        sc = OrcaContext.get_spark_context()
        node_num, core_num = get_node_and_core_number()
        total_cores = node_num * core_num
        ip_rdd = sc.range(0, total_cores, numSlices=total_cores) \
            .barrier() \
            .mapPartitions(get_ip)
        resolved_hosts = list(set(ip_rdd.collect()))
    else:
        # User specified hosts, assumed to be non-duplicate.
        assert isinstance(hosts, list)
        resolved_hosts = hosts

    self.hosts = resolved_hosts
    # The first host in the list acts as the master.
    self.master = self.hosts[0]
    print("Master: ", self.master)
    self.remote_hosts = [host for host in self.hosts if host != driver_ip]
    print("Remote hosts: ", self.remote_hosts)
    print("Hosts: ", self.hosts)
    self.processes_per_node = processes_per_node
    self.env = env or {}
def _start_raylets(iter):
    """
    Start a raylet on the current node, unless this task is the one that has to
    make room for a ray master on the same node, and yield the process info.

    Yields None when no raylet is started by this task.
    """
    from zoo.util.utils import get_node_ip

    node_ip = get_node_ip()
    master_ip = redis_address.split(":")[0]
    tmp_dir = tempfile.gettempdir()
    master_flag = os.path.join(tmp_dir, self.ray_master_flag)

    # If there is already a ray master on this node, we need to start one less raylet:
    # the first task that takes the lock and finds no flag creates it and skips its raylet.
    skip_raylet = False
    if node_ip == master_ip:
        master_lock = os.path.join(tmp_dir, self.ray_master_lock)
        with filelock.FileLock(master_lock):
            if not os.path.exists(master_flag):
                os.mknod(master_flag)
                skip_raylet = True

    raylet_info = None
    if not skip_raylet:
        # Hold the raylet lock so that raylets on one node are started one at a time.
        raylet_lock = os.path.join(tmp_dir, self.raylet_lock)
        with filelock.FileLock(raylet_lock):
            raylet_command = RayServiceFuncGenerator._get_raylet_command(
                redis_address=redis_address,
                ray_exec=self.ray_exec,
                password=self.password,
                ray_node_cpu_cores=self.ray_node_cpu_cores,
                labels=self.labels,
                object_store_memory=self.object_store_memory,
                extra_params=self.extra_params)
            raylet_info = self._start_ray_node(command=raylet_command, tag="raylet")
            kill_redundant_log_monitors(redis_address=redis_address)

    # Cannot remove ray_master_flag at the end of this task since no barrier is guaranteed.
    yield raylet_info
def shutdown_plasma(self):
    """
    Kill the plasma processes on every host in ``self.hosts``.

    Remote hosts are reached through ``ssh root@<host>``; the local node runs
    ``pkill`` directly. Each command is waited on before the next host is handled.
    """
    # The local address does not change while looping, so look it up only once.
    local_ip = get_node_ip()
    for host in self.hosts:
        if host != local_ip:
            kill_command = ["ssh", "root@{}".format(host), "pkill plasma"]
        else:
            kill_command = ["pkill", "plasma"]
        # Popen.wait() reaps the child and records its return code on the Popen
        # object, which a raw os.waitpid() on the pid does not do.
        subprocess.Popen(kill_command).wait()
def _start_ray_master(index, iter):
    """
    Start the ray master in partition 0 and yield its process info.

    The yielded object carries the redis address in ``master_addr``. Every other
    partition yields None.
    """
    from zoo.util.utils import get_node_ip

    master_info = None
    if index == 0:
        print("partition id is : {}".format(index))
        node_ip = get_node_ip()
        print("master address {}".format(node_ip))
        master_redis_address = "{}:{}".format(node_ip, self.redis_port)
        master_command = self._gen_master_command()
        master_info = self._start_ray_node(command=master_command, tag="ray-master")
        master_info.master_addr = master_redis_address
    yield master_info
def _start_ray_services(iter):
    """
    Start either the ray master or a raylet in this barrier task and yield the
    resulting process info.

    One task on the master node starts the ray master; all tasks then meet at a
    barrier, and every task that did not start the master starts a raylet.
    The master flag file is removed before yielding.
    """
    import errno
    from pyspark import BarrierTaskContext
    from zoo.util.utils import get_node_ip

    tc = BarrierTaskContext.get()
    current_ip = get_node_ip()
    print("current address {}".format(current_ip))
    print("master address {}".format(master_ip))
    redis_address = "{}:{}".format(master_ip, self.redis_port)
    process_info = None
    base_path = tempfile.gettempdir()
    ray_master_flag_path = os.path.join(base_path, self.ray_master_flag)
    if current_ip == master_ip:
        # Start the ray master.
        # It is possible that multiple executors are on one node. In this case,
        # the first executor that gets the lock would be the master and it would
        # create a flag to indicate the master has initialized.
        # The flag file is removed when ray start processes finish so that this
        # won't affect other programs.
        ray_master_lock_path = os.path.join(base_path, self.ray_master_lock)
        with filelock.FileLock(ray_master_lock_path):
            if not os.path.exists(ray_master_flag_path):
                print("partition id is : {}".format(tc.partitionId()))
                process_info = self._start_ray_node(
                    command=self._gen_master_command(),
                    tag="ray-master")
                process_info.master_addr = redis_address
                os.mknod(ray_master_flag_path)

    # Every task waits here, so the master has been started before any raylet is.
    tc.barrier()
    if not process_info:
        # Start raylets.
        # Add a lock to avoid starting multiple raylets on one node at the same time.
        # See this issue: https://github.com/ray-project/ray/issues/10154
        raylet_lock_path = os.path.join(base_path, self.raylet_lock)
        with filelock.FileLock(raylet_lock_path):
            print("partition id is : {}".format(tc.partitionId()))
            process_info = self._start_ray_node(
                command=RayServiceFuncGenerator._get_raylet_command(
                    redis_address=redis_address,
                    ray_exec=self.ray_exec,
                    password=self.password,
                    ray_node_cpu_cores=self.ray_node_cpu_cores,
                    labels=self.labels,
                    object_store_memory=self.object_store_memory,
                    extra_params=self.extra_params),
                tag="raylet")
            kill_redundant_log_monitors(redis_address=redis_address)

    # Several tasks on the same node may reach this cleanup concurrently and it is
    # not protected by a lock, so an exists-then-remove check can fail when another
    # task deletes the flag in between. Remove directly and ignore only "no such file".
    try:
        os.remove(ray_master_flag_path)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
    yield process_info
def f(index, iterator):
    """
    Put all records of one partition into plasma under a pre-assigned ObjectID
    and yield ``(object_id, node_ip)``.

    :param index: Partition index, used to look up the target id in ``ids``.
    :param iterator: Records of the partition; they are collected into one list.
    """
    import pyarrow.plasma as plasma
    from zoo.util.utils import get_node_ip

    res = list(iterator)
    client = plasma.connect(object_store_address)
    # Always release the plasma connection, including when the lookup, the put
    # or the sanity check below raises.
    try:
        target_id = ids[index]
        # If the ObjectID exists in plasma, we assume a task trial
        # succeeds and the data is already in the object store.
        if not client.contains(target_id):
            object_id = client.put(res, target_id)
            assert object_id == target_id, \
                "Errors occurred when putting data into plasma object store"
    finally:
        client.disconnect()
    yield target_id, get_node_ip()
def f(index, iterator):
    """
    Split one partition into subpartitions of at most ``part_size`` records, put
    each processed subpartition into plasma and yield its metadata.

    Each yielded tuple is ``(index, sub_index, record_count, object_id, node_ip)``.
    Nothing is yielded for an empty partition.

    :param index: Partition index, passed through into every yielded tuple.
    :param iterator: Records of the partition.
    """
    import pyarrow.plasma as plasma

    part_size = 1000000  # TODO: Make subpartition size configurable?
    client = plasma.connect(address)
    # One try/finally replaces the per-branch disconnect calls and also releases
    # the connection when processing or the put raises.
    try:
        # The node ip is the same for every subpartition of this task.
        node_ip = get_node_ip()
        buffer = []
        sub_index = 0
        for record in iterator:
            if len(buffer) == part_size:
                # The buffer is full: flush it and start the next one with this record.
                object_id = client.put(process_records(buffer))
                buffer = [record]
                yield index, sub_index, part_size, object_id, node_ip
                sub_index += 1
            else:
                buffer.append(record)
        if buffer:
            # Flush whatever is left as the final (possibly smaller) subpartition.
            object_id = client.put(process_records(buffer))
            yield index, sub_index, len(buffer), object_id, node_ip
    finally:
        client.disconnect()
def launch_plasma(self, object_store_memory="2g"):
    """
    Launch a plasma store on every host in ``self.hosts`` and return the store address.

    Remote hosts are reached through ``ssh root@<host>``; the local node starts the
    process directly. ``shutdown_plasma`` is registered to run at interpreter exit.

    :param object_store_memory: Memory size converted with ``resource_to_bytes``.
           Default is "2g".
    :return: The object store address, "/tmp/analytics_zoo_plasma".
    """
    import atexit
    atexit.register(self.shutdown_plasma)
    # TODO: Or can use spark to launch plasma
    from zoo.ray.utils import resource_to_bytes

    # plasma_store is looked up next to the running python executable.
    python_dir_parts = sys.executable.split("/")[:-1]
    self.plasma_path = "/".join(python_dir_parts + ["plasma_store"])
    self.object_store_memory = resource_to_bytes(object_store_memory)
    self.object_store_address = "/tmp/analytics_zoo_plasma"
    command = "{} -m {} -s {}".format(
        self.plasma_path, self.object_store_memory, self.object_store_address)

    for host in self.hosts:
        if host == get_node_ip():
            launch_args = command.split()
        else:
            launch_args = ["ssh", "root@{}".format(host), command]
        # The store keeps running in the background, so it is not waited on.
        subprocess.Popen(launch_args)
        print("Plasma launched on {}".format(host))
    return self.object_store_address
def __init__(self, meta_data, object_store_address, workers_per_node=1, batch_size=1):
    """
    Connect to plasma, pick the subpartitions this worker is responsible for and
    load the first one.

    :param meta_data: Subpartition records. Index 2 of a record is read as its size,
           index 3 as its plasma object id and index 4 as the ip of its node.
    :param object_store_address: Address passed to ``plasma.connect``.
    :param workers_per_node: Number of workers that share the data of one node.
           Default is 1.
    :param batch_size: Stored on the instance. Default is 1.
    """
    import pyarrow.plasma as plasma
    self.client = plasma.connect(object_store_address)
    print("Connected to plasma")
    # All the subpartitions on this node; the node ip is looked up once rather
    # than once per record.
    node_ip = get_node_ip()
    all_data = [subpartition for subpartition in meta_data
                if subpartition[4] == node_ip]
    rank = int(os.environ.get("PMI_RANK", 0))
    print("Global rank: ", rank)
    # rank = int(os.environ.get("PMIX_RANK", 0))  # For OpenMPI
    local_rank = rank % workers_per_node
    print("Local rank: ", local_rank)

    # Every worker takes one contiguous run of `per_worker` subpartitions, and the
    # `leftover` (< workers_per_node) subpartitions after those runs go one each to
    # the lowest local ranks. Slicing directly keeps every subpartition assigned
    # even when the leftover is larger than `per_worker`, and avoids a zero chunk
    # size when there are fewer subpartitions than workers.
    # NOTE(review): a worker that ends up with no subpartition still calls
    # load_from_plasma(0) below - confirm that this is handled there.
    per_worker, leftover = divmod(len(all_data), workers_per_node)
    start = local_rank * per_worker
    worker_data = all_data[start:start + per_worker]
    if local_rank < leftover:
        worker_data.append(all_data[workers_per_node * per_worker + local_rank])

    self.object_ids = [subpartition[3] for subpartition in worker_data]
    self.sizes = [subpartition[2] for subpartition in worker_data]
    print("Data size for worker: ", sum(self.sizes))
    self.batch_size = batch_size

    # offsets[i] is the cumulative number of records up to and including subpartition i.
    offsets = []
    running_total = 0
    for size in self.sizes:
        running_total += size
        offsets.append(running_total)
    self.offsets = offsets
    self.current_index = 0  # Current index for object_id; data loaded
    self.load_from_plasma(self.current_index)
def __init__(self, hosts=None, processes_per_node=1, env=None):
    """
    Resolve the hosts to run on and store the per-node settings.

    :param hosts: None to run on the driver node only, the string "all" to run on
           every node that hosts a Spark executor, or a list of host addresses
           (assumed to contain no duplicates).
    :param processes_per_node: Stored unchanged on the instance. Default is 1.
    :param env: Stored on the instance; an empty dict is used when it is falsy.
    """
    driver_ip = get_node_ip()
    if hosts is None:
        # Single node: only the driver.
        resolved_hosts = [driver_ip]
    elif hosts == "all":
        # All executor nodes in the cluster: run one barrier task per executor
        # and de-duplicate the ips the tasks report.
        def get_ip(iter):
            yield get_node_ip()

        from zoo.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        master = sc.getConf().get("spark.master")
        is_local = master == "local" or master.startswith("local[")
        if is_local:
            num_executors = 1
        else:
            num_executors = int(sc.getConf().get("spark.executor.instances"))
        ip_rdd = sc.range(0, num_executors, numSlices=num_executors) \
            .barrier() \
            .mapPartitions(get_ip)
        resolved_hosts = list(set(ip_rdd.collect()))
    else:
        # User specified hosts, assumed to be non-duplicate.
        assert isinstance(hosts, list)
        resolved_hosts = hosts

    self.hosts = resolved_hosts
    # The first host in the list acts as the master.
    self.master = self.hosts[0]
    print("Master: ", self.master)
    self.remote_hosts = [host for host in self.hosts if host != driver_ip]
    print("Remote hosts: ", self.remote_hosts)
    print("Hosts: ", self.hosts)
    self.processes_per_node = processes_per_node
    self.env = env or {}
def init_spark_standalone(self, num_executors, executor_cores, executor_memory="2g",
                          driver_cores=4, driver_memory="1g", master=None,
                          extra_executor_memory_for_ray=None, extra_python_lib=None,
                          conf=None, jars=None, python_location=None,
                          enable_numa_binding=False):
    """
    Create a SparkContext on a Spark standalone cluster, starting a local master
    and worker first when no master address is given.

    :param num_executors: Number of executors; also used for "spark.cores.max" and,
           with numa binding, for SPARK_WORKER_INSTANCES.
    :param executor_cores: Number of cores per executor.
    :param executor_memory: Memory per executor. Default is "2g".
    :param driver_cores: Number of driver cores. Default is 4.
    :param driver_memory: Driver memory. Default is "1g".
    :param master: Address of an existing standalone master, which must start with
           "spark://". When falsy, a master and a worker are started on this node.
    :param extra_executor_memory_for_ray: Forwarded to ``enrich_conf_for_spark``.
    :param extra_python_lib: Forwarded to ``gen_submit_args``.
    :param conf: Extra Spark configuration, forwarded to ``enrich_conf_for_spark``.
    :param jars: Forwarded to ``gen_submit_args``.
    :param python_location: Value for PYSPARK_PYTHON when that variable is not set;
           the detected python location is used when this is falsy.
    :param enable_numa_binding: Whether to start the worker with the numactl script.
    :return: The object returned by ``self.create_sc``.
    """
    import subprocess
    import pyspark
    from zoo.util.utils import get_node_ip

    if "PYSPARK_PYTHON" not in os.environ:
        if python_location:
            os.environ["PYSPARK_PYTHON"] = python_location
        else:
            os.environ["PYSPARK_PYTHON"] = detect_python_location()

    if master:
        # A Spark standalone cluster has already been started by the user.
        assert master.startswith("spark://"), \
            "Please input a valid master address for your Spark standalone cluster: " \
            "spark://master:port"
    else:
        pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
        zoo_standalone_home = os.path.abspath(
            __file__ + "/../../share/bin/standalone")
        node_ip = get_node_ip()
        SparkRunner.standalone_env = {
            "SPARK_HOME": pyspark_home,
            "ZOO_STANDALONE_HOME": zoo_standalone_home,
            # If not set this, by default master is hostname but not ip,
            "SPARK_MASTER_HOST": node_ip
        }
        if 'JAVA_HOME' in os.environ:
            SparkRunner.standalone_env["JAVA_HOME"] = os.environ["JAVA_HOME"]

        # The scripts installed from pip don't have execution permission
        # and need to first give them permission.
        sbin_dir = "{}/sbin".format(zoo_standalone_home)
        chmod_proc = subprocess.Popen(["chmod", "-R", "+x", sbin_dir])
        os.waitpid(chmod_proc.pid, 0)

        # Start master
        master_proc = subprocess.Popen(
            "{}/sbin/start-master.sh".format(zoo_standalone_home),
            shell=True, env=SparkRunner.standalone_env)
        _, master_status = os.waitpid(master_proc.pid, 0)
        if master_status != 0:
            raise RuntimeError("starting master failed")
        master = "spark://{}:7077".format(node_ip)  # 7077 is the default port

        # Start worker
        if enable_numa_binding:
            worker_script = "start-worker-with-numactl.sh"
            SparkRunner.standalone_env["SPARK_WORKER_INSTANCES"] = str(num_executors)
        else:
            worker_script = "start-worker.sh"
        worker_command = "{}/sbin/{} {}".format(
            zoo_standalone_home, worker_script, master)
        worker_proc = subprocess.Popen(
            worker_command, shell=True, env=SparkRunner.standalone_env)
        _, worker_status = os.waitpid(worker_proc.pid, 0)
        if worker_status != 0:
            raise RuntimeError("starting worker failed")

    # Start pyspark-shell
    submit_args = "--master " + master
    submit_args = submit_args + gen_submit_args(
        driver_cores, driver_memory, num_executors, executor_cores,
        executor_memory, extra_python_lib, jars)

    conf = enrich_conf_for_spark(conf, driver_cores, driver_memory, num_executors,
                                 executor_cores, executor_memory,
                                 extra_executor_memory_for_ray)
    standalone_settings = {
        "spark.cores.max": num_executors * executor_cores,
        "spark.executorEnv.PYTHONHOME": "/".join(detect_python_location().split("/")[:-2])
    }
    conf.update(standalone_settings)

    # The zoo/bigdl jars go in front of any executor class path that is already configured.
    class_path_key = "spark.executor.extraClassPath"
    zoo_bigdl_jar_path = ":".join(get_zoo_bigdl_classpath_on_driver())
    executor_class_path = zoo_bigdl_jar_path
    if class_path_key in conf:
        executor_class_path = "{}:{}".format(zoo_bigdl_jar_path, conf[class_path_key])
    conf[class_path_key] = executor_class_path

    sc = self.create_sc(submit_args, conf)
    return sc
def init_spark_standalone(self, num_executors, executor_cores, executor_memory="10g",
                          driver_memory="1g", driver_cores=4, master=None,
                          extra_executor_memory_for_ray=None, extra_python_lib=None,
                          conf=None, jars=None):
    """
    Create a SparkContext on a Spark standalone cluster, starting a local master
    and worker first when no master address is given.

    :param num_executors: Number of executors; also used for "spark.cores.max".
    :param executor_cores: Number of cores per executor.
    :param executor_memory: Memory per executor. Default is "10g".
    :param driver_memory: Driver memory. Default is "1g".
    :param driver_cores: Number of driver cores. Default is 4.
    :param master: Address of an existing standalone master, which must start with
           "spark://". When falsy, a master and a worker are started on this node.
    :param extra_executor_memory_for_ray: When truthy, set as
           "spark.executor.memoryOverhead".
    :param extra_python_lib: When truthy, added to the submit args with --py-files.
    :param conf: Extra Spark configuration, passed to ``init_spark_conf``.
    :param jars: When truthy, added to the submit args with --jars.
    :return: The SparkContext returned by ``init_nncontext``.
    """
    import subprocess
    import pyspark
    from zoo.util.utils import get_node_ip
    from zoo.util.engine import get_analytics_zoo_classpath
    from bigdl.util.engine import get_bigdl_classpath

    if 'PYSPARK_PYTHON' not in os.environ:
        os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
    if not master:
        pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
        zoo_standalone_home = os.path.abspath(
            __file__ + "/../../share/bin/standalone")
        node_ip = get_node_ip()
        SparkRunner.standalone_env = {
            "SPARK_HOME": pyspark_home,
            "ZOO_STANDALONE_HOME": zoo_standalone_home,
            # If not set this, by default master is hostname but not ip,
            "SPARK_MASTER_HOST": node_ip
        }
        # The scripts installed from pip don't have execution permission
        # and need to first give them permission.
        pro = subprocess.Popen(
            ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
        os.waitpid(pro.pid, 0)
        # Start master
        start_master_pro = subprocess.Popen(
            "{}/sbin/start-master.sh".format(zoo_standalone_home),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_master_pro.pid, 0)
        master = "spark://{}:7077".format(node_ip)  # 7077 is the default port
        # Start worker
        start_worker_pro = subprocess.Popen(
            "{}/sbin/start-worker.sh {}".format(zoo_standalone_home, master),
            shell=True, env=SparkRunner.standalone_env)
        os.waitpid(start_worker_pro.pid, 0)
    else:
        # A Spark standalone cluster has already been started by the user.
        assert master.startswith("spark://"), \
            "Please input a valid master address for your Spark standalone cluster: " \
            "spark://master:port"

    # Start pyspark-shell
    submit_args = " --master " + master
    submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                " --executor-cores {} --executor-memory {}"\
        .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
    if extra_python_lib:
        submit_args = submit_args + " --py-files {}".format(extra_python_lib)
    if jars:
        submit_args = submit_args + " --jars {}".format(jars)
    submit_args = submit_args + " pyspark-shell"
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

    zoo_bigdl_jar_path = ":".join(
        [get_analytics_zoo_classpath(), get_bigdl_classpath()])
    spark_conf = init_spark_conf(conf) \
        .set("spark.driver.cores", driver_cores) \
        .set("spark.driver.memory", driver_memory) \
        .set("spark.executor.instances", num_executors) \
        .set("spark.executor.cores", executor_cores) \
        .set("spark.cores.max", num_executors * executor_cores) \
        .set("spark.executorEnv.PYTHONHOME",
             "/".join(self._detect_python_location().split("/")[:-2]))
    if extra_executor_memory_for_ray:
        spark_conf.set("spark.executor.memoryOverhead",
                       extra_executor_memory_for_ray)
    if spark_conf.contains("spark.executor.extraClassPath"):
        # Read the existing value from spark_conf, the object that was just checked.
        # The user's `conf` argument may be None or may not hold this key even
        # though spark_conf does.
        spark_conf.set(
            "spark.executor.extraClassPath",
            "{}:{}".format(zoo_bigdl_jar_path,
                           spark_conf.get("spark.executor.extraClassPath")))
    else:
        spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

    sc = init_nncontext(spark_conf, redirect_spark_log=self.redirect_spark_log)
    sc.setLogLevel(self.spark_log_level)
    return sc
def info_fn(iter):
    """
    Yield the ip of the node this function runs on, exactly once.

    The input iterator is not consumed.
    """
    from zoo.util.utils import get_node_ip

    node_ip = get_node_ip()
    yield node_ip
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import argparse import cloudpickle from zoo.util.utils import get_node_ip print("Worker on {} with global rank {}".format(get_node_ip(), os.environ.get("PMI_RANK", 0))) parser = argparse.ArgumentParser() parser.add_argument('--pkl_path', type=str, default="", help='The directory of the pkl files for mpi training.') args = parser.parse_args() pkl_path = args.pkl_path with open("{}/saved_mpi_estimator.pkl".format(pkl_path), "rb") as f: model_creator, optimizer_creator, loss_creator, metrics, \ scheduler_creator, config, init_func = cloudpickle.load(f) with open("{}/mpi_train_data.pkl".format(pkl_path), "rb") as f:
def get_ip(iter):
    """
    Yield the ip of the node this function runs on, exactly once.

    The input iterator is not consumed.
    """
    node_ip = get_node_ip()
    yield node_ip