Example #1
    def __init__(self,
                 hosts=None,
                 processes_per_node=1,
                 env=None):
        driver_ip = get_node_ip()
        if hosts is None:  # Single node
            self.hosts = [driver_ip]
        elif hosts == "all":  # All executor nodes in the cluster
            def get_ip(iter):
                yield get_node_ip()

            from bigdl.util.common import get_node_and_core_number
            from zoo.orca import OrcaContext
            sc = OrcaContext.get_spark_context()
            node_num, core_num = get_node_and_core_number()
            total_cores = node_num * core_num
            self.hosts = list(set(sc.range(0, total_cores, numSlices=total_cores).barrier()
                                  .mapPartitions(get_ip).collect()))
        else:  # User specified hosts, assumed to be non-duplicate
            assert isinstance(hosts, list)
            self.hosts = hosts

        self.master = self.hosts[0]
        print("Master: ", self.master)
        self.remote_hosts = []
        for host in self.hosts:
            if host != driver_ip:
                self.remote_hosts.append(host)
        print("Remote hosts: ", self.remote_hosts)
        print("Hosts: ", self.hosts)
        self.processes_per_node = processes_per_node
        self.env = env if env else {}
Example #2
        def _start_raylets(iter):
            from zoo.util.utils import get_node_ip
            current_ip = get_node_ip()
            master_ip = redis_address.split(":")[0]
            do_start = True
            process_info = None
            base_path = tempfile.gettempdir()
            ray_master_flag_path = os.path.join(base_path,
                                                self.ray_master_flag)
            # If there is already a ray master on this node, we need to start one less raylet.
            if current_ip == master_ip:
                ray_master_lock_path = os.path.join(base_path,
                                                    self.ray_master_lock)
                with filelock.FileLock(ray_master_lock_path):
                    if not os.path.exists(ray_master_flag_path):
                        os.mknod(ray_master_flag_path)
                        do_start = False
            if do_start:
                raylet_lock_path = os.path.join(base_path, self.raylet_lock)
                with filelock.FileLock(raylet_lock_path):
                    process_info = self._start_ray_node(
                        command=RayServiceFuncGenerator._get_raylet_command(
                            redis_address=redis_address,
                            ray_exec=self.ray_exec,
                            password=self.password,
                            ray_node_cpu_cores=self.ray_node_cpu_cores,
                            labels=self.labels,
                            object_store_memory=self.object_store_memory,
                            extra_params=self.extra_params),
                        tag="raylet")
                    kill_redundant_log_monitors(redis_address=redis_address)
            # Cannot remove ray_master_flag at the end of this task since no barrier is guaranteed.

            yield process_info
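
The driver-side call is not part of this excerpt; judging from the barrier pattern in Examples #1 and #10, _start_raylets is presumably dispatched as a barrier stage with one task per node, roughly along these lines (sc and num_nodes are assumed names, a sketch rather than the project's actual code):

    # Run _start_raylets on every node in a single barrier stage and
    # collect the resulting process_info objects on the driver.
    process_infos = sc.range(0, num_nodes, numSlices=num_nodes) \
        .barrier() \
        .mapPartitions(_start_raylets) \
        .collect()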
Example #3
 def shutdown_plasma(self):
     for host in self.hosts:
         if host != get_node_ip():
             p = subprocess.Popen(["ssh", "root@{}".format(host), "pkill plasma"])
         else:
             p = subprocess.Popen(["pkill", "plasma"])
         os.waitpid(p.pid, 0)
Example #4
 def _start_ray_master(index, iter):
     from zoo.util.utils import get_node_ip
     process_info = None
     if index == 0:
         print("partition id is : {}".format(index))
         current_ip = get_node_ip()
         print("master address {}".format(current_ip))
         redis_address = "{}:{}".format(current_ip, self.redis_port)
         process_info = self._start_ray_node(
             command=self._gen_master_command(), tag="ray-master")
         process_info.master_addr = redis_address
     yield process_info
Example #5
        def _start_ray_services(iter):
            from pyspark import BarrierTaskContext
            from zoo.util.utils import get_node_ip
            tc = BarrierTaskContext.get()
            current_ip = get_node_ip()
            print("current address {}".format(current_ip))
            print("master address {}".format(master_ip))
            redis_address = "{}:{}".format(master_ip, self.redis_port)
            process_info = None
            base_path = tempfile.gettempdir()
            ray_master_flag_path = os.path.join(base_path,
                                                self.ray_master_flag)
            if current_ip == master_ip:  # Start the ray master.
                # It is possible that multiple executors are on one node. In this case,
                # the first executor that gets the lock would be the master and it would
                # create a flag to indicate the master has initialized.
                # The flag file is removed when ray start processes finish so that this
                # won't affect other programs.
                ray_master_lock_path = os.path.join(base_path,
                                                    self.ray_master_lock)
                with filelock.FileLock(ray_master_lock_path):
                    if not os.path.exists(ray_master_flag_path):
                        print("partition id is : {}".format(tc.partitionId()))
                        process_info = self._start_ray_node(
                            command=self._gen_master_command(),
                            tag="ray-master")
                        process_info.master_addr = redis_address
                        os.mknod(ray_master_flag_path)

            tc.barrier()
            if not process_info:  # Start raylets.
                # Add a lock to avoid starting multiple raylets on one node at the same time.
                # See this issue: https://github.com/ray-project/ray/issues/10154
                raylet_lock_path = os.path.join(base_path, self.raylet_lock)
                with filelock.FileLock(raylet_lock_path):
                    print("partition id is : {}".format(tc.partitionId()))
                    process_info = self._start_ray_node(
                        command=RayServiceFuncGenerator._get_raylet_command(
                            redis_address=redis_address,
                            ray_exec=self.ray_exec,
                            password=self.password,
                            ray_node_cpu_cores=self.ray_node_cpu_cores,
                            labels=self.labels,
                            object_store_memory=self.object_store_memory,
                            extra_params=self.extra_params),
                        tag="raylet")
                    kill_redundant_log_monitors(redis_address=redis_address)

            if os.path.exists(ray_master_flag_path):
                os.remove(ray_master_flag_path)
            yield process_info
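
The one-process-per-node coordination above relies on the filelock package combined with a flag file. A minimal standalone illustration of the same idiom (file names are placeholders, not the ones used by the project):

    import os
    import tempfile
    import filelock

    flag_path = os.path.join(tempfile.gettempdir(), "master_started.flag")  # placeholder
    lock_path = os.path.join(tempfile.gettempdir(), "master_started.lock")  # placeholder
    with filelock.FileLock(lock_path):      # only one process on this node enters at a time
        if not os.path.exists(flag_path):
            os.mknod(flag_path)             # the first process marks the work as claimed
            print("this process performs the once-per-node work")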
Example #6
 def f(index, iterator):
     import pyarrow.plasma as plasma
     from zoo.util.utils import get_node_ip
     res = list(iterator)
     client = plasma.connect(object_store_address)
     target_id = ids[index]
     # If the ObjectID already exists in plasma, we assume a previous task attempt
     # succeeded and the data is already in the object store.
     if not client.contains(target_id):
         object_id = client.put(res, target_id)
         assert object_id == target_id, \
             "Errors occurred when putting data into plasma object store"
     client.disconnect()
     yield target_id, get_node_ip()
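
The partition stored above can later be retrieved from the same object store using the yielded ObjectID; a minimal sketch, assuming the same object_store_address and target_id as in the snippet:

    import pyarrow.plasma as plasma

    client = plasma.connect(object_store_address)
    data = client.get(target_id)   # the list that the task put into the store
    client.disconnect()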
Example #7
 def f(index, iterator):
     import pyarrow.plasma as plasma
     client = plasma.connect(address)
     part_size = 1000000  # TODO: Make subpartition size configurable?
     buffer = []
     sub_index = 0
     for record in iterator:
         if len(buffer) == part_size:
             res_buffer = process_records(buffer)
             object_id = client.put(res_buffer)
             buffer = [record]
             yield index, sub_index, part_size, object_id, get_node_ip()
             sub_index += 1
         else:
             buffer.append(record)
     remain_size = len(buffer)
     if remain_size > 0:
         res_buffer = process_records(buffer)
         object_id = client.put(res_buffer)
         buffer = []
         client.disconnect()
         yield index, sub_index, remain_size, object_id, get_node_ip()
     else:
         client.disconnect()
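
The (index, iterator) signature of f matches RDD.mapPartitionsWithIndex. The driver call is not part of this excerpt, but the yielded tuples are presumably gathered roughly like this (rdd is an assumed existing RDD of records):

    # Each tuple is (partition index, sub-partition index, size, ObjectID, node ip),
    # which is the meta_data format consumed in Example #9.
    meta_data = rdd.mapPartitionsWithIndex(f).collect()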
Example #8
 def launch_plasma(self, object_store_memory="2g"):
     import atexit
     atexit.register(self.shutdown_plasma)
     # TODO: Alternatively, Spark could be used to launch plasma
     from zoo.ray.utils import resource_to_bytes
     self.plasma_path = "/".join(sys.executable.split("/")[:-1] + ["plasma_store"])
     self.object_store_memory = resource_to_bytes(object_store_memory)
     self.object_store_address = "/tmp/analytics_zoo_plasma"
     command = "{} -m {} -s {}".format(
         self.plasma_path, self.object_store_memory, self.object_store_address)
     for host in self.hosts:
         if host != get_node_ip():
             p = subprocess.Popen(["ssh", "root@{}".format(host), command])
         else:
             p = subprocess.Popen(command.split())
         print("Plasma launched on {}".format(host))
     return self.object_store_address
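
Once launch_plasma returns, any process on a participating node can attach to the store through the returned socket path; a minimal sketch:

    import pyarrow.plasma as plasma

    client = plasma.connect("/tmp/analytics_zoo_plasma")  # the address returned above
    object_id = client.put([1, 2, 3])                     # store a small test object
    print(client.get(object_id))                          # -> [1, 2, 3]
    client.disconnect()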
Example #9
    def __init__(self,
                 meta_data,
                 object_store_address,
                 workers_per_node=1,
                 batch_size=1):
        import pyarrow.plasma as plasma
        self.client = plasma.connect(object_store_address)
        print("Connected to plasma")

        # All the subpartitions on this node
        all_data = [
            subpartition for subpartition in meta_data
            if subpartition[4] == get_node_ip()
        ]
        rank = int(os.environ.get("PMI_RANK", 0))
        # rank = int(os.environ.get("PMIX_RANK", 0))  # For OpenMPI
        print("Global rank: ", rank)
        local_rank = rank % workers_per_node
        print("Local rank: ", local_rank)
        data_splits = list(chunks(all_data, len(all_data) // workers_per_node))
        worker_data = data_splits[local_rank]
        # Can't evenly split among workers
        if len(data_splits) == workers_per_node + 1:
            remain_data = data_splits[-1]
            if local_rank < len(remain_data):
                worker_data += [remain_data[local_rank]]
        self.object_ids = [subpartition[3] for subpartition in worker_data]
        self.sizes = [subpartition[2] for subpartition in worker_data]
        print("Data size for worker: ", sum(self.sizes))
        self.batch_size = batch_size
        offsets = []
        for i in self.sizes:
            if len(offsets) == 0:
                offsets.append(i)
            else:
                offsets.append(offsets[-1] + i)
        self.offsets = offsets
        self.current_index = 0  # Index of the ObjectID whose data is currently loaded
        self.load_from_plasma(self.current_index)
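
The chunks helper is not part of this excerpt. A common implementation that is consistent with how it is used above (splitting a list into pieces of a given size, with a possibly smaller final piece) would look roughly like this, though the project's actual helper may differ:

    def chunks(lst, n):
        # Yield successive n-sized chunks from lst; the last chunk may be shorter.
        for i in range(0, len(lst), n):
            yield lst[i:i + n]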
Example #10
    def __init__(self, hosts=None, processes_per_node=1, env=None):
        driver_ip = get_node_ip()
        if hosts is None:  # Single node
            self.hosts = [driver_ip]
        elif hosts == "all":  # All executor nodes in the cluster

            def get_ip(iter):
                yield get_node_ip()

            from zoo.orca import OrcaContext
            sc = OrcaContext.get_spark_context()
            master = sc.getConf().get("spark.master")
            if master == "local" or master.startswith("local["):
                num_executors = 1
            else:
                num_executors = int(
                    sc.getConf().get("spark.executor.instances"))
            self.hosts = list(
                set(
                    sc.range(0, num_executors,
                             numSlices=num_executors).barrier().mapPartitions(
                                 get_ip).collect()))
        else:  # User specified hosts, assumed to be non-duplicate
            assert isinstance(hosts, list)
            self.hosts = hosts

        self.master = self.hosts[0]
        print("Master: ", self.master)
        self.remote_hosts = []
        for host in self.hosts:
            if host != driver_ip:
                self.remote_hosts.append(host)
        print("Remote hosts: ", self.remote_hosts)
        print("Hosts: ", self.hosts)
        self.processes_per_node = processes_per_node
        self.env = env if env else {}
Example #11
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="2g",
                              driver_cores=4,
                              driver_memory="1g",
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None,
                              python_location=None,
                              enable_numa_binding=False):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip

        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip
            }
            if 'JAVA_HOME' in os.environ:
                SparkRunner.standalone_env["JAVA_HOME"] = os.environ[
                    "JAVA_HOME"]
            # The scripts installed from pip don't have execution permission,
            # so we need to grant it first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_master_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting master failed")
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            if enable_numa_binding:
                worker_script = "start-worker-with-numactl.sh"
                SparkRunner.standalone_env["SPARK_WORKER_INSTANCES"] = str(
                    num_executors)
            else:
                worker_script = "start-worker.sh"
            start_worker_pro = subprocess.Popen("{}/sbin/{} {}".format(
                zoo_standalone_home, worker_script, master),
                                                shell=True,
                                                env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_worker_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting worker failed")
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = "--master " + master
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory,
                                     num_executors, executor_cores,
                                     executor_memory,
                                     extra_executor_memory_for_ray)
        conf.update({
            "spark.cores.max":
            num_executors * executor_cores,
            "spark.executorEnv.PYTHONHOME":
            "/".join(detect_python_location().split("/")[:-2])
        })
        zoo_bigdl_jar_path = ":".join(list(
            get_zoo_bigdl_classpath_on_driver()))
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
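
A typical call, assuming a SparkRunner instance has been constructed elsewhere (its constructor is not shown in this excerpt, so the arguments below are illustrative only):

    runner = SparkRunner()  # assumed default construction
    sc = runner.init_spark_standalone(num_executors=2,
                                      executor_cores=4,
                                      executor_memory="2g")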
Example #12
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="10g",
                              driver_memory="1g",
                              driver_cores=4,
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip
        from zoo.util.engine import get_analytics_zoo_classpath
        from bigdl.util.engine import get_bigdl_classpath

        if 'PYSPARK_PYTHON' not in os.environ:
            os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip
            }
            # The scripts installed from pip don't have execution permission,
            # so we need to grant it first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_master_pro.pid, 0)
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            start_worker_pro = subprocess.Popen(
                "{}/sbin/start-worker.sh {}".format(zoo_standalone_home,
                                                    master),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_worker_pro.pid, 0)
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = " --master " + master
        submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                    " --executor-cores {} --executor-memory {}"\
            .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
        if extra_python_lib:
            submit_args = submit_args + " --py-files {}".format(
                extra_python_lib)
        if jars:
            submit_args = submit_args + " --jars {}".format(jars)
        submit_args = submit_args + " pyspark-shell"
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

        zoo_bigdl_jar_path = ":".join(
            [get_analytics_zoo_classpath(),
             get_bigdl_classpath()])
        spark_conf = init_spark_conf(conf) \
            .set("spark.driver.cores", driver_cores) \
            .set("spark.driver.memory", driver_memory) \
            .set("spark.executor.instances", num_executors) \
            .set("spark.executor.cores", executor_cores) \
            .set("spark.cores.max", num_executors * executor_cores) \
            .set("spark.executorEnv.PYTHONHOME",
                 "/".join(self._detect_python_location().split("/")[:-2]))
        if extra_executor_memory_for_ray:
            spark_conf.set("spark.executor.memoryOverhead",
                           extra_executor_memory_for_ray)
        if spark_conf.contains("spark.executor.extraClassPath"):
            spark_conf.set(
                "spark.executor.extraClassPath",
                "{}:{}".format(zoo_bigdl_jar_path,
                               conf.get("spark.executor.extraClassPath")))
        else:
            spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

        sc = init_nncontext(spark_conf,
                            redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        return sc
Example #13
 def info_fn(iter):
     from zoo.util.utils import get_node_ip
     yield get_node_ip()
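
Such a one-line mapPartitions function is collected on the driver the same way as in Example #1; a sketch assuming an existing SparkContext sc and an illustrative parallelism:

    total_parts = 4  # illustrative; Example #1 derives this from node and core counts
    ips = set(sc.range(0, total_parts, numSlices=total_parts)
                .barrier()
                .mapPartitions(info_fn)
                .collect())
    print(ips)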
Example #14
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import argparse
import cloudpickle
from zoo.util.utils import get_node_ip

print("Worker on {} with global rank {}".format(get_node_ip(),
                                                os.environ.get("PMI_RANK", 0)))

parser = argparse.ArgumentParser()
parser.add_argument('--pkl_path',
                    type=str,
                    default="",
                    help='The directory of the pkl files for mpi training.')
args = parser.parse_args()
pkl_path = args.pkl_path

with open("{}/saved_mpi_estimator.pkl".format(pkl_path), "rb") as f:
    model_creator, optimizer_creator, loss_creator, metrics, \
        scheduler_creator, config, init_func = cloudpickle.load(f)

with open("{}/mpi_train_data.pkl".format(pkl_path), "rb") as f:
Example #15
 def get_ip(iter):
     yield get_node_ip()