parser.add_argument('-n', '--num_workers', type=int, default=2, help='The number of workers to be launched.') parser.add_argument('-m', '--mode', type=str, default='gridrandom', choices=['gridrandom', 'skopt'], help='The search algorithm to use.') opt = parser.parse_args() if opt.hadoop_conf: assert opt.conda_name is not None, "conda_name must be specified for yarn mode" sc = init_spark_on_yarn(hadoop_conf=opt.hadoop_conf, conda_name=opt.conda_name, num_executors=opt.num_workers, executor_cores=opt.executor_cores) else: sc = init_spark_on_local(cores="*") ray_ctx = RayContext(sc=sc) ray_ctx.init() input_cols = [ "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", "CRSArrTime", "UniqueCarrier", "FlightNum",
weights = ray.get(ps.pull.remote(keys)) net.set_weights(keys, weights) # Compute an update and push it to the parameter server. xs, ys = mnist.train.next_batch(batch_size) gradients = net.compute_update(xs, ys) ps.push.remote(keys, gradients) if __name__ == "__main__": args = parser.parse_args() if args.hadoop_conf: sc = init_spark_on_yarn( hadoop_conf=args.hadoop_conf, conda_name=args.conda_name, num_executor=args.num_workers, executor_cores=args.executor_cores, executor_memory=args.executor_memory, driver_memory=args.driver_memory, driver_cores=args.driver_cores, extra_executor_memory_for_ray=args.extra_executor_memory_for_ray) ray_ctx = RayContext(sc=sc, object_store_memory=args.object_store_memory) else: sc = init_spark_on_local(cores=args.driver_cores) ray_ctx = RayContext(sc=sc, object_store_memory=args.object_store_memory) ray_ctx.init() # Create a parameter server with some random weights. net = model.SimpleCNN() all_keys, all_values = net.get_weights()
default=10, type=int, help="The number of rollouts to do per batch.") parser.add_argument("--iterations", default=-1, type=int, help="The number of model updates to perform. By " "default, training will not terminate.") args = parser.parse_args() if args.hadoop_conf: slave_num = 2 sc = init_spark_on_yarn(hadoop_conf=args.hadoop_conf, conda_name="ray36", num_executor=slave_num, executor_cores=28, executor_memory="10g", driver_memory="2g", driver_cores=4, extra_executor_memory_for_ray="30g") ray_ctx = RayContext(sc=sc, object_store_memory="25g") else: sc = init_spark_on_local(cores=4) ray_ctx = RayContext(sc=sc) ray_ctx.init() batch_size = args.batch_size # Run the reinforcement learning. running_reward = None batch_num = 1 model = {} # "Xavier" initialization.
def init_orca_context(cluster_mode="local", cores=2, memory="2g", num_nodes=1,
                      init_ray_on_spark=False, **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "k8s-client", "standalone" and "spark-submit". Default to be "local".
           The value is compared case-insensitively.

           For "spark-submit", you are supposed to use spark-submit to submit the application.
           In this case, please set the Spark configurations through command line options or
           the properties file. You need to use "spark-submit" for yarn-cluster or k8s-cluster
           mode. To make things easier, you are recommended to use the launch scripts we
           provide: https://github.com/intel-analytics/analytics-zoo/tree/master/scripts.

           For other cluster modes, you are recommended to install and run analytics-zoo
           through pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any. Only the keys recognised for the selected cluster mode
           (and the Ray-related keys) are forwarded; any other key is ignored.

    :return: An instance of SparkContext.
    :raises ValueError: If cluster_mode is "yarn-cluster" or "k8s-cluster" (use "spark-submit"
            instead), or if it is not one of the supported modes.
    :raises AssertionError: If num_nodes is not 1 in local mode, if no hadoop conf directory
            can be found or the interpreter is not inside a conda environment in yarn mode,
            or if master/container_image is missing in k8s mode.
    """
    print("Initializing orca context")
    cluster_mode = cluster_mode.lower()

    # Validate the mode up front so that a rejected mode does not leave an exit hook
    # registered for a context that was never created.
    if cluster_mode == "yarn-cluster":
        raise ValueError('For yarn-cluster mode, please set cluster_mode to "spark-submit" '
                         'and submit the application via spark-submit instead')
    if cluster_mode == "k8s-cluster":
        raise ValueError('For k8s-cluster mode, please set cluster_mode to "spark-submit" '
                         'and submit the application via spark-submit instead')
    is_yarn = cluster_mode.startswith("yarn")  # yarn or yarn-client
    is_k8s = cluster_mode.startswith("k8s")  # k8s or k8s-client
    if not (is_yarn or is_k8s or cluster_mode in ("spark-submit", "local", "standalone")):
        # The original message used "%s".format(...), which never interpolated the value.
        raise ValueError("cluster_mode can only be local, yarn-client, k8s-client, standalone "
                         "or spark-submit, but got: %s" % cluster_mode)

    import atexit
    atexit.register(stop_orca_context)

    def _forward(keys):
        # Pick out of kwargs only the entries the downstream initializer accepts.
        return {key: kwargs[key] for key in keys if key in kwargs}

    # Options shared by every cluster mode.
    spark_args = _forward(["conf", "spark_log_level", "redirect_spark_log"])

    if cluster_mode == "spark-submit":
        from zoo import init_nncontext
        sc = init_nncontext(**spark_args)
    elif cluster_mode == "local":
        assert num_nodes == 1, "For Spark local mode, num_nodes should be 1"
        os.environ["SPARK_DRIVER_MEMORY"] = memory
        spark_args.update(_forward(["python_location"]))
        from zoo import init_spark_on_local
        sc = init_spark_on_local(cores, **spark_args)
    elif is_yarn:
        # The HADOOP_CONF_DIR environment variable takes precedence over the
        # hadoop_conf keyword argument.
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs, \
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from zoo.util.utils import detect_python_location
        python_location = detect_python_location()  # /path/to/conda/envs/conda_name/bin/python
        assert "envs" in python_location, \
            "You must use a conda environment for yarn-client mode"
        spark_args.update(_forward(["driver_cores", "driver_memory",
                                    "extra_executor_memory_for_ray", "extra_python_lib",
                                    "penv_archive", "additional_archive", "hadoop_user_name",
                                    "spark_yarn_archive", "jars"]))
        from zoo import init_spark_on_yarn
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                # Third path component from the end is the conda env name.
                                conda_name=python_location.split("/")[-3],
                                num_executors=num_nodes, executor_cores=cores,
                                executor_memory=memory, **spark_args)
    elif is_k8s:
        assert "master" in kwargs, "Please specify master for k8s-client mode"
        assert "container_image" in kwargs, "Please specify container_image for k8s-client mode"
        spark_args.update(_forward(["driver_cores", "driver_memory",
                                    "extra_executor_memory_for_ray", "extra_python_lib",
                                    "jars", "python_location"]))
        from zoo import init_spark_on_k8s
        sc = init_spark_on_k8s(master=kwargs["master"],
                               container_image=kwargs["container_image"],
                               num_executors=num_nodes, executor_cores=cores,
                               executor_memory=memory, **spark_args)
    else:  # standalone; every other value was rejected above
        spark_args.update(_forward(["driver_cores", "driver_memory",
                                    "extra_executor_memory_for_ray", "extra_python_lib",
                                    "jars", "master", "python_location",
                                    "enable_numa_binding"]))
        from zoo import init_spark_standalone
        sc = init_spark_standalone(num_executors=num_nodes, executor_cores=cores,
                                   executor_memory=memory, **spark_args)

    # A RayContext is always constructed; Ray itself is only started on request.
    ray_args = _forward(["redis_port", "password", "object_store_memory", "verbose", "env",
                         "extra_params", "num_ray_nodes", "ray_node_cpu_cores"])
    from zoo.ray import RayContext
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        # 0 is the default number of driver cores handed to Ray.
        ray_ctx.init(driver_cores=kwargs.get("driver_cores", 0))
    return sc
np.random.seed(1337)  # for reproducibility


@ray.remote
class TestRay():
    """Ray actor that reports the hostname of the machine it runs on."""

    def hostname(self):
        import socket
        return socket.gethostname()


node_num = 4

sc = init_spark_on_yarn(
    hadoop_conf="/opt/work/hadoop-2.7.2/etc/hadoop/",
    conda_name="rayexample",
    num_executor=node_num,
    executor_cores=28,
    executor_memory="10g",
    driver_memory="2g",
    driver_cores=4,
    extra_executor_memory_for_ray="30g")

# Start Ray on the same SparkContext twice in a row (second round repeats the
# first with a smaller object store), printing the actors' hostnames each time.
for store_memory in ("2g", "1g"):
    ray_ctx = RayContext(sc=sc, object_store_memory=store_memory)
    ray_ctx.init()
    actors = [TestRay.remote() for _ in range(node_num)]
    print(ray.get([worker.hostname.remote() for worker in actors]))
    ray_ctx.stop()
# See the License for the specific language governing permissions and # limitations under the License. # import ray from zoo import init_spark_on_yarn from zoo.ray import RayContext slave_num = 2 sc = init_spark_on_yarn(hadoop_conf="/opt/work/almaren-yarn-config/", conda_name="ray_train", num_executors=slave_num, executor_cores=28, executor_memory="10g", driver_memory="2g", driver_cores=4, extra_executor_memory_for_ray="30g", conf={"hello": "world"}) ray_ctx = RayContext(sc=sc, object_store_memory="25g", extra_params={"temp-dir": "/tmp/hello/"}, env={ "http_proxy": "http://child-prc.intel.com:913", "http_proxys": "http://child-prc.intel.com:913" }) ray_ctx.init()
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import ray from zoo import init_spark_on_yarn from zoo.ray.util.raycontext import RayContext slave_num = 2 sc = init_spark_on_yarn(hadoop_conf="/opt/work/almaren-yarn-config/", conda_name="ray36-dev", num_executor=slave_num, executor_cores=28, executor_memory="10g", driver_memory="2g", driver_cores=4, extra_executor_memory_for_ray="30g") ray_ctx = RayContext(sc=sc, object_store_memory="25g", env={ "http_proxy": "http://child-prc.intel.com:913", "http_proxys": "http://child-prc.intel.com:913" }) ray_ctx.init() @ray.remote class TestRay():
help="Enables GPU training") parser.add_argument( "--tune", action="store_true", default=False, help="Tune training") args, _ = parser.parse_known_args() import ray #ray.init(redis_address=args.redis_address) if args.hadoop_conf: slave_num = args.num_replicas print("Slave num : " + str(slave_num)) sc = init_spark_on_yarn( hadoop_conf=args.hadoop_conf, conda_name="rayexample", num_executor=slave_num, executor_cores=88,#88 executor_memory="10g", driver_memory="5g", driver_cores=4, extra_executor_memory_for_ray="10g") print("Init spark success!") ray_ctx = RayContext(sc=sc,object_store_memory="10g") print("RayContext success!") ray_ctx.init() else: # sc = init_spark_on_local(cores=22) # ray_ctx = RayContext(sc=sc) ray.init() print("ray init") t_s = time.time()