def __init__(self, config, train_data, model_creator, loss_creator=None,
             train_resize_batch_num=None, eval_metrics_creator=None,
             test_data=None, validation_metrics_creator=None,
             num_workers=1, num_servers=None, runner_cores=None):
    """Spin up MXNet runner actors (workers and parameter servers) on Ray
    and trigger the distributed setup on every runner.

    :param config: dict of training configuration, forwarded to each runner.
    :param train_data: training data forwarded to each runner.
    :param model_creator: function that builds the MXNet model.
    :param loss_creator: optional function that builds the loss.
    :param train_resize_batch_num: optional resize batch count for training.
    :param eval_metrics_creator: optional function building train metrics.
    :param test_data: optional validation data.
    :param validation_metrics_creator: optional function building val metrics.
    :param num_workers: number of worker runners (default 1).
    :param num_servers: number of server runners; defaults to num_workers.
    :param runner_cores: CPU cores reserved per runner actor, if given.
    """
    self.config = config
    self.train_data = train_data
    self.test_data = test_data
    self.model_creator = model_creator
    self.loss_creator = loss_creator
    self.validation_metrics_creator = validation_metrics_creator
    self.eval_metrics_creator = eval_metrics_creator
    self.num_workers = num_workers
    # One server per worker unless the caller says otherwise.
    self.num_servers = num_servers if num_servers else self.num_workers
    self.train_resize_batch_num = train_resize_batch_num

    # Generate the actor classes. When runner_cores is specified, attach the
    # dummy custom resources _mxnet_worker / _mxnet_server so workers can be
    # told apart from servers and Ray can place one of each on a node for
    # better performance.
    if runner_cores:
        worker_cls = ray.remote(num_cpus=runner_cores,
                                resources={"_mxnet_worker": 1})(MXNetRunner)
        server_cls = ray.remote(num_cpus=runner_cores,
                                resources={"_mxnet_server": 1})(MXNetRunner)
    else:
        worker_cls = ray.remote(MXNetRunner)
        server_cls = ray.remote(MXNetRunner)

    # Start the runners: workers first, then servers.
    self.runners = [worker_cls.remote() for _ in range(self.num_workers)]
    self.runners += [server_cls.remote() for _ in range(self.num_servers)]

    # Collect node addresses for the distributed setup (logged for debugging).
    ips = ray.get([runner.get_node_ip.remote() for runner in self.runners])
    ports = ray.get(
        [runner.find_free_port.remote() for runner in self.runners])
    logger = logging.getLogger()
    logger.info(ips)
    logger.info(ports)

    # DMLC environment shared by every role; the root (scheduler) lives on
    # the driver node.
    base_env = {
        "DMLC_PS_ROOT_URI": str(get_host_ip()),
        "DMLC_PS_ROOT_PORT": str(find_free_port()),
        "DMLC_NUM_SERVER": str(self.num_servers),
        "DMLC_NUM_WORKER": str(self.num_workers),
    }
    # One env dict per runner, in the same order as self.runners
    # (workers first, then servers).
    runner_envs = []
    for role in ["worker"] * self.num_workers + ["server"] * self.num_servers:
        role_env = base_env.copy()
        role_env['DMLC_ROLE'] = role
        runner_envs.append(role_env)

    # Launch the scheduler locally; presumably importing mxnet under
    # DMLC_ROLE=scheduler starts the DMLC scheduler process — confirm.
    base_env['DMLC_ROLE'] = 'scheduler'
    scheduler_env = os.environ.copy()
    scheduler_env.update(base_env)
    # Need to contain system env to run bash
    # TODO: Need to kill this process manually?
    subprocess.Popen("python -c 'import mxnet'", shell=True,
                     env=scheduler_env)

    # Block until every runner has finished its distributed setup.
    ray.get([
        runner.setup_distributed.remote(
            runner_envs[i], self.config, self.train_data, self.model_creator,
            self.loss_creator, self.validation_metrics_creator,
            self.test_data, self.train_resize_batch_num,
            self.eval_metrics_creator)
        for i, runner in enumerate(self.runners)
    ])
def __init__(self, config, model_creator, loss_creator=None,
             eval_metrics_creator=None, validation_metrics_creator=None,
             num_workers=None, num_servers=None, runner_cores=None):
    """Spin up MXNet runner actors (workers and parameter servers) on Ray
    and trigger the distributed setup on every runner.

    :param config: dict of training configuration; must contain "optimizer",
           "optimizer_params" and "log_interval". None is treated as {}
           (which will then fail the required-key checks).
    :param model_creator: function that builds the MXNet model.
    :param loss_creator: optional function that builds the loss.
    :param eval_metrics_creator: optional function building train metrics.
    :param validation_metrics_creator: optional function building val metrics.
    :param num_workers: number of worker runners; defaults to the number of
           Ray nodes reported by RayContext.
    :param num_servers: number of server runners; defaults to num_workers.
    :param runner_cores: CPU cores reserved per runner actor, if given.
    """
    ray_ctx = RayContext.get()
    if not num_workers:
        num_workers = ray_ctx.num_ray_nodes
    self.config = {} if config is None else config
    # BUG FIX: validate the normalized self.config, not the raw argument.
    # The original asserted on `config`, so the documented/allowed value
    # config=None crashed with "config must be a dict" instead of the
    # intended missing-key message.
    assert isinstance(self.config, dict), "config must be a dict"
    for param in ["optimizer", "optimizer_params", "log_interval"]:
        assert param in self.config, param + " must be specified in config"
    self.model_creator = model_creator
    self.loss_creator = loss_creator
    self.validation_metrics_creator = validation_metrics_creator
    self.eval_metrics_creator = eval_metrics_creator
    self.num_workers = num_workers
    # One server per worker unless the caller says otherwise.
    self.num_servers = num_servers if num_servers else self.num_workers

    # Generate actor class
    # Add a dummy custom resource: _mxnet_worker and _mxnet_server to diff
    # worker from server if runner_cores is specified so that we can place
    # one worker and one server on a node for better performance.
    Worker = ray.remote(num_cpus=runner_cores, resources={"_mxnet_worker": 1})(MXNetRunner) \
        if runner_cores else ray.remote(MXNetRunner)
    Server = ray.remote(num_cpus=runner_cores, resources={"_mxnet_server": 1})(MXNetRunner) \
        if runner_cores else ray.remote(MXNetRunner)

    # Start runners: workers followed by servers
    self.workers = [Worker.remote() for i in range(self.num_workers)]
    self.servers = [Server.remote() for i in range(self.num_servers)]
    self.runners = self.workers + self.servers

    # DMLC environment shared by every role; the root (scheduler) lives on
    # the driver node.
    env = {
        "DMLC_PS_ROOT_URI": str(get_host_ip()),
        "DMLC_PS_ROOT_PORT": str(find_free_port()),
        "DMLC_NUM_SERVER": str(self.num_servers),
        "DMLC_NUM_WORKER": str(self.num_workers),
    }
    # One env dict per runner, ordered like self.runners (workers first).
    envs = []
    for i in range(self.num_workers):
        current_env = env.copy()
        current_env['DMLC_ROLE'] = 'worker'
        envs.append(current_env)
    for i in range(self.num_servers):
        current_env = env.copy()
        current_env['DMLC_ROLE'] = 'server'
        envs.append(current_env)

    # Launch the scheduler locally; presumably importing mxnet under
    # DMLC_ROLE=scheduler starts the DMLC scheduler process — confirm.
    env['DMLC_ROLE'] = 'scheduler'
    modified_env = os.environ.copy()
    modified_env.update(env)
    # Need to contain system env to run bash
    # TODO: Need to kill this process manually?
    subprocess.Popen("python -c 'import mxnet'", shell=True, env=modified_env)

    # Block until every runner has finished its distributed setup.
    ray.get([
        runner.setup_distributed.remote(envs[i], self.config,
                                        self.model_creator,
                                        self.loss_creator,
                                        self.validation_metrics_creator,
                                        self.eval_metrics_creator)
        for i, runner in enumerate(self.runners)
    ])
def __init__(self, config, train_data, model_creator, loss_creator=None,
             train_resize_batch_num=None, eval_metrics_creator=None,
             test_data=None, validation_metrics_creator=None,
             num_workers=1, num_servers=None, runner_cores=None):
    """Spin up MXNet runner actors (workers and parameter servers) on Ray,
    repartition XShards data to match the worker count, and trigger the
    distributed setup on every runner.

    :param config: dict of training configuration, forwarded to each runner.
    :param train_data: training data; SparkXShards is converted to
           RayXShards, and RayXShards is repartitioned to num_workers.
    :param model_creator: function that builds the MXNet model.
    :param loss_creator: optional function that builds the loss.
    :param train_resize_batch_num: optional resize batch count for training.
    :param eval_metrics_creator: optional function building train metrics.
    :param test_data: optional validation data; must mirror train_data's type.
    :param validation_metrics_creator: optional function building val metrics.
    :param num_workers: number of worker runners (default 1).
    :param num_servers: number of server runners; defaults to num_workers.
    :param runner_cores: CPU cores reserved per runner actor, if given.
    """
    self.config = config
    self.model_creator = model_creator
    self.loss_creator = loss_creator
    self.validation_metrics_creator = validation_metrics_creator
    self.eval_metrics_creator = eval_metrics_creator
    self.num_workers = num_workers
    # One server per worker unless the caller says otherwise.
    self.num_servers = num_servers if num_servers else self.num_workers
    self.train_resize_batch_num = train_resize_batch_num

    from zoo.orca.data import RayXShards, SparkXShards
    # SparkXShards input: repartition to one partition per worker and move
    # the shards into Ray; test data must be the same kind of shards.
    if isinstance(train_data, SparkXShards):
        train_data = train_data.repartition(self.num_workers).to_ray()
        if test_data:
            assert isinstance(test_data, SparkXShards)
            test_data = test_data.repartition(self.num_workers).to_ray()
    # RayXShards input (possibly just converted): make the partition count
    # line up with the worker count.
    if isinstance(train_data, RayXShards):
        if train_data.num_partitions() != self.num_workers:
            train_data.repartition(self.num_workers)
        if test_data:
            assert isinstance(test_data, RayXShards)
            if test_data.num_partitions() != self.num_workers:
                test_data.repartition(self.num_workers)
    self.train_data = train_data
    self.test_data = test_data

    # Generate the actor classes. When runner_cores is specified, attach the
    # dummy custom resources _mxnet_worker / _mxnet_server so workers can be
    # told apart from servers and Ray can place one of each on a node for
    # better performance.
    if runner_cores:
        worker_cls = ray.remote(num_cpus=runner_cores,
                                resources={"_mxnet_worker": 1})(MXNetRunner)
        server_cls = ray.remote(num_cpus=runner_cores,
                                resources={"_mxnet_server": 1})(MXNetRunner)
    else:
        worker_cls = ray.remote(MXNetRunner)
        server_cls = ray.remote(MXNetRunner)

    # Start the runners: workers first, then servers.
    self.workers = [worker_cls.remote() for _ in range(self.num_workers)]
    self.servers = [server_cls.remote() for _ in range(self.num_servers)]
    # Co-locate workers with the Ray shards holding their data.
    if isinstance(self.train_data, RayXShards):
        self.workers = self.train_data.colocate_actors(self.workers)
    self.runners = self.workers + self.servers

    # DMLC environment shared by every role; the root (scheduler) lives on
    # the driver node.
    base_env = {
        "DMLC_PS_ROOT_URI": str(get_host_ip()),
        "DMLC_PS_ROOT_PORT": str(find_free_port()),
        "DMLC_NUM_SERVER": str(self.num_servers),
        "DMLC_NUM_WORKER": str(self.num_workers),
    }
    # One env dict per runner, in the same order as self.runners
    # (workers first, then servers).
    runner_envs = []
    for role in ["worker"] * self.num_workers + ["server"] * self.num_servers:
        role_env = base_env.copy()
        role_env['DMLC_ROLE'] = role
        runner_envs.append(role_env)

    # Launch the scheduler locally; presumably importing mxnet under
    # DMLC_ROLE=scheduler starts the DMLC scheduler process — confirm.
    base_env['DMLC_ROLE'] = 'scheduler'
    scheduler_env = os.environ.copy()
    scheduler_env.update(base_env)
    # Need to contain system env to run bash
    # TODO: Need to kill this process manually?
    subprocess.Popen("python -c 'import mxnet'", shell=True,
                     env=scheduler_env)

    # Block until every runner has finished its distributed setup.
    ray.get([
        runner.setup_distributed.remote(
            runner_envs[i], self.config, self.train_data, self.model_creator,
            self.loss_creator, self.validation_metrics_creator,
            self.test_data, self.train_resize_batch_num,
            self.eval_metrics_creator)
        for i, runner in enumerate(self.runners)
    ])