def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        yes: If True, skip the interactive confirmation prompt.
        workers_only: If True, leave the head node running and only
            terminate workers.
        override_cluster_name: Optional name that replaces the config's
            ``cluster_name``.
    """
    # safe_load: yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is deprecated; also close the file
    # deterministically with a context manager.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)
    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    if not workers_only:
        for node in provider.nodes({TAG_RAY_NODE_TYPE: "head"}):
            logger.info("Terminating head node {}".format(node))
            provider.terminate_node(node)

    # Termination is asynchronous on most providers, so poll until the
    # provider reports no remaining workers.
    nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
    while nodes:
        for node in nodes:
            logger.info("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({TAG_RAY_NODE_TYPE: "worker"})
def _test_invalid_config(self, config_path):
    # The given config is expected to violate the autoscaler schema,
    # so validate_config must raise jsonschema.ValidationError.
    full_path = os.path.join(RAY_PATH, config_path)
    with open(full_path) as config_file:
        loaded = yaml.safe_load(config_file)
    try:
        validate_config(loaded)
    except jsonschema.ValidationError:
        return  # expected failure: the test passes
    self.fail("Expected validation to fail for {}".format(config_path))
def testValidateDefaultConfig(self):
    # Every bundled example config should pass schema validation once
    # defaults have been filled in.
    for path in CONFIG_PATHS:
        with open(path) as config_file:
            loaded = yaml.safe_load(config_file)
        filled = fillout_defaults(loaded)
        try:
            validate_config(filled)
        except Exception:
            self.fail("Config did not pass validation test!")
def testValidateDefaultConfig(self):
    # A minimal config containing only an AWS provider section should
    # validate after defaults are filled in.
    minimal = {
        "provider": {
            "type": "aws",
            "region": "us-east-1",
            "availability_zone": "us-east-1a",
        },
    }
    filled = fillout_defaults(minimal)
    try:
        validate_config(filled)
    except Exception:
        self.fail("Default config did not pass validation test!")
def testValidateDefaultConfig(self):
    # Start from an empty config, add only the provider section, and
    # check that the defaulted result passes schema validation.
    provider_section = {
        "type": "aws",
        "region": "us-east-1",
        "availability_zone": "us-east-1a",
    }
    config = fillout_defaults({"provider": provider_section})
    try:
        validate_config(config)
    except Exception:
        self.fail("Default config did not pass validation test!")
def testValidateNetworkConfig(self):
    # Download the canonical example config from GitHub and verify it
    # still validates against the local schema.
    web_yaml = ("https://raw.githubusercontent.com/ray-project/ray/"
                "master/python/ray/autoscaler/aws/example-full.yaml")
    response = urllib.request.urlopen(web_yaml, timeout=5)
    content = response.read()
    with tempfile.TemporaryFile() as f:
        # Round-trip through a temp file, then rewind so yaml can read it.
        f.write(content)
        f.seek(0)
        config = yaml.safe_load(f)
    config = fillout_defaults(config)
    try:
        validate_config(config)
    except Exception:
        self.fail("Config did not pass validation test!")
def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
                     keep_min_workers):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        yes: If True, skip the interactive confirmation prompt.
        workers_only: If True, leave the head node running.
        override_cluster_name: Optional name that replaces the config's
            ``cluster_name``.
        keep_min_workers: If True, keep ``config["min_workers"]`` workers
            alive instead of terminating all of them.
    """
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = fillout_defaults(config)
    validate_config(config)

    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            workers = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_WORKER
            })

            if keep_min_workers:
                min_workers = config.get("min_workers", 0)
                logger.info("teardown_cluster: "
                            "Keeping {} nodes...".format(min_workers))
                # Guard against min_workers exceeding the number of live
                # workers: random.sample raises ValueError for a negative
                # sample size. In that case there is nothing to terminate.
                num_to_terminate = max(0, len(workers) - min_workers)
                workers = random.sample(workers, num_to_terminate)

            if workers_only:
                return workers

            head = provider.non_terminated_nodes({
                TAG_RAY_NODE_TYPE: NODE_TYPE_HEAD
            })

            return head + workers

        # Loop here to check that both the head and worker nodes are
        # actually really gone (termination is asynchronous).
        A = remaining_nodes()
        with LogTimer("teardown_cluster: done."):
            while A:
                logger.info("teardown_cluster: "
                            "Shutting down {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        yes: If True, skip the interactive confirmation prompt.
        workers_only: If True, leave the head node running.
        override_cluster_name: Optional name that replaces the config's
            ``cluster_name``.
    """
    # safe_load: yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is deprecated; the context manager
    # also closes the file handle deterministically.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)
    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            # When workers_only, the head node is intentionally left alive.
            if workers_only:
                A = []
            else:
                A = list(
                    provider.non_terminated_nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    }))

            A += list(
                provider.non_terminated_nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                }))
            return A

        # Loop here to check that both the head and worker nodes are
        # actually really gone (termination is asynchronous).
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def teardown_cluster(config_file, yes, workers_only, override_cluster_name):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        yes: If True, skip the interactive confirmation prompt.
        workers_only: If True, leave the head node running.
        override_cluster_name: Optional name that replaces the config's
            ``cluster_name``.
    """
    # yaml.load with no Loader is unsafe on untrusted input and
    # deprecated; safe_load restricts construction to plain YAML types.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    validate_config(config)
    config = fillout_defaults(config)
    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])

    try:

        def remaining_nodes():
            # Skip the head node query entirely when only workers are
            # being torn down.
            if workers_only:
                A = []
            else:
                A = list(
                    provider.non_terminated_nodes({
                        TAG_RAY_NODE_TYPE: "head"
                    }))

            A += list(
                provider.non_terminated_nodes({
                    TAG_RAY_NODE_TYPE: "worker"
                }))
            return A

        # Loop here to check that both the head and worker nodes are
        # actually really gone (termination is asynchronous).
        A = remaining_nodes()
        with LogTimer("teardown_cluster: Termination done."):
            while A:
                logger.info("teardown_cluster: "
                            "Terminating {} nodes...".format(len(A)))
                provider.terminate_nodes(A)
                time.sleep(1)
                A = remaining_nodes()
    finally:
        provider.cleanup()
def create_or_update_cluster(config_file, override_min_workers,
                             override_max_workers, no_restart):
    """Create or updates an autoscaling Ray cluster from a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        override_min_workers: Optional value replacing the config's
            ``min_workers``.
        override_max_workers: Optional value replacing the config's
            ``max_workers``.
        no_restart: Whether to skip restarting services on the head node.
    """
    # safe_load avoids constructing arbitrary Python objects from YAML;
    # the context manager closes the file handle deterministically.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)
    if override_min_workers is not None:
        config["min_workers"] = override_min_workers
    if override_max_workers is not None:
        config["max_workers"] = override_max_workers

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    bootstrap_config, _ = importer()
    config = bootstrap_config(config)
    get_or_create_head_node(config, no_restart)
def teardown_cluster(config_file):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
    """
    # safe_load: yaml.load with no Loader is unsafe and deprecated.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)

    # Termination is asynchronous, so poll until the provider reports
    # no remaining nodes.
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
def _bootstrap_config(config):
    """Fill out, validate, and provider-bootstrap a cluster config.

    The resolved config is cached on disk keyed by a SHA-1 of the
    filled-out config, so repeated calls with the same input skip the
    provider bootstrap step.

    Args:
        config: The raw cluster config dict.

    Returns:
        The fully resolved config dict.

    Raises:
        NotImplementedError: If the provider type has no registered
            importer in NODE_PROVIDERS.
    """
    config = fillout_defaults(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))
    if os.path.exists(cache_key):
        # Cache hit: return the previously resolved config verbatim.
        # Use a context manager so the handle is closed deterministically
        # (the original leaked the file object).
        with open(cache_key) as f:
            return json.loads(f.read())
    validate_config(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    bootstrap_config, _ = importer()
    resolved_config = bootstrap_config(config)
    with open(cache_key, "w") as f:
        f.write(json.dumps(resolved_config))
    return resolved_config
def _bootstrap_config(config):
    """Fill out, validate, and provider-bootstrap a cluster config.

    Resolved configs are cached in the temp directory under a key derived
    from a SHA-1 of the defaulted config, so identical inputs reuse the
    earlier bootstrap result.

    Args:
        config: The raw cluster config dict.

    Returns:
        The fully resolved config dict.

    Raises:
        NotImplementedError: If the provider type has no registered
            importer in NODE_PROVIDERS.
    """
    config = fillout_defaults(config)

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(tempfile.gettempdir(),
                             "ray-config-{}".format(hasher.hexdigest()))
    if os.path.exists(cache_key):
        # Cache hit: return the cached resolved config. The context
        # manager fixes the leaked file handle in the original code.
        with open(cache_key) as f:
            return json.loads(f.read())
    validate_config(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    bootstrap_config, _ = importer()
    resolved_config = bootstrap_config(config)
    with open(cache_key, "w") as f:
        f.write(json.dumps(resolved_config))
    return resolved_config
def create_or_update_cluster(
        config_file, override_min_workers, override_max_workers, no_restart,
        yes):
    """Create or updates an autoscaling Ray cluster from a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        override_min_workers: Optional value replacing the config's
            ``min_workers``.
        override_max_workers: Optional value replacing the config's
            ``max_workers``.
        no_restart: Whether to skip restarting services on the head node.
        yes: If True, skip interactive confirmation prompts.
    """
    # safe_load: yaml.load without an explicit Loader is unsafe and
    # deprecated; also close the file deterministically.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)
    config = fillout_defaults(config)

    if override_min_workers is not None:
        config["min_workers"] = override_min_workers
    if override_max_workers is not None:
        config["max_workers"] = override_max_workers

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError(
            "Unsupported provider {}".format(config["provider"]))

    bootstrap_config, _ = importer()
    config = bootstrap_config(config)
    get_or_create_head_node(config, no_restart, yes)
def teardown_cluster(config_file, yes):
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config file.
        yes: If True, skip the interactive confirmation prompt.
    """
    # safe_load: yaml.load with no Loader is unsafe and deprecated.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)
    config = fillout_defaults(config)
    confirm("This will destroy your cluster", yes)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    for node in provider.nodes(head_node_tags):
        print("Terminating head node {}".format(node))
        provider.terminate_node(node)

    # Termination is asynchronous, so poll until the provider reports
    # no remaining nodes.
    nodes = provider.nodes({})
    while nodes:
        for node in nodes:
            print("Terminating worker {}".format(node))
            provider.terminate_node(node)
        time.sleep(5)
        nodes = provider.nodes({})
def file_sync(config_file):
    """Syncs the configured file mounts to the cluster's head node.

    Exits the process with status 1 if no head node is found for the
    cluster named in the config. (The original docstring claimed this
    returns the head node IP, which does not match the code: it performs
    a file sync via NodeUpdaterProcess and returns nothing.)

    Args:
        config_file: Path to the cluster YAML config file.
    """
    # safe_load: yaml.load with no Loader is unsafe and deprecated.
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)
    config = fillout_defaults(config)

    importer = NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(
            config["provider"]))

    bootstrap_config, provider_cls = importer()
    config = bootstrap_config(config)
    provider = provider_cls(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
    }
    nodes = provider.nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        print("Head node of cluster ({}) not found!".format(
            config["cluster_name"]))
        sys.exit(1)

    runtime_hash = hash_runtime_conf(config["file_mounts"], config)
    updater = NodeUpdaterProcess(
        head_node,
        config["provider"],
        config["auth"],
        config["cluster_name"],
        config["file_mounts"], [],
        runtime_hash,
        redirect_output=False)
    updater.sync_files(config["file_mounts"])
def testValidation(self):
    """Ensures that schema validation is working."""
    config = copy.deepcopy(SMALL_CLUSTER)
    try:
        validate_config(config)
    except Exception:
        self.fail("Test config did not pass validation test!")

    def expect_invalid(cfg):
        # Every mutated config below must be rejected with ValueError.
        with pytest.raises(ValueError):
            validate_config(cfg)

    # Unknown top-level key.
    config["blah"] = "blah"
    expect_invalid(config)
    del config["blah"]

    # Unknown key inside the provider section.
    config["provider"]["blah"] = "blah"
    expect_invalid(config)
    del config["provider"]["blah"]

    # Missing provider section entirely.
    del config["provider"]
    expect_invalid(config)
def testValidation(self):
    """Ensures that schema validation is working."""
    config = copy.deepcopy(SMALL_CLUSTER)
    try:
        validate_config(config)
    except Exception:
        self.fail("Test config did not pass validation test!")

    def expect_invalid(cfg):
        # Each mutated config below must be rejected with ValueError.
        with self.assertRaises(ValueError):
            validate_config(cfg)

    # Unknown top-level key.
    config["blah"] = "blah"
    expect_invalid(config)
    del config["blah"]

    # Unknown key inside the provider section.
    config["provider"]["blah"] = "blah"
    expect_invalid(config)
    del config["provider"]["blah"]

    # Missing provider section entirely.
    del config["provider"]
    expect_invalid(config)
def load_config(yaml_file_name):
    """Load a cluster config from YAML, validate it, and fill defaults.

    Args:
        yaml_file_name: Path to the YAML config file.

    Returns:
        The validated config dict with defaults filled in.
    """
    # safe_load: yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is deprecated; the context manager
    # also fixes the leaked file handle.
    with open(yaml_file_name) as f:
        config = yaml.safe_load(f.read())
    validate_config(config)
    return fillout_defaults(config)