Beispiel #1
0
    def create_node(self, node_config, tags, count):
        """Bring up ``count`` nodes, restarting cached stopped VMs first.

        When ``self.cache_stopped_nodes`` is truthy, up to ``count`` nodes
        returned by ``self.stopped_nodes`` (filtered on the cluster name,
        node kind, launch config and user node type tags present in ``tags``)
        are started again and re-tagged with ``tags``. Any remainder is
        created through ``self._create_node``.

        Args:
            node_config: Node configuration forwarded to ``_create_node``.
            tags: Tag dict applied to reused nodes and forwarded to
                ``_create_node``.
            count: Total number of nodes requested.
        """
        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            VALIDITY_TAGS = [
                TAG_RAY_CLUSTER_NAME,
                TAG_RAY_NODE_KIND,
                TAG_RAY_LAUNCH_CONFIG,
                TAG_RAY_USER_NODE_TYPE,
            ]
            # Only filter on the validity tags the caller actually supplied.
            filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
            reuse_nodes = self.stopped_nodes(filters)[:count]
            # Skip the announcement and the SDK lookup when nothing matched,
            # so the log never reports "Reusing nodes []".
            if reuse_nodes:
                logger.info(
                    f"Reusing nodes {list(reuse_nodes)}. "
                    "To disable reuse, set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration.", )
                start = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="start")
                for node_id in reuse_nodes:
                    # Wait for each start to finish before re-tagging the VM.
                    start(resource_group_name=resource_group,
                          vm_name=node_id).wait()
                    self.set_node_tags(node_id, tags)
                count -= len(reuse_nodes)

        # Create whatever could not be satisfied from the stopped-node cache.
        if count:
            self._create_node(node_config, tags, count)
Beispiel #2
0
 def set_node_tags(self, node_id, tags):
     """Sets the tag values (string dict) for the specified node.

     The new tags are merged over the node's cached tags, the merged set is
     sent through the virtual machines ``update`` SDK function, and the
     cache entry is replaced with the merged set afterwards.

     Args:
         node_id: Name of the VM whose tags are updated.
         tags: Tag values to add or overwrite.
     """
     # Merge into a copy so the cached entry stays untouched if the update
     # call below raises; the cache is only replaced once the call returned.
     node_tags = dict(self._get_cached_node(node_id)["tags"])
     node_tags.update(tags)
     update = get_azure_sdk_function(
         client=self.compute_client.virtual_machines,
         function_name="update")
     update(resource_group_name=self.provider_config["resource_group"],
            vm_name=node_id,
            parameters={"tags": node_tags})
     self.cached_nodes[node_id]["tags"] = node_tags
Beispiel #3
0
    def _create_node(self, node_config, tags, count):
        """Creates a number of nodes within the namespace.

        Loads ``azure-vm-template.json`` from the directory of this module,
        fills its parameters from ``node_config`` and the provider config,
        submits one incremental deployment through
        ``self.resource_client.deployments`` and waits on the result.

        Args:
            node_config: Node configuration. Must contain
                ``"azure_arm_parameters"``; may contain ``"tags"``.
            tags: Tags layered on top of the ones from ``node_config``.
            count: Passed to the template as ``vmCount``.
        """
        resource_group = self.provider_config["resource_group"]

        # load the template file
        current_path = Path(__file__).parent
        template_path = current_path.joinpath("azure-vm-template.json")
        with open(template_path, "r", encoding="utf-8") as template_fp:
            template = json.load(template_fp)

        # get the tags; copy so the caller's node_config is not mutated
        config_tags = node_config.get("tags", {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
        unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
        vm_name = "{name}-{id}".format(name=name_tag, id=unique_id)
        use_internal_ips = self.provider_config.get("use_internal_ips", False)

        template_params = node_config["azure_arm_parameters"].copy()
        template_params["vmName"] = vm_name
        template_params["provisionPublicIp"] = not use_internal_ips
        template_params["vmTags"] = config_tags
        template_params["vmCount"] = count

        # Template parameters are passed as {"name": {"value": ...}} entries.
        parameters = {
            "properties": {
                "mode": DeploymentMode.incremental,
                "template": template,
                "parameters": {
                    key: {
                        "value": value
                    }
                    for key, value in template_params.items()
                },
            }
        }

        # TODO: we could get the private/public ips back directly
        create_or_update = get_azure_sdk_function(
            client=self.resource_client.deployments,
            function_name="create_or_update")
        # Name the deployment after the unique VM name: a name built only
        # from the node name tag is shared by every deployment for that
        # node name, so repeated or concurrent calls would collide on it.
        create_or_update(
            resource_group_name=resource_group,
            deployment_name=vm_name,
            parameters=parameters,
        ).wait()
Beispiel #4
0
    def terminate_node(self, node_id):
        """Delete the VM for ``node_id`` plus its NIC, public IP and disks.

        Returns right away when ``self._get_node`` raises ``KeyError`` for
        the node. Each delete is best-effort: a failure is logged as a
        warning and the remaining resources are still attempted.
        """
        rg_name = self.provider_config["resource_group"]
        try:
            node_meta = self._get_node(node_id)
        except KeyError:
            # Unknown node id: nothing left to clean up.
            return

        # TODO: deallocate instead of delete to allow possible reuse.

        # Collect the disk names from the VM before the VM is deleted below.
        storage = self.compute_client.virtual_machines.get(
            resource_group_name=rg_name, vm_name=node_id).storage_profile
        disk_names = {data_disk.name for data_disk in storage.data_disks}
        disk_names.add(storage.os_disk.name)

        # The VM goes first, and its deletion is awaited.
        try:
            delete_vm = get_azure_sdk_function(
                client=self.compute_client.virtual_machines,
                function_name="delete")
            vm_poller = delete_vm(resource_group_name=rg_name,
                                  vm_name=node_id)
            vm_poller.wait()
        except Exception as err:
            logger.warning(f"Failed to delete VM: {err}")

        # Network interface (not awaited).
        try:
            delete_nic = get_azure_sdk_function(
                client=self.network_client.network_interfaces,
                function_name="delete")
            delete_nic(resource_group_name=rg_name,
                       network_interface_name=node_meta["nic_name"])
        except Exception as err:
            logger.warning(f"Failed to delete nic: {err}")

        # Public IP, only when the node metadata lists one.
        if "public_ip_name" in node_meta:
            try:
                delete_ip = get_azure_sdk_function(
                    client=self.network_client.public_ip_addresses,
                    function_name="delete")
                delete_ip(resource_group_name=rg_name,
                          public_ip_address_name=node_meta["public_ip_name"])
            except Exception as err:
                logger.warning(f"Failed to delete public ip: {err}")

        # Finally the OS and data disks recorded above.
        for disk_name in disk_names:
            try:
                delete_disk = get_azure_sdk_function(
                    client=self.compute_client.disks, function_name="delete")
                delete_disk(resource_group_name=rg_name, disk_name=disk_name)
            except Exception as err:
                logger.warning(f"Failed to delete disk: {err}")
Beispiel #5
0
    def terminate_node(self, node_id):
        """Terminates the specified node.

        With ``self.cache_stopped_nodes`` set, the VM is only deallocated and
        all of its resources are left in place. Otherwise this will delete
        the VM and associated resources (NIC, IP, Storage) for the specified
        node. Failures of the individual stop/delete calls are logged as
        warnings rather than raised.

        Args:
            node_id: Name of the VM to stop or delete. If ``self._get_node``
                raises ``KeyError`` for it, the method returns without
                doing anything.
        """

        resource_group = self.provider_config["resource_group"]
        try:
            # get metadata for node
            metadata = self._get_node(node_id)
        except KeyError:
            # node no longer exists
            return

        if self.cache_stopped_nodes:
            try:
                # stop machine and leave all resources
                # (trailing space keeps the node id apart from the hint)
                logger.info(f"Stopping instance {node_id} "
                            "(to fully terminate instead, "
                            "set `cache_stopped_nodes: False` "
                            "under `provider` in the cluster configuration)")
                stop = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="deallocate",
                )
                # the result is not waited on
                stop(resource_group_name=resource_group, vm_name=node_id)
            except Exception as e:
                logger.warning("Failed to stop VM: {}".format(e))
        else:
            # gather disk names before the VM is deleted below
            vm = self.compute_client.virtual_machines.get(
                resource_group_name=resource_group, vm_name=node_id)
            disks = {d.name for d in vm.storage_profile.data_disks}
            disks.add(vm.storage_profile.os_disk.name)

            try:
                # delete machine, must wait for this to complete
                delete = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="delete")
                delete(resource_group_name=resource_group,
                       vm_name=node_id).wait()
            except Exception as e:
                logger.warning("Failed to delete VM: {}".format(e))

            try:
                # delete nic
                delete = get_azure_sdk_function(
                    client=self.network_client.network_interfaces,
                    function_name="delete",
                )
                delete(
                    resource_group_name=resource_group,
                    network_interface_name=metadata["nic_name"],
                )
            except Exception as e:
                logger.warning("Failed to delete nic: {}".format(e))

            # delete ip address (only present in metadata for some nodes)
            if "public_ip_name" in metadata:
                try:
                    delete = get_azure_sdk_function(
                        client=self.network_client.public_ip_addresses,
                        function_name="delete",
                    )
                    delete(
                        resource_group_name=resource_group,
                        public_ip_address_name=metadata["public_ip_name"],
                    )
                except Exception as e:
                    logger.warning("Failed to delete public ip: {}".format(e))

            # delete disks; one failure does not stop the others
            for disk in disks:
                try:
                    delete = get_azure_sdk_function(
                        client=self.compute_client.disks,
                        function_name="delete")
                    delete(resource_group_name=resource_group, disk_name=disk)
                except Exception as e:
                    logger.warning("Failed to delete disk: {}".format(e))