Exemple #1
0
    def test_no_file_mounts_k8s_operator_cluster_launch(self):
        """Verify that an operator-driven cluster launch mounts no files.

        Two cluster configs are derived from the custom resources returned by
        ``custom_resources()``. The first is launched unmodified and its head
        must start without error. The second gets a file mount injected into
        its config and is expected to fail with ``ValueError`` from the mocked
        node updater, which demonstrates that the test would notice an
        unwanted file mount.
        """
        # Replace the node-updater thread entry points and RayCluster's
        # logging/config-writing hooks with the test's mocks.
        with patch.object(NodeUpdaterThread, START, mock_start),\
                patch.object(NodeUpdaterThread, JOIN, mock_join),\
                patch.object(RayCluster, SETUP_LOGGING, mock_setup_logging),\
                patch.object(RayCluster, WRITE_CONFIG, mock_write_config):
            # Replace the Kubernetes node provider's methods with mocks too.
            with patch.object(KubernetesNodeProvider, INIT, mock_init),\
                    patch.object(KubernetesNodeProvider, NON_TERMINATED_NODES,
                                 mock_non_terminated_nodes),\
                    patch.object(KubernetesNodeProvider, CREATE_NODE,
                                 mock_create_node),\
                    patch.object(KubernetesNodeProvider, BOOTSTRAP_CONFIG,
                                 mock_bootstrap_config):

                plain_cr, mounted_cr = custom_resources()

                # Positive case: a config produced by the operator must start
                # its head node without mounting any files.
                plain_config = cr_to_config(plain_cr)
                plain_config["provider"]["namespace"] = "test"
                RayCluster(plain_config).start_head()

                # Sanity check of the test itself: add an extraneous file
                # mount and expect the mocked NodeUpdater to raise ValueError.
                mounted_config = cr_to_config(mounted_cr)
                mounted_config["provider"]["namespace"] = "test"
                # Note: a Ray cluster run via the operator offers no user
                # interface for adding file mounts; this entry exists purely
                # to exercise the failure path of this test.
                this_file = os.path.abspath(__file__)
                mounted_config["file_mounts"] = {"remote_foo": this_file}
                mounted_cluster = RayCluster(mounted_config)
                with pytest.raises(ValueError):
                    mounted_cluster.start_head()
Exemple #2
0
def main() -> None:
    """Run the operator's control loop over RayCluster custom-resource events.

    Ensures the directory for Ray cluster configs exists, then iterates over
    the events produced by ``operator_utils.cluster_cr_stream()``. For each
    event the custom resource is converted to a cluster config, which is
    passed to ``cluster_action`` together with the event type.

    Raises:
        Exception: If an ``ApiException`` with status 404 is caught; the
            original exception is attached as the cause.
        ApiException: Re-raised unchanged for any other status.
    """
    # Make directory for ray cluster configs. ``exist_ok=True`` avoids the
    # race between checking for the directory and creating it, and
    # ``makedirs`` also creates any missing parent directories.
    os.makedirs(operator_utils.RAY_CONFIG_DIR, exist_ok=True)

    # Control loop
    cluster_cr_stream = operator_utils.cluster_cr_stream()
    try:
        for event in cluster_cr_stream:
            cluster_cr = event["object"]
            event_type = event["type"]
            cluster_config = operator_utils.cr_to_config(cluster_cr)
            cluster_action(cluster_config, event_type)
    except ApiException as e:
        if e.status == 404:
            # Chain explicitly so the underlying API error stays visible.
            raise Exception(
                "Caught a 404 error. Has the RayCluster CRD been created?"
            ) from e
        raise
Exemple #3
0
def _create_or_update_cluster(cluster_cr_body,
                              name,
                              namespace,
                              memo,
                              restart_ray=False):
    """Bring the Ray cluster described by a RayCluster resource up to date.

    The custom resource is converted to an autoscaling config, the config is
    checked for a custom Redis password, and the ``RayCluster`` object kept
    in ``memo`` (created on first use) is given the config and asked to
    create or update the cluster. The status queue receives
    ``STATUS_UPDATING`` before the work starts and ``STATUS_RUNNING`` once
    it has finished.

    Args:
        cluster_cr_body: The body of the K8s RayCluster resource describing
            a Ray cluster.
        name: The name of the Ray cluster.
        namespace: The K8s namespace in which the Ray cluster runs.
        memo: kopf memo state for this Ray cluster.
        restart_ray: Only restart cluster Ray processes if this is true.
    """

    def _report_phase(phase):
        # Queue a status.phase update for this cluster.
        cluster_status_q.put((name, namespace, phase))

    # Translate the custom resource into a Ray autoscaling config.
    autoscaling_config = operator_utils.cr_to_config(cluster_cr_body)

    # A custom Redis password in the Ray start commands is not supported by
    # the K8s operator, so verify that the user did not set one.
    operator_utils.check_redis_password_not_specified(
        autoscaling_config, name, namespace)

    # Reuse the RayCluster object stored in the memo, creating and storing
    # one if this is the first time the cluster is seen.
    cluster = memo.get("ray_cluster")
    if cluster is None:
        cluster = memo.ray_cluster = RayCluster(autoscaling_config)

    # A "create-or-update" is now in progress.
    _report_phase(STATUS_UPDATING)

    # Hand the autoscaling config to the cluster for use by the autoscaler.
    cluster.set_config(autoscaling_config)

    # Launch the Ray cluster by SSHing into the pod and running the
    # initialization commands. The cluster is not restarted unless there
    # was a failure.
    cluster.create_or_update(restart_ray=restart_ray)

    # The head is up and the monitor is running.
    _report_phase(STATUS_RUNNING)
Exemple #4
0
def cluster_action(event_type, cluster_cr, cluster_name) -> None:
    """Apply one RayCluster custom-resource event to the tracked clusters.

    Args:
        event_type: "ADDED", "MODIFIED" or "DELETED"; any other value is
            ignored.
        cluster_cr: The RayCluster custom resource carried by the event.
        cluster_name: Overwritten by the ``cluster_name`` entry of the config
            derived from ``cluster_cr`` before it is used.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    # The name stored in the config replaces the caller-supplied one.
    cluster_name = config["cluster_name"]

    if event_type == "ADDED":
        operator_utils.set_status(cluster_cr, cluster_name, "Running")
        added = RayCluster(config)
        ray_clusters[cluster_name] = added
        added.create_or_update()
        last_generation[cluster_name] = cluster_cr["metadata"]["generation"]
        return

    if event_type == "MODIFIED":
        # metadata.generation only grows when the spec has changed, so a
        # larger value than the one recorded means an update is needed.
        generation = cluster_cr["metadata"]["generation"]
        if generation > last_generation[cluster_name]:
            modified = ray_clusters[cluster_name]
            modified.set_config(config)
            modified.create_or_update()
            last_generation[cluster_name] = generation
        return

    if event_type == "DELETED":
        # Clean up first, then forget everything recorded for the cluster.
        removed = ray_clusters[cluster_name]
        removed.clean_up()
        del ray_clusters[cluster_name]
        del last_generation[cluster_name]
Exemple #5
0
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """Apply one RayCluster custom-resource event to the tracked clusters.

    Clusters are tracked in ``ray_clusters`` under the key
    ``(cluster_name, cluster_namespace)``.

    Args:
        event_type: "ADDED", "MODIFIED" or "DELETED"; any other value is
            ignored.
        cluster_cr: The RayCluster custom resource carried by the event.
        cluster_name: Overwritten by the ``cluster_name`` entry of the config
            derived from ``cluster_cr`` before it is used.
        cluster_namespace: Namespace component of the cluster's key.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    # The name stored in the config replaces the caller-supplied one.
    cluster_name = config["cluster_name"]
    key = (cluster_name, cluster_namespace)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)

        cluster_status_q.put((cluster_name, cluster_namespace, "Running"))

        created = RayCluster(config)

        # Remember the generation so later spec changes can be detected.
        initial_generation = cluster_cr["metadata"]["generation"]
        created.set_generation(initial_generation)

        created.create_or_update()

        ray_clusters[key] = created
        return

    if event_type == "MODIFIED":
        existing = ray_clusters[key]
        # metadata.generation exceeds the recorded value only when the spec
        # has changed; other modifications require no action.
        new_generation = cluster_cr["metadata"]["generation"]
        if new_generation > existing.get_generation():
            existing.set_generation(new_generation)
            existing.set_config(config)
            existing.create_or_update()
        return

    if event_type == "DELETED":
        # Clean up first, then stop tracking the cluster.
        doomed = ray_clusters[key]
        doomed.clean_up()
        del ray_clusters[key]
Exemple #6
0
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """Apply one RayCluster custom-resource event to the tracked clusters.

    Clusters are tracked in ``ray_clusters`` under the key
    ``(cluster_name, cluster_namespace)``. Status transitions are published
    on ``cluster_status_q``.

    Args:
        event_type: "ADDED", "MODIFIED" or "DELETED"; any other value is
            ignored.
        cluster_cr: The RayCluster custom resource carried by the event.
        cluster_name: Name component of the cluster's key.
        cluster_namespace: Namespace component of the cluster's key.
    """

    def _publish(phase) -> None:
        # Queue a status update for this cluster.
        cluster_status_q.put((cluster_name, cluster_namespace, phase))

    config = operator_utils.cr_to_config(cluster_cr)
    key = (cluster_name, cluster_namespace)
    log_prefix = ",".join(key)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)

        _publish(STATUS_UPDATING)

        created = RayCluster(config)

        # Remember the generation so later spec changes can be detected.
        initial_generation = cluster_cr["metadata"]["generation"]
        created.set_generation(initial_generation)

        logger.info(f"{log_prefix}: Launching cluster.")
        created.create_or_update()

        ray_clusters[key] = created

        _publish(STATUS_RUNNING)
        return

    if event_type == "MODIFIED":
        existing = ray_clusters[key]
        # metadata.generation tells us whether the spec has changed.
        new_generation = cluster_cr["metadata"]["generation"]
        # The retry counter in the resource's status tells us whether Ray
        # processes need to be restarted.
        retries = cluster_cr.get("status", {}).get(
            AUTOSCALER_RETRIES_FIELD, 0)

        # A change to the spec of the custom resource shows up as a
        # generation larger than the one recorded on the cluster.
        spec_changed = new_generation > existing.get_generation()
        # A retry count larger than the one recorded on the cluster is
        # treated as a failure that requires restarting Ray.
        needs_restart = retries > existing.get_num_retries()

        if needs_restart:
            logger.error(f"{log_prefix}: Failed, restarting cluster.")
            existing.set_num_retries(retries)
        if spec_changed:
            logger.info(f"{log_prefix}: Updating cluster.")
            existing.set_generation(new_generation)

        # Nothing to do unless the spec changed or a recovery is needed.
        if not (spec_changed or needs_restart):
            return

        _publish(STATUS_UPDATING)
        existing.set_config(config)
        # Ray is restarted only when recovering from a failure.
        existing.create_or_update(restart_ray=needs_restart)
        _publish(STATUS_RUNNING)
        return

    if event_type == "DELETED":
        # Clean up first, then stop tracking the cluster.
        doomed = ray_clusters[key]
        doomed.clean_up()
        del ray_clusters[key]