def test_gcs_client_reconnect(ray_start_regular, auto_reconnect):
    """Exercise a Redis-channel ``GcsClient`` across a GCS server restart.

    When ``auto_reconnect`` is truthy the client is built with its default
    retry setting; otherwise it is built with ``nums_reconnect_retry=0``.
    After the server is killed and started again, the read must return the
    stored value unless ``auto_reconnect`` is exactly ``False``, in which
    case the read is expected to raise.
    """
    redis_client = ray.worker.global_worker.redis_client
    channel = gcs_utils.GcsChannel(redis_client=redis_client)
    gcs_client = gcs_utils.GcsClient(channel) if auto_reconnect \
        else gcs_utils.GcsClient(channel, nums_reconnect_retry=0)

    gcs_client.internal_kv_put(b"a", b"b", True, None)
    # Verify the write is readable before the server is restarted; a bare
    # comparison without ``assert`` would check nothing.
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()
    if auto_reconnect is False:
        with pytest.raises(Exception):
            gcs_client.internal_kv_get(b"a", None)
    else:
        assert gcs_client.internal_kv_get(b"a", None) == b"b"
Exemple #2
0
def test_gcs_aio_client_reconnect(ray_start_regular_with_external_redis,
                                  auto_reconnect):
    """Exercise ``GcsAioClient`` reads while the GCS server is down.

    A synchronous client first stores ``b"a" -> b"b"``. The GCS server is
    then killed, a background thread issues an async ``internal_kv_get`` on
    its own event loop, and after a 5 second sleep the server is started
    again. With ``auto_reconnect`` the async client is given 20 reconnect
    retries and must return the stored value; without it the client gets 0
    retries and the call is expected to raise.
    """
    gcs_address = ray._private.worker.global_worker.gcs_client.address
    sync_client = gcs_utils.GcsClient(address=gcs_address)

    sync_client.internal_kv_put(b"a", b"b", True, None)
    assert sync_client.internal_kv_get(b"a", None) == b"b"

    # One-element list so the worker thread can hand its result back.
    outcome = [False]

    async def async_kv_get():
        retries = 20 if auto_reconnect else 0
        aio_client = gcs_utils.GcsAioClient(
            address=gcs_address, nums_reconnect_retry=retries)
        if auto_reconnect:
            value = await aio_client.internal_kv_get(b"a", None)
            assert value == b"b"
        else:
            with pytest.raises(Exception):
                await aio_client.internal_kv_get(b"a", None)
        return True

    def kv_get():
        import asyncio

        # The worker thread needs its own event loop to run the coroutine.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        outcome[0] = loop.run_until_complete(async_kv_get())

    node = ray._private.worker._global_node
    node.kill_gcs_server()
    worker = threading.Thread(target=kv_get)
    worker.start()
    sleep(5)
    node.start_gcs_server()
    worker.join()
    assert outcome[0]
Exemple #3
0
def test_gcs_client_reconnect(ray_start_regular_with_external_redis,
                              auto_reconnect):
    """Exercise synchronous ``GcsClient`` reads while the GCS server is down.

    The client is created with 20 reconnect retries when ``auto_reconnect``
    is truthy and 0 otherwise. After a value is stored, the GCS server is
    killed, a background thread performs the read, and the server is started
    again after a 5 second sleep. The read must succeed with retries enabled
    and is expected to raise without them.
    """
    address = ray._private.worker.global_worker.gcs_client.address
    retries = 20 if auto_reconnect else 0
    client = gcs_utils.GcsClient(
        address=address, nums_reconnect_retry=retries)

    client.internal_kv_put(b"a", b"b", True, None)
    assert client.internal_kv_get(b"a", None) == b"b"

    # One-element list so the worker thread can flag that it completed.
    finished = [False]

    def kv_get():
        if auto_reconnect:
            assert client.internal_kv_get(b"a", None) == b"b"
        else:
            with pytest.raises(Exception):
                client.internal_kv_get(b"a", None)
        finished[0] = True

    node = ray._private.worker._global_node
    node.kill_gcs_server()
    worker = threading.Thread(target=kv_get)
    worker.start()
    sleep(5)
    node.start_gcs_server()
    worker.join()
    assert finished[0]
Exemple #4
0
def test_gcs_client_reconnect(ray_start_regular_with_external_redis,
                              auto_reconnect):
    """Exercise ``GcsClient`` across a GCS server kill/start cycle.

    When ``auto_reconnect`` is truthy the client is built with its default
    retry setting; otherwise it is built with ``nums_reconnect_retry=0``.
    Only the reconnecting case is asserted after the restart.
    """
    gcs_address = ray.worker.global_worker.gcs_client.address
    gcs_client = gcs_utils.GcsClient(address=gcs_address) if auto_reconnect \
        else gcs_utils.GcsClient(address=gcs_address, nums_reconnect_retry=0)

    gcs_client.internal_kv_put(b"a", b"b", True, None)
    # Verify the write is readable before the server is restarted; a bare
    # comparison without ``assert`` would check nothing.
    assert gcs_client.internal_kv_get(b"a", None) == b"b"

    ray.worker._global_node.kill_gcs_server()
    ray.worker._global_node.start_gcs_server()
    if auto_reconnect is False:
        # Intentionally unchecked: when the GCS server restarts quickly
        # there may be no connection error from internal_kv_get(), so
        # asserting that it raises here would be flaky.
        return
    assert gcs_client.internal_kv_get(b"a", None) == b"b"
def get_ray_status_output(address):
    """Fetch the autoscaler debug status and extract two of its sections.

    Args:
        address: Address passed to ``gcs_utils.GcsClient``.

    Returns:
        A dict with two keys. ``"demand"`` is the text following the
        ``"Demands:"`` marker in the rendered status. ``"usage"`` is the
        text following the ``"Usage:"`` marker within the part that
        precedes ``"Demands:"``. Each value has surrounding newlines and
        then surrounding spaces stripped.

    Raises:
        IndexError: If the rendered status lacks either marker.
    """
    gcs_client = gcs_utils.GcsClient(address=address)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    # Render and split the report once, then reuse it for both sections.
    sections = debug_status(status, error).split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
Exemple #6
0
def test_usage_lib_get_total_num_running_jobs_to_report(
        ray_start_cluster, reset_lib_usage):
    """Check the running-job count reported before and during driver sessions.

    With one node added and no driver connected the count must be 0. Each of
    two consecutive ``ray.init`` / ``ray.shutdown`` sessions must then report
    exactly 1 running job.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    gcs_client = gcs_utils.GcsClient(address=cluster.gcs_address)

    def running_jobs():
        return ray_usage_lib.get_total_num_running_jobs_to_report(gcs_client)

    assert running_jobs() == 0

    # On the second pass the job from the first pass has already finished
    # and must not be counted, so the expected value stays at 1.
    for _ in range(2):
        ray.init(address=cluster.address)
        assert running_jobs() == 1
        ray.shutdown()
def get_ray_status_output(address):
    """Fetch the autoscaler debug status and extract two of its sections.

    The GCS client is created directly from ``address`` when
    ``gcs_utils.use_gcs_for_bootstrap()`` is true; otherwise a Redis client
    is created from ``address`` and the GCS client is derived from it.

    Args:
        address: Address used to build the GCS client or the Redis client.

    Returns:
        A dict with two keys. ``"demand"`` is the text following the
        ``"Demands:"`` marker in the rendered status. ``"usage"`` is the
        text following the ``"Usage:"`` marker within the part that
        precedes ``"Demands:"``. Each value has surrounding newlines and
        then surrounding spaces stripped.

    Raises:
        IndexError: If the rendered status lacks either marker.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_client = gcs_utils.GcsClient(address=address)
    else:
        redis_client = ray._private.services.create_redis_client(address, "")
        gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    # Render and split the report once, then reuse it for both sections.
    sections = debug_status(status, error).split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
Exemple #8
0
def test_kv_timeout(ray_start_regular):
    """Check that KV calls hit their deadline while the GCS server is stopped.

    A non-reconnecting client first stores a key. Inside
    ``stop_gcs_server()``, put/get/keys/del are each called with
    ``timeout=2`` and must raise ``grpc.RpcError`` matching
    "Deadline Exceeded".
    """
    address = ray._private.worker.global_worker.gcs_client.address
    client = gcs_utils.GcsClient(address=address, nums_reconnect_retry=0)

    assert client.internal_kv_put(b"A", b"", False, b"") == 1

    # Deferred calls, issued in this order once the server is stopped.
    timed_calls = (
        lambda: client.internal_kv_put(b"A", b"B", False, b"NS", timeout=2),
        lambda: client.internal_kv_get(b"A", b"NS", timeout=2),
        lambda: client.internal_kv_keys(b"A", b"NS", timeout=2),
        lambda: client.internal_kv_del(b"A", True, b"NS", timeout=2),
    )

    with stop_gcs_server():
        for call in timed_calls:
            with pytest.raises(grpc.RpcError, match="Deadline Exceeded"):
                call()
Exemple #9
0
def test_kv_basic(ray_start_regular):
    """Walk through put/get/keys/del on the internal KV store.

    The assertions are order-dependent: each one relies on the state left
    by the calls before it.
    """
    address = ray._private.worker.global_worker.gcs_client.address
    client = gcs_utils.GcsClient(address=address, nums_reconnect_retry=0)

    kv_get = client.internal_kv_get
    kv_put = client.internal_kv_put
    kv_keys = client.internal_kv_keys
    kv_del = client.internal_kv_del
    ns = b"NS"
    other_ns = b"NSS"

    # Missing key, then first insert.
    assert kv_get(b"A", ns) is None
    assert kv_put(b"A", b"B", False, ns) == 1
    assert kv_get(b"A", ns) == b"B"

    # Third argument False: the existing value is left untouched.
    assert kv_put(b"A", b"C", False, ns) == 0
    assert kv_get(b"A", ns) == b"B"

    # Third argument True: the value is replaced, still reported as 0.
    assert kv_put(b"A", b"C", True, ns) == 0
    assert kv_get(b"A", ns) == b"C"

    # Two more keys sharing the b"A" prefix.
    assert kv_put(b"AA", b"B", False, ns) == 1
    assert kv_put(b"AB", b"B", False, ns) == 1
    assert set(kv_keys(b"A", ns)) == {b"A", b"AA", b"AB"}

    # Second argument False: only the exact key is removed.
    assert kv_del(b"A", False, ns) == 1
    assert set(kv_keys(b"A", ns)) == {b"AA", b"AB"}
    assert kv_keys(b"A", other_ns) == []

    # Second argument True: the remaining prefixed keys are removed.
    assert kv_del(b"A", True, ns) == 2
    assert kv_keys(b"A", ns) == []
    assert kv_del(b"A", False, other_ns) == 0
Exemple #10
0
def generate_report_data(
    cluster_config_to_report: ClusterConfigToReport,
    total_success: int,
    total_failed: int,
    seq_number: int,
    gcs_address: str,
) -> UsageStatsToReport:
    """Assemble the usage-stats payload for a single report.

    Cluster metadata, cluster status, library usages, node count and running
    job count are read through a GCS client created for ``gcs_address``; the
    remaining fields come from the arguments and the current time.

    Args:
        cluster_config_to_report: The cluster (autoscaler) config generated
            by `get_cluster_config_to_report`.
        total_success: Number of successful reports over the lifetime of
            the cluster.
        total_failed: Number of failed reports over the lifetime of the
            cluster.
        seq_number: Sequence number that is incremented whenever a new
            report is sent.
        gcs_address: Address of the GCS to read the reported data from.

    Returns:
        The populated ``UsageStatsToReport``.
    """
    client = gcs_utils.GcsClient(address=gcs_address, nums_reconnect_retry=20)

    metadata = get_cluster_metadata(client)
    status = get_cluster_status_to_report(client)
    config = cluster_config_to_report

    # Keyword order mirrors the order in which the values are gathered.
    return UsageStatsToReport(
        ray_version=metadata["ray_version"],
        python_version=metadata["python_version"],
        schema_version=metadata["schema_version"],
        source=metadata["source"],
        session_id=metadata["session_id"],
        git_commit=metadata["git_commit"],
        os=metadata["os"],
        collect_timestamp_ms=int(time.time() * 1000),
        session_start_timestamp_ms=metadata["session_start_timestamp_ms"],
        cloud_provider=config.cloud_provider,
        min_workers=config.min_workers,
        max_workers=config.max_workers,
        head_node_instance_type=config.head_node_instance_type,
        worker_node_instance_types=config.worker_node_instance_types,
        total_num_cpus=status.total_num_cpus,
        total_num_gpus=status.total_num_gpus,
        total_memory_gb=status.total_memory_gb,
        total_object_store_memory_gb=status.total_object_store_memory_gb,
        library_usages=get_library_usages_to_report(client),
        total_success=total_success,
        total_failed=total_failed,
        seq_number=seq_number,
        extra_usage_tags=_parse_extra_usage_tags(),
        total_num_nodes=get_total_num_nodes_to_report(client),
        total_num_running_jobs=get_total_num_running_jobs_to_report(client),
    )