Example #1
0
async def test_aio_publish_and_subscribe_logs(ray_start_regular):
    """Round-trip one log batch through the asyncio GCS pubsub clients."""
    # The GCS server address is looked up through the cluster's Redis.
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    # Subscribe to the log channel first, then publish.
    log_subscriber = GcsAioSubscriber(address=gcs_address)
    await log_subscriber.subscribe_logs()

    log_publisher = GcsAioPublisher(address=gcs_address)
    expected_batch = dict(
        ip="127.0.0.1",
        pid="gcs",
        job="0001",
        is_err=False,
        lines=["line 1", "line 2"],
        actor_name="test actor",
        task_name="test task",
    )
    await log_publisher.publish_logs(expected_batch)

    # The polled batch must equal what was published.
    assert await log_subscriber.poll_logs() == expected_batch

    await log_subscriber.close()
Example #2
0
    def _handle_failure(self, error):
        """Log a monitor-loop failure and report it to the drivers.

        Optionally tears down the autoscaler workers, records the error
        message in the internal KV store (when initialized), and publishes a
        MONITOR_DIED_ERROR through Redis and/or the GCS publisher.

        NOTE(review): `args` is neither a parameter nor a local of this
        method; presumably it is a module-level object -- confirm.

        Args:
            error: The failure to report; interpolated into the message.
        """
        logger.exception("Error in monitor loop")
        if self.autoscaler is not None:
            # Workers only share the monitor's fate when explicitly enabled
            # through the environment.
            if os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
                self.autoscaler.kill_workers()
                # Take down autoscaler workers if necessary.
                self.destroy_autoscaler_workers()

        # Something went wrong, so push an error to all current and future
        # drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

        client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)

        # Prefer an explicitly configured GCS address; otherwise look the
        # address up through Redis when GCS pubsub is enabled.
        if args.gcs_address:
            publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            publisher = GcsPublisher(
                address=get_gcs_address_from_redis(client))
        else:
            publisher = None

        from ray._private.utils import publish_error_to_driver
        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            message,
            redis_client=client,
            gcs_publisher=publisher)
Example #3
0
def test_publish_and_subscribe_logs(ray_start_regular):
    """Publish one log batch with the sync GCS publisher and poll it back."""
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    log_subscriber = GcsLogSubscriber(address=gcs_address)
    log_subscriber.subscribe()

    log_publisher = GcsPublisher(address=gcs_address)
    batch = dict(
        ip="127.0.0.1",
        pid=1234,
        job="0001",
        is_err=False,
        lines=["line 1", "line 2"],
        actor_name="test actor",
        task_name="test task",
    )
    log_publisher.publish_logs(batch)

    # PID is treated as string, so the polled batch carries "1234" even
    # though an int was published.
    batch["pid"] = "1234"
    assert log_subscriber.poll() == batch

    log_subscriber.close()
Example #4
0
 def __init__(self, logs_dir, redis_address, redis_password=None):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as `self.logs_dir`.
         redis_address: Address used to create the Redis client.
         redis_password: Optional password for the Redis client.
     """
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir
     self.redis_client = ray._private.services.create_redis_client(
         redis_address, password=redis_password)

     # A GCS publisher is only created when GCS pubsub is enabled; its
     # address is looked up through Redis.
     if gcs_pubsub.gcs_pubsub_enabled():
         self.publisher = gcs_pubsub.GcsPublisher(
             address=gcs_utils.get_gcs_address_from_redis(
                 self.redis_client))
     else:
         self.publisher = None

     # File-tracking state, initially empty.
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
Example #5
0
def test_publish_and_subscribe_function_keys(ray_start_regular):
    """Function keys published to the GCS are polled back in order."""
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    key_subscriber = GcsFunctionKeySubscriber(address=gcs_address)
    key_subscriber.subscribe()

    key_publisher = GcsPublisher(address=gcs_address)
    key_publisher.publish_function_key(b"111")
    key_publisher.publish_function_key(b"222")

    # Keys must come back in the order they were published.
    first_key = key_subscriber.poll()
    assert first_key == b"111"
    second_key = key_subscriber.poll()
    assert second_key == b"222"

    key_subscriber.close()
Example #6
0
def test_publish_error_to_driver(ray_start_regular, error_pubsub):
    """An error pushed with publish_error_to_driver reaches error_pubsub."""
    client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)

    # A GCS publisher is only passed along when GCS pubsub is enabled.
    if gcs_pubsub_enabled():
        publisher = GcsPublisher(
            address=gcs_utils.get_gcs_address_from_redis(client))
    else:
        publisher = None

    error_message = "Test error message"
    ray._private.utils.publish_error_to_driver(
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        error_message,
        redis_client=client,
        gcs_publisher=publisher)

    received = get_error_message(error_pubsub, 1,
                                 ray_constants.DASHBOARD_AGENT_DIED_ERROR)
    first_error = received[0]
    assert first_error.type == ray_constants.DASHBOARD_AGENT_DIED_ERROR
    assert first_error.error_message == error_message
Example #7
0
async def test_aio_publish_and_subscribe_error_info(ray_start_regular):
    """Errors published via the asyncio GCS client are polled back in order."""
    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    error_subscriber = GcsAioSubscriber(address=gcs_address)
    await error_subscriber.subscribe_error()

    error_publisher = GcsAioPublisher(address=gcs_address)
    first_error = ErrorTableData(error_message="test error message 1")
    second_error = ErrorTableData(error_message="test error message 2")
    await error_publisher.publish_error(b"aaa_id", first_error)
    await error_publisher.publish_error(b"bbb_id", second_error)

    # Each poll yields an (id, error) pair, in publication order.
    assert await error_subscriber.poll_error() == (b"aaa_id", first_error)
    assert await error_subscriber.poll_error() == (b"bbb_id", second_error)

    await error_subscriber.close()
Example #8
0
        raylet_pid = os.environ["RAY_RAYLET_PID"]
        node_ip = args.node_ip_address
        if restart_count >= max_restart_count:
            # The agent has failed to start too many times.
            # Push an error to all drivers, so that users know the
            # impact of the issue.
            redis_client = None
            gcs_publisher = None
            if gcs_pubsub_enabled():
                if use_gcs_for_bootstrap():
                    gcs_publisher = GcsPublisher(args.gcs_address)
                else:
                    redis_client = ray._private.services.create_redis_client(
                        args.redis_address, password=args.redis_password)
                    gcs_publisher = GcsPublisher(
                        address=get_gcs_address_from_redis(redis_client))
            else:
                redis_client = ray._private.services.create_redis_client(
                    args.redis_address, password=args.redis_password)

            traceback_str = ray._private.utils.format_error_message(
                traceback.format_exc())
            message = (
                f"(ip={node_ip}) "
                f"The agent on node {platform.uname()[1]} failed to "
                f"be restarted {max_restart_count} "
                "times. There are 3 possible problems if you see this error."
                "\n  1. The dashboard might not display correct "
                "information on this node."
                "\n  2. Metrics on this node won't be reported."
                "\n  3. runtime_env APIs won't work."
Example #9
0
        service_discovery.start()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(
            traceback.format_exc())
        message = f"The dashboard on node {platform.uname()[1]} " \
                  f"failed with the following " \
                  f"error:\n{traceback_str}"
        if isinstance(e, FrontendNotFoundError):
            logger.warning(message)
        else:
            logger.error(message)
            raise e

        # Something went wrong, so push an error to all drivers.
        redis_client = ray._private.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        gcs_publisher = None
        if args.gcs_address:
            gcs_publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            gcs_publisher = GcsPublisher(
                address=gcs_utils.get_gcs_address_from_redis(redis_client))
        ray._private.utils.publish_error_to_driver(
            redis_client,
            ray_constants.DASHBOARD_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)
Example #10
0
def test_subscribe_two_channels(ray_start_regular):
    """Tests concurrently subscribing to two channels work."""

    redis_client = ray._private.services.create_redis_client(
        ray_start_regular["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    gcs_address = gcs_utils.get_gcs_address_from_redis(redis_client)

    num_messages = 100

    # Filled by the receiver threads below, checked by the main thread.
    errors = []
    logs = []

    def receive_errors():
        """Poll the error channel until `num_messages` were collected."""
        error_subscriber = GcsErrorSubscriber(address=gcs_address)
        error_subscriber.subscribe()
        while len(errors) < num_messages:
            _, error = error_subscriber.poll()
            errors.append(error)

    def receive_logs():
        """Poll the log channel until `num_messages` were collected."""
        log_subscriber = GcsLogSubscriber(address=gcs_address)
        log_subscriber.subscribe()
        while len(logs) < num_messages:
            logs.append(log_subscriber.poll())

    error_thread = threading.Thread(target=receive_errors)
    error_thread.start()

    log_thread = threading.Thread(target=receive_logs)
    log_thread.start()

    # Interleave publications on the two channels.
    publisher = GcsPublisher(address=gcs_address)
    for i in range(num_messages):
        error_data = ErrorTableData(error_message=f"error {i}")
        publisher.publish_error(b"msg_id", error_data)
        log_batch = {
            "ip": "127.0.0.1",
            "pid": "gcs",
            "job": "0001",
            "is_err": False,
            "lines": [f"line {i}"],
            "actor_name": "test actor",
            "task_name": "test task",
        }
        publisher.publish_logs(log_batch)

    # Each receiver must finish within the timeout with every message.
    error_thread.join(timeout=10)
    assert not error_thread.is_alive(), len(errors)
    assert len(errors) == num_messages, len(errors)

    log_thread.join(timeout=10)
    assert not log_thread.is_alive(), len(logs)
    assert len(logs) == num_messages, len(logs)

    # Messages must arrive in publication order on both channels.
    for i in range(num_messages):
        assert errors[i].error_message == f"error {i}"
        assert logs[i]["lines"][0] == f"line {i}"
Example #11
0
    def __init__(
        self,
        address,
        autoscaling_config,
        redis_password=None,
        prefix_cluster_info=False,
        monitor_ip=None,
        stop_event: Optional[Event] = None,
    ):
        """Connect to the GCS and set up the monitor state.

        Args:
            address: The GCS address when `use_gcs_for_bootstrap()` is true;
                otherwise the Redis address, from which the GCS address is
                looked up.
            autoscaling_config: Stored as `self.autoscaling_config`.
            redis_password: Password for the Redis client. Only used when
                bootstrapping through Redis.
            prefix_cluster_info: Stored as `self.prefix_cluster_info`.
            monitor_ip: If set, "<monitor_ip>:<AUTOSCALER_METRIC_PORT>" is
                published under the "AutoscalerMetricsAddress" key and, when
                `prometheus_client` is available, the metrics HTTP server is
                started.
            stop_event: Can be used to signal graceful exit from the monitor
                loop.
        """
        if not use_gcs_for_bootstrap():
            # Initialize the Redis clients.
            redis_address = address
            self.redis = ray._private.services.create_redis_client(
                redis_address, password=redis_password)
            # The unpacked values are unused, but the unpacking itself
            # raises ValueError unless the address has exactly one ":".
            (ip, port) = address.split(":")
            # Initialize the gcs stub for getting all node resource usage.
            gcs_address = get_gcs_address_from_redis(self.redis)
        else:
            gcs_address = address
            redis_address = None

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
            gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        if use_gcs_for_bootstrap():
            gcs_client = GcsClient(address=gcs_address)
        else:
            worker.redis_client = self.redis
            gcs_client = GcsClient.create_from_redis(self.redis)

        # Publish the metrics address exactly once. The write goes straight
        # to the GCS client (or Redis), so it does not depend on
        # _initialize_internal_kv having run.
        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            if use_gcs_for_bootstrap():
                gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                           monitor_addr.encode(), True, None)
            else:
                self.redis.set("AutoscalerMetricsAddress", monitor_addr)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        if use_gcs_for_bootstrap():
            head_node_ip = gcs_address.split(":")[0]
        else:
            head_node_ip = redis_address.split(":")[0]
            # Only set in Redis-bootstrap mode.
            self.redis_address = redis_address
            self.redis_password = redis_password

        self.load_metrics = LoadMetrics()
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    # Bind to loopback only when the head node itself is on
                    # loopback; otherwise pass an empty address.
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry,
                )
            except Exception:
                # Best effort: a failed metrics server is logged and does
                # not abort monitor start-up.
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")