Example #1
0
    def _handle_failure(self, error):
        """React to a fatal error raised by the monitor loop.

        The failure is logged, autoscaler workers are torn down when an
        autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is set to
        ``"1"``, the error message is stored in the internal KV (when it is
        initialized), and the message is finally published to the drivers.

        Args:
            error: Description of the failure; it is interpolated into the
                message that is stored and published.
        """
        logger.exception("Error in monitor loop")
        fateshare_workers = (
            os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1")
        if self.autoscaler is not None and fateshare_workers:
            self.autoscaler.kill_workers()
            # Also take down the autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Surface the problem to all current and future drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

        redis_client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)

        # An explicitly configured GCS address is preferred; otherwise the
        # address is looked up through Redis when GCS pubsub is enabled.
        if args.gcs_address:
            gcs_publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            gcs_publisher = GcsPublisher(
                address=get_gcs_address_from_redis(redis_client))
        else:
            gcs_publisher = None

        from ray._private.utils import publish_error_to_driver
        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)
Example #2
0
def upload_runtime_env_package_if_needed(job_config: JobConfig) -> None:
    """Push every runtime env package that the cluster does not have yet.

    Each URI reported by the job config is checked against the cluster.
    For a missing URI, the package file is created locally from the
    configured working directory and modules (unless the file already
    exists on disk) and is then pushed to the cluster.

    Args:
        job_config (JobConfig): The job config of driver.
    """
    assert _internal_kv_initialized()
    for uri in job_config.get_runtime_env_uris():
        if package_exists(uri):
            # Already present in the cluster; nothing to upload.
            continue

        local_path = _get_local_path(uri)
        package_path = Path(local_path)
        source_dir = job_config.runtime_env.get("working_dir")
        modules = job_config.runtime_env.get("py_modules")
        exclude_patterns = job_config.runtime_env.get("excludes") or []
        logger.info(
            f"{uri} doesn't exist. Create new package with"
            f" {source_dir} and {modules}")

        # Reuse a package file that was already built locally.
        if not package_path.exists():
            create_project_package(source_dir, modules, exclude_patterns,
                                   local_path)

        # Push the data to remote storage
        pushed_bytes = push_package(uri, package_path)
        logger.info(f"{uri} has been pushed with {pushed_bytes} bytes")
Example #3
0
    def __init__(self, dashboard_agent):
        """Set up caches, the internal KV, and the runtime env managers.

        Args:
            dashboard_agent: The owning dashboard agent. Its
                ``runtime_env_dir``, ``logging_params`` and ``temp_dir``
                attributes are read here.
        """
        super().__init__(dashboard_agent)
        self._runtime_env_dir = dashboard_agent.runtime_env_dir
        self._logging_params = dashboard_agent.logging_params
        self._per_job_logger_cache = {}
        # Results of creating envs are cached so that conda and other slow
        # calls are not repeated.
        self._env_cache: Dict[str, CreatedEnvResult] = {}
        # Serialized runtime env -> lock guarding against multiple
        # concurrent installs of the same env.
        self._env_locks: Dict[str, asyncio.Lock] = {}
        # URI -> envs containing it, so the env cache can be invalidated
        # when a URI is deleted. Temporary until per-URI caching exists.
        self._uris_to_envs: Dict[str, Set[str]] = defaultdict(set)
        # The working_dir setup code relies on the internal KV.
        _initialize_internal_kv(self._dashboard_agent.gcs_client)
        assert _internal_kv_initialized()

        # One manager per kind of runtime env resource.
        self._pip_manager = PipManager(self._runtime_env_dir)
        self._conda_manager = CondaManager(self._runtime_env_dir)
        self._py_modules_manager = PyModulesManager(self._runtime_env_dir)
        self._working_dir_manager = WorkingDirManager(self._runtime_env_dir)
        self._container_manager = ContainerManager(dashboard_agent.temp_dir)

        # Each URI cache is given its manager's delete_uri callback and a
        # size constant.
        self._working_dir_uri_cache = URICache(
            self._working_dir_manager.delete_uri,
            WORKING_DIR_CACHE_SIZE_BYTES,
        )
        self._py_modules_uri_cache = URICache(
            self._py_modules_manager.delete_uri,
            PY_MODULES_CACHE_SIZE_BYTES,
        )
        self._conda_uri_cache = URICache(
            self._conda_manager.delete_uri,
            CONDA_CACHE_SIZE_BYTES,
        )
        self._pip_uri_cache = URICache(
            self._pip_manager.delete_uri,
            PIP_CACHE_SIZE_BYTES,
        )
        self._logger = default_logger
Example #4
0
    def _run(self):
        """Run the monitor loop.

        Every pass refreshes the load metrics, resource requests and event
        summary, runs an autoscaler update when an autoscaler is set, and
        writes a JSON status report to the internal KV when it is
        initialized. The loop ends once ``stop_event`` is set.
        """
        while not (self.stop_event and self.stop_event.is_set()):
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()
            load_report = self.load_metrics.summary()._asdict()
            status = {
                "load_metrics_report": load_report,
                "time": time.time(),
                "monitor_pid": os.getpid(),
            }

            # Process autoscaling actions
            if self.autoscaler:
                self.autoscaler.update()
                autoscaler_report = self.autoscaler.summary()._asdict()
                status["autoscaler_report"] = autoscaler_report

                # Log the summarized events, then reset the summarizer.
                for event_msg in self.event_summarizer.summary():
                    logger.info(
                        f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}"
                        f"{event_msg}")
                self.event_summarizer.clear()

            serialized_status = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS,
                    serialized_status,
                    overwrite=True,
                )

            # Pause for one autoscaler update interval before the next
            # round of messages is processed.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Example #5
0
    def _handle_failure(self, error):
        """React to a fatal error raised by the monitor loop.

        The failure is logged, autoscaler workers are torn down when an
        autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is set to
        ``"1"``, the error message is stored in the internal KV (when it is
        initialized), and the message is published through a GCS publisher.

        Args:
            error: Description of the failure; it is interpolated into the
                message that is stored and published.
        """
        logger.exception("Error in monitor loop")
        fateshare_workers = (
            os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
        )
        if self.autoscaler is not None and fateshare_workers:
            self.autoscaler.kill_workers()
            # Also take down the autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Surface the problem to all current and future drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(
                ray_constants.DEBUG_AUTOSCALING_ERROR,
                message,
                overwrite=True,
            )
        publisher = GcsPublisher(address=args.gcs_address)
        from ray._private.utils import publish_error_to_driver

        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR, message, gcs_publisher=publisher
        )
Example #6
0
    def _run(self):
        """Run the monitor loop.

        Every pass refreshes the raylet map, load metrics and resource
        requests, runs an autoscaler update when an autoscaler is set, and
        writes a JSON status report to the internal KV when it is
        initialized. The loop has no exit condition of its own.
        """
        while True:
            self.update_raylet_map()
            self.update_load_metrics()
            self.update_resource_requests()
            load_report = self.load_metrics.summary()._asdict()
            status = {"load_metrics_report": load_report}

            # Process autoscaling actions
            if self.autoscaler:
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                status["autoscaler_report"] = autoscaler_summary._asdict()

            serialized_status = json.dumps(status)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS,
                    serialized_status,
                    overwrite=True,
                )

            # Pause for one autoscaler update interval before the next
            # round of messages is processed.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Example #7
0
    def __init__(self, namespace: str = None):
        """Initialize with an optional namespace.

        The internal KV must already be initialized when this runs.

        Args:
            namespace: Namespace string, or ``None``. ``None`` and the
                empty string are both stored as ``""``.

        Raises:
            TypeError: If ``namespace`` is neither ``None`` nor a ``str``.
        """
        assert ray_kv._internal_kv_initialized()
        if namespace is not None and not isinstance(namespace, str):
            # Message grammar fixed: "must a string" -> "must be a string".
            raise TypeError("namespace must be a string, got: {}.".format(
                type(namespace)))

        self.namespace = namespace or ""
Example #8
0
def ensure_runtime_env_setup(pkg_uris: List[str]) -> Optional[str]:
    """Make sure all required packages are available locally.

    Each package is fetched while holding a per-package file lock, and the
    directory returned by the fetch is inserted at the front of
    ``sys.path``.

    Args:
        pkg_uris (List[str]): URIs of the packages for the runtime env.

    Return:
        The directory of the last fetched package as a string, or None
        when ``pkg_uris`` is empty or the last fetch result is falsy.
    """
    assert _internal_kv_initialized()
    last_pkg_dir = None
    for uri in pkg_uris:
        # The lock keeps multiple processes on a node from downloading
        # the same package concurrently.
        lock_path = str(Path(_get_local_path(uri))) + ".lock"
        with FileLock(lock_path):
            last_pkg_dir = fetch_package(uri)
        sys.path.insert(0, str(last_pkg_dir))

    # Right now, multiple pkg_uris are not supported correctly; the last
    # package directory is reported as the working directory.
    if last_pkg_dir:
        return str(last_pkg_dir)
    return None
Example #9
0
 def register(self, category, key, value):
     """Pickle ``value`` under ``(category, key)`` and flush if possible.

     Raises:
         TuneError: If ``category`` is not one of ``KNOWN_CATEGORIES``.
     """
     if category not in KNOWN_CATEGORIES:
         from ray.tune import TuneError
         error_text = "Unknown category {} not among {}".format(
             category, KNOWN_CATEGORIES)
         raise TuneError(error_text)

     pickled_value = pickle.dumps(value)
     self._to_flush[(category, key)] = pickled_value
     # Pending values can only be flushed once the internal KV is ready.
     if not _internal_kv_initialized():
         return
     self.flush_values()
Example #10
0
 def register(self, category, key, value):
     """Store the pickled ``value`` for ``(category, key)``.

     The entry is queued in ``_to_flush``; queued entries are flushed
     right away when the internal KV is initialized.

     Raises:
         TuneError: If ``category`` is not one of ``KNOWN_CATEGORIES``.
     """
     if category not in KNOWN_CATEGORIES:
         from ray.tune import TuneError
         raise TuneError(
             "Unknown category {} not among {}".format(
                 category, KNOWN_CATEGORIES))

     entry_key = (category, key)
     self._to_flush[entry_key] = pickle.dumps(value)
     kv_ready = _internal_kv_initialized()
     if kv_ready:
         self.flush_values()
Example #11
0
    def __init__(self, dashboard_head):
        """Initialize stub placeholders and the job status client.

        Args:
            dashboard_head: The owning dashboard head; a reference is kept
                on the instance.
        """
        super().__init__(dashboard_head)
        # Both GCS stubs start out unset.
        self._gcs_job_info_stub = self._gcs_actor_info_stub = None
        self._dashboard_head = dashboard_head

        # The internal KV has to be initialized before the status client
        # is created.
        assert _internal_kv_initialized()
        self._job_status_client = JobStatusStorageClient()
Example #12
0
    def _run(self):
        """Run the monitor loop.

        Every pass refreshes the load metrics, resource requests and event
        summary, runs an autoscaler update when an autoscaler is set and
        load metrics are available, and writes a JSON status report to the
        internal KV when it is initialized. The loop ends once
        ``stop_event`` is set. An exception raised during a pass is logged
        and the loop continues when ``retry_on_failure`` is truthy;
        otherwise the exception is re-raised.
        """
        while True:
            try:
                if self.stop_event and self.stop_event.is_set():
                    break
                self.update_load_metrics()
                self.update_resource_requests()
                self.update_event_summary()
                load_report = asdict(self.load_metrics.summary())
                status = {
                    "load_metrics_report": load_report,
                    "time": time.time(),
                    "monitor_pid": os.getpid(),
                }

                if self.autoscaler:
                    if not self.load_metrics:
                        # load_metrics is Falsey iff no resource messages
                        # have been collected from the GCS yet, which can
                        # happen at startup before the GCS has received
                        # data from the Raylets. Skip the autoscaler update
                        # and wait for load metrics instead.
                        logger.info(
                            "Autoscaler has not yet received load metrics. Waiting."
                        )
                    else:
                        # Process autoscaling actions
                        self.autoscaler.update()
                        summary = self.autoscaler.summary()
                        if summary:
                            status["autoscaler_report"] = asdict(summary)

                        for event_msg in self.event_summarizer.summary():
                            # Every line of the message needs the prefix
                            # for it to get pushed to the driver logs.
                            for line in event_msg.split("\n"):
                                logger.info(
                                    f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}"
                                    f"{line}"
                                )
                        self.event_summarizer.clear()

                serialized_status = json.dumps(status)
                if _internal_kv_initialized():
                    _internal_kv_put(
                        ray_constants.DEBUG_AUTOSCALING_STATUS,
                        serialized_status,
                        overwrite=True,
                    )
            except Exception:
                # The monitor only keeps running after a failure when
                # retry_on_failure is set.
                if not self.retry_on_failure:
                    raise
                logger.exception("Monitor: Execution exception. Trying again...")

            # Pause for one autoscaler update interval before the next
            # round of messages is processed.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Example #13
0
def _put_pre_init_library_usages():
    """Put every recorded library usage when running in script mode.

    Nothing is put when the global worker's mode is not
    ``ray.SCRIPT_MODE``.
    """
    assert _internal_kv_initialized()
    # NOTE: When the lib is imported from a worker, ray should always be
    # initialized already, so the pre init hook is not needed there.
    worker_mode = ray._private.worker.global_worker.mode
    if worker_mode != ray.SCRIPT_MODE:
        return
    for recorded_usage in _recorded_library_usages:
        _put_library_usage(recorded_usage)
Example #14
0
    def __init__(self, dashboard_head):
        """Initialize stub placeholders and the internal KV.

        Args:
            dashboard_head: The owning dashboard head; its ``gcs_client``
                is used to initialize the internal KV.
        """
        super().__init__(dashboard_head)
        # Both GCS stubs start out unset.
        self._gcs_job_info_stub = self._gcs_actor_info_stub = None
        self._dashboard_head = dashboard_head

        # The working_dir setup code relies on the internal KV.
        _initialize_internal_kv(dashboard_head.gcs_client)
        assert _internal_kv_initialized()
Example #15
0
 def get(self, category, key):
     """Return the unpickled value registered for ``(category, key)``.

     The internal KV is consulted when it is initialized; otherwise the
     value is taken from the entries that are still waiting to be flushed.

     Raises:
         ValueError: If the internal KV holds no value for the entry.
     """
     if not _internal_kv_initialized():
         return pickle.loads(self._to_flush[(category, key)])

     stored = _internal_kv_get(_make_key(category, key))
     if stored is None:
         raise ValueError(
             "Registry value for {}/{} doesn't exist.".format(
                 category, key))
     return pickle.loads(stored)
Example #16
0
 def __init__(self, dashboard_head):
     """Initialize stub placeholders, the job info client and a thread pool.

     Args:
         dashboard_head: The owning dashboard head; a reference is kept
             on the instance.
     """
     super().__init__(dashboard_head)
     # Both GCS stubs start out unset.
     self._gcs_job_info_stub = self._gcs_actor_info_stub = None
     self._dashboard_head = dashboard_head
     assert _internal_kv_initialized()
     self._job_info_client = JobInfoStorageClient()
     # For offloading CPU intensive work.
     self._thread_pool = concurrent.futures.ThreadPoolExecutor(
         thread_name_prefix="api_head",
         max_workers=2,
     )
Example #17
0
 def get(self, category, key):
     """Look up and unpickle the value stored for ``(category, key)``.

     Without an initialized internal KV the value comes from the entries
     that have not been flushed yet.

     Raises:
         ValueError: If the internal KV holds no value for the entry.
     """
     kv_ready = _internal_kv_initialized()
     if kv_ready:
         payload = _internal_kv_get(_make_key(category, key))
         if payload is not None:
             return pickle.loads(payload)
         raise ValueError(
             "Registry value for {}/{} doesn't exist.".format(
                 category, key))
     return pickle.loads(self._to_flush[(category, key)])
Example #18
0
def _put_library_usage(library_usage: str):
    """Write a library usage marker to the internal KV.

    Failures of the put are logged at debug level and are not propagated.

    Args:
        library_usage: Name appended to the library usage key prefix.
    """
    assert _internal_kv_initialized()
    try:
        usage_key = f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}"
        _internal_kv_put(
            usage_key, "", namespace=usage_constant.USAGE_STATS_NAMESPACE
        )
    except Exception as err:
        logger.debug(f"Failed to put library usage, {err}")
Example #19
0
 def update_resource_requests(self):
     """Fetches resource requests from the internal KV and updates load.

     Does nothing when the internal KV is not initialized or when no data
     is stored. Errors while parsing or applying the data are logged and
     not propagated.
     """
     if not _internal_kv_initialized():
         return
     raw_request = _internal_kv_get(
         ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)
     if not raw_request:
         return
     try:
         parsed_request = json.loads(raw_request)
         self.load_metrics.set_resource_requests(parsed_request)
     except Exception:
         logger.exception("Error parsing resource requests")
Example #20
0
 def log_info_string(self, nodes):
     """Build the cluster status text, store it, and log it at debug level.

     The text is made of the node info, the load metrics info and the
     resource demand scheduler's debug string, one per line. It is written
     to the internal KV only when the KV is initialized.

     Args:
         nodes: Passed to ``info_string`` and to the resource demand
             scheduler's ``debug_string``.
     """
     node_section = self.info_string(nodes)
     load_section = self.load_metrics.info_string()
     demand_section = self.resource_demand_scheduler.debug_string(
         nodes,
         self.pending_launches.breakdown(),
         self.load_metrics.get_resource_utilization(),
     )
     status_text = "Cluster status: " + "\n".join(
         [node_section, load_section, demand_section])
     if _internal_kv_initialized():
         _internal_kv_put(
             DEBUG_AUTOSCALING_STATUS, status_text, overwrite=True)
     logger.debug(status_text)
Example #21
0
    def register(self, category, key, value):
        """Registers the value with the global registry.

        The pickled value is queued in ``_to_flush``; queued values are
        flushed right away when the internal KV is initialized.

        Raises:
            TuneError if ``category`` is not among ``KNOWN_CATEGORIES``.
            PicklingError if unable to pickle to provided file.
        """
        if category not in KNOWN_CATEGORIES:
            from ray.tune import TuneError
            error_text = "Unknown category {} not among {}".format(
                category, KNOWN_CATEGORIES)
            raise TuneError(error_text)

        pickled_value = pickle.dumps_debug(value)
        self._to_flush[(category, key)] = pickled_value
        if not _internal_kv_initialized():
            return
        self.flush_values()
Example #22
0
 def run(self):
     """Install signal handlers, then initialize and run the autoscaler.

     Any exception is handed to ``_handle_failure`` as a formatted
     traceback and then re-raised.
     """
     # Register signal handlers for autoscaler termination.
     for signum in (signal.SIGINT, signal.SIGTERM):
         signal.signal(signum, self._signal_handler)

     try:
         # Delete any previous autoscaling errors.
         if _internal_kv_initialized():
             _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
         self._initialize_autoscaler()
         self._run()
     except Exception:
         failure_trace = traceback.format_exc()
         self._handle_failure(failure_trace)
         raise
Example #23
0
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used)

    A usage that was already recorded is ignored. A new usage is always
    remembered locally; it is additionally put to the KV store when the
    internal KV is initialized and the global worker is in script mode.
    """
    if library_usage in _recorded_library_usages:
        return
    _recorded_library_usages.add(library_usage)

    # The KV is not initialized if the library is imported before
    # ray.init. Only the driver reports, to reduce the load on the kv
    # store.
    should_report = (
        _internal_kv_initialized()
        and ray.worker.global_worker.mode == ray.SCRIPT_MODE
    )
    if should_report:
        _put_library_usage(library_usage)
Example #24
0
def legacy_log_info_string(autoscaler, nodes):
    """Build the legacy cluster status text, store it, and debug-log it.

    The text is made of the node info, the load metrics info and the
    resource demand scheduler's debug string, one per line. It is written
    to the internal KV only when the KV is initialized.

    Args:
        autoscaler: Object providing ``load_metrics``,
            ``resource_demand_scheduler`` and ``pending_launches``.
        nodes: Passed to ``info_string`` and to the scheduler's
            ``debug_string``.
    """
    node_section = info_string(autoscaler, nodes)
    load_section = autoscaler.load_metrics.info_string()
    demand_section = autoscaler.resource_demand_scheduler.debug_string(
        nodes,
        autoscaler.pending_launches.breakdown(),
        autoscaler.load_metrics.get_resource_utilization(),
    )
    status_text = "Cluster status: " + "\n".join(
        [node_section, load_section, demand_section]
    )
    if _internal_kv_initialized():
        _internal_kv_put(
            DEBUG_AUTOSCALING_STATUS_LEGACY, status_text, overwrite=True
        )
    logger.debug(status_text)
Example #25
0
    def __init__(self, dashboard_agent):
        """Set up the env caches and initialize the internal KV.

        Args:
            dashboard_agent: The owning dashboard agent. Its
                ``runtime_env_dir`` and ``logging_params`` attributes are
                read here.
        """
        super().__init__(dashboard_agent)
        self._runtime_env_dir = dashboard_agent.runtime_env_dir
        self._logging_params = dashboard_agent.logging_params
        self._per_job_logger_cache = {}
        # Results of creating envs are cached so that conda and other slow
        # calls are not repeated.
        self._env_cache: Dict[str, CreatedEnvResult] = {}
        # Serialized runtime env -> lock guarding against multiple
        # concurrent installs of the same env.
        self._env_locks: Dict[str, asyncio.Lock] = {}

        # The working_dir setup code relies on the internal KV.
        _initialize_internal_kv(self._dashboard_agent.gcs_client)
        assert _internal_kv_initialized()
Example #26
0
 def update(self):
     """Run one autoscaler update, tolerating a bounded number of failures.

     A failure is logged, stored in the internal KV when it is
     initialized, and counted. The exception is re-raised only once the
     failure count exceeds ``max_failures``.
     """
     try:
         self.reset(errors_fatal=False)
         self._update()
     except Exception as exc:
         logger.exception("StandardAutoscaler: Error during autoscaling.")
         if _internal_kv_initialized():
             _internal_kv_put(
                 DEBUG_AUTOSCALING_ERROR,
                 str(exc),
                 overwrite=True,
             )
         self.num_failures += 1
         too_many_failures = self.num_failures > self.max_failures
         if too_many_failures:
             logger.critical("StandardAutoscaler: Too many errors, abort.")
             raise exc
Example #27
0
def package_exists(pkg_uri: str) -> bool:
    """Tell whether the package identified by ``pkg_uri`` exists.

    Args:
        pkg_uri (str): The uri of the package

    Return:
        True for package existing and False for not.

    Raises:
        NotImplementedError: If the URI's protocol is neither
            ``Protocol.GCS`` nor ``Protocol.PIN_GCS``.
    """
    assert _internal_kv_initialized()
    protocol, _ = _parse_uri(pkg_uri)
    if protocol not in (Protocol.GCS, Protocol.PIN_GCS):
        raise NotImplementedError(f"Protocol {protocol} is not supported")
    # For the supported protocols, existence is checked in the internal KV.
    return _internal_kv_exists(pkg_uri)
Example #28
0
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used)

    A usage that was already recorded is ignored. A new usage is always
    remembered locally; it is additionally put to the KV store when the
    internal KV is initialized and the global worker is in script or
    worker mode.
    """
    if library_usage in _recorded_library_usages:
        return
    _recorded_library_usages.add(library_usage)

    # The KV is not initialized if the library is imported before ray.init
    if not _internal_kv_initialized():
        return

    # Only drivers / workers report lib usage. Otherwise it could be
    # reported when the library is imported from e.g., API server.
    reporting_modes = (ray.SCRIPT_MODE, ray.WORKER_MODE)
    if ray._private.worker.global_worker.mode in reporting_modes:
        _put_library_usage(library_usage)
Example #29
0
    def _handle_failure(self, error):
        """React to a fatal error raised by the monitor loop.

        The failure is logged, autoscaler workers are torn down when an
        autoscaler exists, the error message is stored in the internal KV
        (when it is initialized), and the message is pushed to the drivers
        through Redis.

        Args:
            error: Description of the failure; it is interpolated into the
                message that is stored and pushed.
        """
        logger.exception("Error in monitor loop")
        if self.autoscaler is not None:
            self.autoscaler.kill_workers()
            # Also take down the autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Surface the problem to all current and future drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_ERROR,
                message,
                overwrite=True,
            )
        redis_conn = ray._private.services.create_redis_client(
            args.redis_address,
            password=args.redis_password,
        )
        from ray.utils import push_error_to_driver_through_redis
        push_error_to_driver_through_redis(
            redis_conn, ray_constants.MONITOR_DIED_ERROR, message)
Example #30
0
    def _run(self):
        """Run the monitor loop.

        Every pass refreshes the load metrics, resource requests and event
        summary, runs an autoscaler update when an autoscaler is set, and
        writes a JSON status report to the internal KV when it is
        initialized. The loop ends once ``stop_event`` is set. Exceptions
        raised during a pass are logged and the loop carries on.
        """
        while True:
            try:
                if self.stop_event and self.stop_event.is_set():
                    break
                self.update_load_metrics()
                self.update_resource_requests()
                self.update_event_summary()
                load_report = asdict(self.load_metrics.summary())
                status = {
                    "load_metrics_report": load_report,
                    "time": time.time(),
                    "monitor_pid": os.getpid(),
                }

                # Process autoscaling actions
                if self.autoscaler:
                    self.autoscaler.update()
                    autoscaler_report = asdict(self.autoscaler.summary())
                    status["autoscaler_report"] = autoscaler_report

                    for event_msg in self.event_summarizer.summary():
                        # Every line of the message needs the prefix for
                        # it to get pushed to the driver logs.
                        for line in event_msg.split("\n"):
                            logger.info(
                                f"{ray_constants.LOG_PREFIX_EVENT_SUMMARY}"
                                f"{line}"
                            )
                    self.event_summarizer.clear()

                serialized_status = json.dumps(status)
                if _internal_kv_initialized():
                    _internal_kv_put(
                        ray_constants.DEBUG_AUTOSCALING_STATUS,
                        serialized_status,
                        overwrite=True,
                    )
            except Exception:
                logger.exception("Monitor: Execution exception. Trying again...")

            # Pause for one autoscaler update interval before the next
            # round of messages is processed.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Example #31
0
 def update(self):
     """Run one autoscaler update, tolerating a bounded number of failures.

     A failure is logged and stored in the internal KV when it is
     initialized. It is counted unless the provider type is
     ``"kubernetes"`` and the exception is a ``MaxRetryError``. The
     exception is re-raised once the failure count exceeds
     ``max_failures``.
     """
     try:
         self.reset(errors_fatal=False)
         self._update()
     except Exception as exc:
         logger.exception("StandardAutoscaler: Error during autoscaling.")
         if _internal_kv_initialized():
             _internal_kv_put(
                 DEBUG_AUTOSCALING_ERROR,
                 str(exc),
                 overwrite=True,
             )
         # Don't abort the autoscaler if the K8s API server is down.
         # https://github.com/ray-project/ray/issues/12255
         provider_type = self.config["provider"]["type"]
         k8s_api_unreachable = (
             provider_type == "kubernetes"
             and isinstance(exc, MaxRetryError))
         if not k8s_api_unreachable:
             self.num_failures += 1
         too_many_failures = self.num_failures > self.max_failures
         if too_many_failures:
             logger.critical("StandardAutoscaler: Too many errors, abort.")
             raise exc
Example #32
0
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used)

    A usage that was already recorded is ignored. A new usage is
    remembered locally; it is additionally put to the KV store when the
    internal KV is initialized and the global worker is in script or
    worker mode.

    Raises:
        ValueError: If a not yet recorded ``library_usage`` contains "-".
    """
    if library_usage in _recorded_library_usages:
        return
    # - is not permitted since it should be used as a separator
    # of the lib usage file name. See LibUsageRecorder.
    if "-" in library_usage:
        raise ValueError(
            "The library name contains a char - which is not permitted.")
    _recorded_library_usages.add(library_usage)

    # The KV is not initialized if the library is imported before ray.init
    if not _internal_kv_initialized():
        return

    # Only drivers / workers report lib usage. Otherwise it could be
    # reported when the library is imported from e.g., API server.
    reporting_modes = (ray.SCRIPT_MODE, ray.WORKER_MODE)
    if ray._private.worker.global_worker.mode in reporting_modes:
        _put_library_usage(library_usage)
Example #33
0
 def contains(self, category, key):
     """Return whether a value is registered for ``(category, key)``.

     The internal KV is consulted when it is initialized; otherwise the
     entries that are still waiting to be flushed are checked.
     """
     if not _internal_kv_initialized():
         return (category, key) in self._to_flush
     stored = _internal_kv_get(_make_key(category, key))
     return stored is not None