Beispiel #1
0
 def stop(self):
     """Stop the worker watchdog.

     Sends the worker manager a heartbeat request whose watchdog timeout is
     -1, then clears the `_running` flag and calls `self.join()`.
     """
     logging.info('Stopping worker watchdog.')
     disabled_watchdog = event_pb2.WatchdogConfig(timeout_ms=-1)
     heartbeat_request = event_pb2.WorkerHeartbeatRequest(
         watchdog_config=disabled_watchdog)
     self._worker_manager.configure(heartbeat_request)
     # The flag is cleared only after the workers have been reconfigured.
     self._running = False
     self.join()
Beispiel #2
0
 def __exit__(self, exc_type, exc_val, exc_tb):
     """Disable the worker watchdog when the `with` block is left.

     The exception arguments are not inspected. The method returns None, so
     an exception raised inside the `with` body is not suppressed.

     Args:
       exc_type: Exception class raised in the `with` body, or None.
       exc_val: Exception instance raised in the `with` body, or None.
       exc_tb: Traceback of that exception, or None.
     """
     logging.info('Disabling worker watchdog.')
     # A timeout of -1 is what this request uses for "disabled".
     watchdog_off = event_pb2.WatchdogConfig(timeout_ms=-1)
     disable_request = event_pb2.WorkerHeartbeatRequest(
         watchdog_config=watchdog_off)
     self._worker_manager.configure(disable_request)
     self._running = False
     self.join()
    def _reset_manager(self, stopping=False):
        """Rebuild the graph, session and worker manager, then configure it.

        A new graph and a session bound to `self._target` / `self._config`
        replace the current ones. The device list is looked up through the
        new session only when `self._devices` is still None.

        Args:
          stopping: When true, the new manager is configured with a watchdog
            timeout of -1 and `NOT_CONFIGURED` shutdown mode. Otherwise the
            timeout is `self.shutdown_timeout * 1000` and the shutdown mode
            is `WAIT_FOR_COORDINATOR`.
        """
        fresh_graph = ops.Graph()
        self._graph = fresh_graph
        self._session = session_lib.Session(
            target=self._target, graph=fresh_graph, config=self._config)

        # Device discovery happens once; later resets reuse the cached list.
        if self._devices is None:
            self._devices = all_worker_devices(self._session)

        with fresh_graph.as_default():
            self._worker_manager = WorkerHeartbeatManager.from_devices(
                self._session, self._devices)

        if not stopping:
            watchdog_timeout, mode = (self.shutdown_timeout * 1000,
                                      event_pb2.WAIT_FOR_COORDINATOR)
        else:
            watchdog_timeout, mode = -1, event_pb2.NOT_CONFIGURED

        watchdog = event_pb2.WatchdogConfig(timeout_ms=watchdog_timeout)
        self._worker_manager.configure(
            event_pb2.WorkerHeartbeatRequest(
                watchdog_config=watchdog, shutdown_mode=mode))
Beispiel #4
0
    def configure_and_run(self):
        """Enable the worker watchdog and call `self.start()`.

        Sets the `_running` flag, configures the worker manager with a
        watchdog timeout of `self.shutdown_timeout * 1000`, and finally
        invokes `self.start()`.
        """
        logging.info('Enabling worker watchdog.')
        self._running = True

        watchdog = event_pb2.WatchdogConfig(
            timeout_ms=self.shutdown_timeout * 1000)
        enable_request = event_pb2.WorkerHeartbeatRequest(
            watchdog_config=watchdog)
        self._worker_manager.configure(enable_request)

        self.start()
Beispiel #5
0
    def shutdown(self, timeout_ms=10000):
        """Ask all workers to shut down, then sleep for `timeout_ms`.

        Args:
          timeout_ms: Watchdog timeout, in milliseconds, sent to the workers.
            The call also sleeps for this long before returning.
        """
        logging.info('Shutting down %s.', self)
        watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
        self.configure(
            event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))

        # The pause is not strictly needed; it keeps the same lame worker
        # from triggering more than one checkpoint.
        logging.info('Waiting %dms for worker shutdown.', timeout_ms)
        time.sleep(timeout_ms / 1000)
Beispiel #6
0
  def shutdown(self, wait_time_in_ms=60000):
    """Ask all workers to shut down after a timeout, then wait for them.

    Args:
      wait_time_in_ms: Watchdog timeout, in milliseconds, sent with the
        `SHUTDOWN_AFTER_TIMEOUT` mode. The call sleeps for this duration
        plus a fixed 10 seconds before returning.
    """
    logging.info('Shutting down %s.', self)
    watchdog = event_pb2.WatchdogConfig(timeout_ms=wait_time_in_ms)
    shutdown_request = event_pb2.WorkerHeartbeatRequest(
        watchdog_config=watchdog,
        shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT)
    self.configure(shutdown_request)

    # Give the workers the full timeout plus a 10 second margin.
    pause_seconds = 10.0 + wait_time_in_ms / 1000
    logging.info('Waiting %.2f seconds for worker shutdown.', pause_seconds)
    time.sleep(pause_seconds)
Beispiel #7
0
    def shutdown(self, timeout_ms=10000):
        """Ask all workers to shut down, then wait for them to do so.

        Args:
          timeout_ms: Watchdog timeout, in milliseconds, sent with the
            `WAIT_FOR_COORDINATOR` mode. The call sleeps for this duration
            plus a fixed 10 seconds before returning.
        """
        logging.info('Shutting down %s.', self)
        watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
        shutdown_request = event_pb2.WorkerHeartbeatRequest(
            watchdog_config=watchdog,
            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
        self.configure(shutdown_request)

        # Returning right away would let a new heartbeat manager be created
        # before the workers have gone down, which keeps them alive and
        # leads to confusing behavior. Hence the wait.
        pause_seconds = 10.0 + timeout_ms / 1000
        logging.info('Waiting %.2f seconds for worker shutdown.',
                     pause_seconds)
        time.sleep(pause_seconds)
Beispiel #8
0
    def _reset_manager(self):
        """Rebuild the graph, session and worker manager, then configure it.

        A new graph and a session bound to `self._target` replace the
        current ones. The device list is looked up through the new session
        only when `self._devices` is still None. The new worker manager is
        configured with a watchdog timeout of `self.shutdown_timeout * 1000`.
        """
        fresh_graph = ops.Graph()
        self._graph = fresh_graph
        self._session = session_lib.Session(
            target=self._target, graph=fresh_graph)

        # Device discovery happens once; later resets reuse the cached list.
        if self._devices is None:
            self._devices = all_worker_devices(self._session)

        with fresh_graph.as_default():
            self._worker_manager = WorkerHeartbeatManager.from_devices(
                self._session, self._devices)

        watchdog = event_pb2.WatchdogConfig(
            timeout_ms=self.shutdown_timeout * 1000)
        self._worker_manager.configure(
            event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))
Beispiel #9
0
 def shutdown(self, timeout_ms=10000):
     """Send all workers a watchdog timeout of `timeout_ms`.

     The request is handed to `self.configure`; the call does not wait
     afterwards.

     Args:
       timeout_ms: Watchdog timeout, in milliseconds, placed in the request.
     """
     watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
     self.configure(
         event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))