Beispiel #1
0
  def shutdown(self, timeout_ms=10000):
    """Request worker shutdown and block until the timeout has elapsed.

    A heartbeat request carrying a watchdog timeout of `timeout_ms` and the
    `WAIT_FOR_COORDINATOR` shutdown mode is handed to `self.configure`; the
    caller is then held for the same number of milliseconds.

    Args:
      timeout_ms: Watchdog timeout, in milliseconds. It is also the length of
        the pause before this method returns.
    """
    logging.info('Shutting down %s.', self)
    watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
    shutdown_request = event_pb2.WorkerHeartbeatRequest(
        watchdog_config=watchdog,
        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
    self.configure(shutdown_request)

    # The pause is optional, but skipping it risks several checkpoints being
    # triggered by the same lame worker.
    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
    pause_secs = timeout_ms / 1000
    time.sleep(pause_secs)
Beispiel #2
0
    def shutdown(self, timeout_ms=10000):
        """Send a shutdown-after-timeout request to the workers, then wait.

        Args:
          timeout_ms: Watchdog timeout placed in the request, in
            milliseconds. The method then sleeps for this long plus a further
            ten seconds.
        """
        logging.info('Shutting down %s.', self)
        watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
        self.configure(
            event_pb2.WorkerHeartbeatRequest(
                watchdog_config=watchdog,
                shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT))

        # Give the workers the full timeout, plus a ten second margin.
        grace_period = timeout_ms / 1000 + 10.0
        logging.info('Waiting %.2f seconds for worker shutdown.', grace_period)
        time.sleep(grace_period)
Beispiel #3
0
    def shutdown(self, timeout_ms=10000):
        """Send a wait-for-coordinator shutdown request, then wait.

        Args:
          timeout_ms: Watchdog timeout placed in the request, in
            milliseconds. The method then sleeps for this long plus a further
            ten seconds.
        """
        logging.info('Shutting down %s.', self)
        watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
        self.configure(
            event_pb2.WorkerHeartbeatRequest(
                watchdog_config=watchdog,
                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))

        # Returning straight away would let a new heartbeat manager be created
        # while the workers are still going down; that keeps them alive and
        # leads to confusing behavior. Hence the pause.
        grace_period = timeout_ms / 1000 + 10.0
        logging.info('Waiting %.2f seconds for worker shutdown.', grace_period)
        time.sleep(grace_period)
Beispiel #4
0
    def shutdown(self, wait_time_in_ms=60000, exit_code=None):
        """Send a shutdown-after-timeout request to the workers, then wait.

        Args:
          wait_time_in_ms: Watchdog timeout placed in the request, in
            milliseconds. The method then sleeps for this long plus a further
            ten seconds.
          exit_code: Optional value wrapped in a `RequestedExitCode` and
            attached to the request. When it is `None`, `None` is passed for
            the request's `exit_code` field instead.
        """
        logging.info('Shutting down %s.', self)
        watchdog = event_pb2.WatchdogConfig(timeout_ms=wait_time_in_ms)
        if exit_code is None:
            requested_exit = None
        else:
            requested_exit = event_pb2.RequestedExitCode(exit_code=exit_code)
        self.configure(
            event_pb2.WorkerHeartbeatRequest(
                watchdog_config=watchdog,
                shutdown_mode=event_pb2.SHUTDOWN_AFTER_TIMEOUT,
                exit_code=requested_exit))

        # Give the workers the full timeout, plus a ten second margin.
        grace_period = wait_time_in_ms / 1000 + 10.0
        logging.info('Waiting %.2f seconds for worker shutdown.', grace_period)
        time.sleep(grace_period)
Beispiel #5
0
    def ping(self, request=None, timeout_in_ms=5000):
        """Run the heartbeat ops once and decode what the workers answer.

        Args:
          request: Heartbeat request to serialize and feed to the ops. A
            default-constructed `WorkerHeartbeatRequest` is used when omitted.
          timeout_in_ms: Value for the `timeout_in_ms` field of the
            `RunOptions` passed to the session run.

        Returns:
          A list with one `WorkerHeartbeatResponse` per result of the run.
        """
        request = (
            event_pb2.WorkerHeartbeatRequest() if request is None else request)

        run_options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
        serialized_replies = self._session.run(
            self._ops,
            feed_dict={self._request_placeholder: request.SerializeToString()},
            options=run_options)

        # Each result is a serialized response proto; decode them in order.
        replies = list(
            map(event_pb2.WorkerHeartbeatResponse.FromString,
                serialized_replies))
        logging.debug('Ping results: %s', replies)
        return replies
Beispiel #6
0
    def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
        """Open a session on the hook's graph and put workers in wait mode.

        Args:
          training_session: Session whose `sess_str` becomes the target of the
            session created here.
          coord: Unused.

        Raises:
          ValueError: If there is no global step although `self.saver()`
            returns a saver.
        """
        # N.B. The global step has to be fetched now: by checkpoint time the
        # graph is frozen and it would no longer be available.
        if training_util.get_global_step() is None:
            if self.saver() is not None:
                raise ValueError(
                    'Saver defined but no global step.  Run `get_or_create_global_step()`'
                    ' in your model definition to allow checkpointing.')

        with self._graph.as_default():
            self._session = session_lib.Session(
                target=training_session.sess_str, graph=self._graph)
            worker_devices = all_worker_devices(self._session)
            self._workers = WorkerHeartbeatManager.from_devices(
                self._session, worker_devices)

            wait_request = event_pb2.WorkerHeartbeatRequest(
                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
            self._workers.configure(wait_request)
Beispiel #7
0
    def _reset_manager(self):
        """Rebuild the graph, the session and the worker heartbeat manager.

        A fresh graph and a session on `self._target` replace the current
        ones. The worker device list is looked up only while `self._devices`
        is still `None`. The new manager is finally configured with a
        watchdog timeout of `self.shutdown_timeout * 1000`.
        """
        fresh_graph = ops.Graph()
        self._graph = fresh_graph
        self._session = session_lib.Session(
            target=self._target, graph=fresh_graph)

        # Discover the worker devices once; later resets reuse the list.
        if self._devices is None:
            self._devices = all_worker_devices(self._session)

        with self._graph.as_default():
            self._worker_manager = WorkerHeartbeatManager.from_devices(
                self._session, self._devices)

        watchdog = event_pb2.WatchdogConfig(
            timeout_ms=self.shutdown_timeout * 1000)
        self._worker_manager.configure(
            event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))
  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
    """Install the graceful shutdown hook once the training session exists.

    The training session is cloned onto the hook's own graph and a
    `WorkerHeartbeatManager` is built for all worker devices seen by that
    clone. If the manager reports at least one worker, the workers are
    configured with the `WAIT_FOR_COORDINATOR` shutdown mode; otherwise a
    warning is logged and `self._heartbeat_supported` stays false.

    Args:
      training_session: Session passed to `_clone_session`.
      coord: Unused.

    Raises:
      ValueError: If there is no global step although `self.saver()` returns
        a saver.
    """
    # N.B. We have to pull the global step here to avoid it being unavailable
    # at checkpoint time; the graph has been frozen at that point.
    if training_util.get_global_step() is None and self.saver() is not None:
      raise ValueError(
          'Saver defined but no global step.  Run `get_or_create_global_step()`'
          ' in your model definition to allow checkpointing.')

    with self._graph.as_default():
      logging.info('Installing graceful shutdown hook.')
      self._session = _clone_session(training_session, self._graph)
      self._workers = WorkerHeartbeatManager.from_devices(
          self._session, all_worker_devices(self._session))
      self._heartbeat_supported = self._workers.num_workers() > 0
      if self._heartbeat_supported:
        self._workers.configure(
            event_pb2.WorkerHeartbeatRequest(
                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
      else:
        # `warn` is a deprecated alias; `warning` is the supported spelling.
        logging.warning(
            'No workers support hearbeats. Failure handling will be disabled.')
Beispiel #9
0
 def shutdown(self, timeout_ms=10000):
     """Configure the workers with a watchdog timeout of `timeout_ms`.

     Args:
       timeout_ms: Watchdog timeout placed in the heartbeat request, in
         milliseconds.
     """
     watchdog = event_pb2.WatchdogConfig(timeout_ms=timeout_ms)
     self.configure(
         event_pb2.WorkerHeartbeatRequest(watchdog_config=watchdog))