コード例 #1
0
ファイル: sender.py プロジェクト: nitinvasanth/client-ng
 def _maybe_setup_resume(self, run):
     """Query the backend for ``run`` and configure the resume offsets.

     Nothing happens unless ``self._settings.resume`` is truthy.  Otherwise
     the run's resume status is fetched through
     ``self._api.run_resume_status`` and compared with the resume mode:

     * ``"must"`` but the backend returns no status -> INVALID error.
     * ``"never"`` but the backend returns a status -> INVALID error.
     * ``"allow"``, ``"auto"`` or ``"must"`` with a status ->
       ``self._offsets`` receives the ``runtime``, ``step``, ``history``,
       ``events`` and ``output`` offsets taken from that status.

     Args:
         run: run record; ``entity``, ``project`` and ``run_id`` are read.

     Returns:
         A ``wandb_internal_pb2.ErrorInfo`` when the resume mode is
         incompatible with what the backend reports, otherwise ``None``.
     """
     resume_mode = self._settings.resume
     if not resume_mode:
         return None

     # TODO: This causes a race, we need to make the upsert atomically
     # only create or update depending on the resume config
     # we use the runs entity if set, otherwise fallback to users entity
     entity = run.entity or self._entity
     logger.info("checking resume status for %s/%s/%s", entity,
                 run.project, run.run_id)
     resume_status = self._api.run_resume_status(
         entity=entity, project_name=run.project, name=run.run_id)
     logger.info("resume status %s", resume_status)

     if resume_status is None:
         if resume_mode != "must":
             return None
         error = wandb_internal_pb2.ErrorInfo()
         error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.INVALID
         error.message = (
             "resume='must' but run (%s) doesn't exist" % run.run_id)
         return error

     if resume_mode == "never":
         error = wandb_internal_pb2.ErrorInfo()
         error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.INVALID
         error.message = "resume='never' but run (%s) exists" % run.run_id
         return error

     # "must" with an existing run has to restore the offsets too; it used
     # to fall through here without configuring anything.
     if resume_mode not in ("allow", "auto", "must"):
         return None

     def load_last_record(key):
         """Decode the last JSON row of the tail stored under ``key``.

         Each tail is decoded on its own so that an empty or malformed
         history tail does not discard a usable events tail (or vice versa).
         """
         try:
             return json.loads(json.loads(resume_status[key])[-1])
         except (IndexError, TypeError, ValueError) as e:
             # IndexError: empty tail; ValueError: invalid JSON;
             # TypeError: the tail (or its decoded value) is None.
             logger.error("unable to load resume tails", exc_info=e)
             return {}

     history = load_last_record("historyTail")
     events = load_last_record("eventsTail")

     # TODO: Do we need to restore config / summary?
     # System metrics runtime is usually greater than history
     events_rt = events.get("_runtime", 0)
     history_rt = history.get("_runtime", 0)
     self._offsets["runtime"] = max(events_rt, history_rt)
     # Continue one step after the last logged one; 0 when nothing is known.
     self._offsets["step"] = history.get("_step", -1) + 1
     self._offsets["history"] = resume_status["historyLineCount"]
     self._offsets["events"] = resume_status["eventsLineCount"]
     self._offsets["output"] = resume_status["logLineCount"]
     logger.info("configured resuming with: %s", self._offsets)
     return None
コード例 #2
0
ファイル: sender.py プロジェクト: nbardy/client
    def _maybe_setup_resume(self,
                            run) -> "Optional[wandb_internal_pb2.ErrorInfo]":
        """Query the backend for ``run`` and populate ``self._resume_state``.

        Nothing happens unless ``self._settings.resume`` is truthy.  Otherwise
        the run's resume status is fetched through
        ``self._api.run_resume_status`` and compared with the resume mode:

        * ``"must"`` but the backend returns no status -> INVALID error.
        * ``"never"`` but the backend returns a status -> INVALID error.
        * ``"must"`` but the stored tails/config/summary cannot be decoded
          -> INVALID error.
        * otherwise ``self._resume_state`` receives ``runtime``, ``step``,
          ``history``, ``events``, ``output``, ``config``, ``summary`` and
          ``resumed=True``.  For modes other than ``"must"`` a decode failure
          is logged and whatever was decoded before the failure is used.

        Args:
            run: run record; ``entity``, ``project`` and ``run_id`` are read.

        Returns:
            A ``wandb_internal_pb2.ErrorInfo`` when the resume mode is
            incompatible with what the backend reports, otherwise ``None``.
        """
        if not self._settings.resume:
            return None

        # TODO: This causes a race, we need to make the upsert atomically
        # only create or update depending on the resume config
        # we use the runs entity if set, otherwise fallback to users entity
        entity = run.entity or self._entity
        logger.info("checking resume status for %s/%s/%s", entity, run.project,
                    run.run_id)
        resume_status = self._api.run_resume_status(entity=entity,
                                                    project_name=run.project,
                                                    name=run.run_id)

        if not resume_status:
            if self._settings.resume == "must":
                error = wandb_internal_pb2.ErrorInfo()
                error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.INVALID
                error.message = "resume='must' but run (%s) doesn't exist" % run.run_id
                return error
            return None

        #
        # handle cases where we have resume_status
        #
        if self._settings.resume == "never":
            error = wandb_internal_pb2.ErrorInfo()
            error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.INVALID
            error.message = "resume='never' but run (%s) exists" % run.run_id
            return error

        # ``history`` / ``events`` always hold the decoded *last record*
        # (a dict).  The raw tail lists get their own names so that a decode
        # failure half-way through cannot leave a list behind and break the
        # ``.get`` calls below.
        history = {}
        events = {}
        config = {}
        summary = {}
        try:
            history_tail = json.loads(resume_status["historyTail"])
            if history_tail:
                history = json.loads(history_tail[-1])
            events_tail = json.loads(resume_status["eventsTail"])
            if events_tail:
                events = json.loads(events_tail[-1])
            config = json.loads(resume_status["config"] or "{}")
            summary = json.loads(resume_status["summaryMetrics"] or "{}")
        except (IndexError, TypeError, ValueError) as e:
            # ValueError: invalid JSON; TypeError: a tail is None.
            logger.error("unable to load resume tails", exc_info=e)
            if self._settings.resume == "must":
                error = wandb_internal_pb2.ErrorInfo()
                error.code = wandb_internal_pb2.ErrorInfo.ErrorCode.INVALID
                error.message = "resume='must' but could not resume (%s) " % run.run_id
                return error

        # System metrics runtime is usually greater than history
        events_rt = events.get("_runtime", 0)
        history_rt = history.get("_runtime", 0)
        self._resume_state["runtime"] = max(events_rt, history_rt)
        # Continue one step after the last logged one; 0 without history.
        self._resume_state["step"] = (history.get("_step", -1) + 1
                                      if history else 0)
        self._resume_state["history"] = resume_status["historyLineCount"]
        self._resume_state["events"] = resume_status["eventsLineCount"]
        self._resume_state["output"] = resume_status["logLineCount"]
        self._resume_state["config"] = config
        self._resume_state["summary"] = summary
        self._resume_state["resumed"] = True
        logger.info("configured resuming with: %s", self._resume_state)
        return None