def _set_run_obj(self, run_obj):
    """Store the run object returned by the backend and sync dependent state.

    Records the run proto, refreshes the history step counter, and updates
    the sentry user scope with the run's entity/project/url.
    """
    self._run_obj = run_obj
    # TODO: Update run summary when resuming?
    self.history._update_step()
    # TODO: It feels weird to call this twice..
    run_url = self._get_run_url()
    sentry_set_scope("user", run_obj.entity, run_obj.project, run_url)
def _start_run_threads(self):
    """Spin up the background workers for an active run.

    Starts the file-stream API (with resume-aware chunk offsets for each
    streamed file), configures the internal sentry scope, and creates the
    file pusher and directory watcher.
    """
    run_id = self._run.run_id
    start_time = self._run.start_time.ToSeconds()
    fs = file_stream.FileStreamApi(
        self._api, run_id, start_time, settings=self._api_settings,
    )
    # Ensure the streaming policies resume from the proper chunk offsets.
    fs.set_file_policy("wandb-summary.json", file_stream.SummaryFilePolicy())
    fs.set_file_policy(
        "wandb-history.jsonl",
        file_stream.JsonlFilePolicy(start_chunk_id=self._resume_state["history"]),
    )
    fs.set_file_policy(
        "wandb-events.jsonl",
        file_stream.JsonlFilePolicy(start_chunk_id=self._resume_state["events"]),
    )
    fs.set_file_policy(
        "output.log",
        file_stream.CRDedupeFilePolicy(start_chunk_id=self._resume_state["output"]),
    )
    self._fs = fs
    util.sentry_set_scope(
        "internal",
        entity=self._run.entity,
        project=self._run.project,
        email=self._settings.email,
    )
    fs.start()
    self._pusher = FilePusher(self._api, silent=self._settings.silent)
    self._dir_watcher = DirWatcher(self._settings, self._api, self._pusher)
    logger.info("run started: %s with start time %s", run_id, start_time)
def handle_run(self, data):
    """Upsert the run with the backend and start file streaming/pushing.

    When the request expects a response (``data.control.req_resp``), the
    server-assigned identifiers (storage id, display name, project, entity)
    are copied back onto the request proto and it is placed on the response
    queue.
    """
    run = data.run
    tags = run.tags[:]
    # Translate the config proto into a plain dict, if one was sent.
    config_dict = (
        _config_dict_from_proto_list(run.config.update)
        if run.HasField("config")
        else None
    )
    upserted = self._api.upsert_run(
        name=run.run_id,
        entity=run.entity or None,
        project=run.project or None,
        group=run.run_group or None,
        job_type=run.job_type or None,
        display_name=run.display_name or None,
        notes=run.notes or None,
        tags=tags or None,
        config=config_dict or None,
        sweep_name=run.sweep_id or None,
    )
    if data.control.req_resp:
        # Echo the server-assigned identifiers back on the request proto.
        storage_id = upserted.get("id")
        if storage_id:
            data.run.storage_id = storage_id
        display_name = upserted.get("displayName")
        if display_name:
            data.run.display_name = display_name
        # Empty-dict defaults make the nested lookups flat; falsy values
        # skip the assignment just as the original nested checks did.
        proj_info = upserted.get("project") or {}
        proj_name = proj_info.get("name")
        if proj_name:
            data.run.project = proj_name
            self._project = proj_name
        entity_name = (proj_info.get("entity") or {}).get("name")
        if entity_name:
            data.run.entity = entity_name
            self._entity = entity_name
        self._resp_q.put(data)
    if self._entity is not None:
        self._api_settings["entity"] = self._entity
    if self._project is not None:
        self._api_settings["project"] = self._project
    self._fs = file_stream.FileStreamApi(
        self._api, run.run_id, settings=self._api_settings
    )
    self._fs.start()
    self._pusher = FilePusher(self._api)
    self._run_id = run.run_id
    sentry_set_scope("internal", run.entity, run.project)
    logger.info("run started: %s", self._run_id)
def __init__(self, config=None, settings=None):
    """Initialize the run: wire up config/summary/history callbacks,
    default all internal state, seed sentry scope, and apply the initial
    config (including the internal ``_wandb`` metadata section).

    NOTE(review): ``settings`` defaults to None but ``settings.run_id`` is
    read unconditionally below, so passing no settings raises
    AttributeError — confirm whether the default is intentional.
    """
    # Config/summary/history objects route their mutations back through
    # this run's callbacks so changes reach the backend.
    self._config = wandb_config.Config()
    self._config._set_callback(self._config_callback)
    self.summary = wandb_summary.Summary()
    self.summary._set_callback(self._summary_callback)
    self.history = wandb_history.History(self)
    self.history._set_callback(self._history_callback)
    _datatypes_set_callback(self._datatypes_callback)
    self._settings = settings
    self._wl = None
    self._backend = None
    self._reporter = None
    self._data = dict()
    # Identity fields; populated later from settings / backend response.
    self._entity = None
    self._project = None
    self._group = None
    self._job_type = None
    self._run_id = settings.run_id
    self._start_time = time.time()
    self._starting_step = 0
    self._name = None
    self._notes = None
    self._tags = None
    self._hooks = None
    # stdout/stderr redirection plumbing; set up when the run starts.
    self._redirect_cb = None
    self._out_redir = None
    self._err_redir = None
    self.stdout_redirector = None
    self.stderr_redirector = None
    self._save_stdout = None
    self._save_stderr = None
    self._stdout_slave_fd = None
    self._stderr_slave_fd = None
    # Exit/teardown state, filled in when the run finishes.
    self._exit_code = None
    self._exit_result = None
    self._final_summary = None
    # Pull info from settings
    self._init_from_settings(settings)
    # Initial scope setup for sentry. This might get changed when the
    # actual run comes back.
    sentry_set_scope("user", self._entity, self._project)
    # Returned from backend send_run_sync, set from wandb_init?
    self._run_obj = None
    # Created when the run "starts".
    self._run_status_checker = None
    # Merge caller config with the internal "_wandb" metadata section.
    config = config or dict()
    wandb_key = "_wandb"
    config.setdefault(wandb_key, dict())
    config[wandb_key]["cli_version"] = wandb.__version__
    if settings.save_code and settings.program_relpath:
        # Forward slashes keep the stored code path platform-independent.
        config[wandb_key]["code_path"] = to_forward_slash_path(
            os.path.join("code", settings.program_relpath))
    self._config._update(config)
    self._atexit_cleanup_called = None
    self._use_redirect = True
def handle_run(self, data):
    """Upsert the run with the backend, honoring resume state.

    On the first run message (``wandb.init``) this also checks resume
    status, adjusts start time/step by the stored resume offsets, and
    spins up the file-stream/pusher/watcher threads. Subsequent run
    messages only update the existing run.
    """
    run = data.run
    run_tags = run.tags[:]
    error = None
    # self._run is None only before the first run message has been handled.
    is_wandb_init = self._run is None
    # build config dict
    config_dict = None
    if run.HasField("config"):
        config_dict = _config_dict_from_proto_list(run.config.update)
        config_path = os.path.join(self._settings.files_dir, CONFIG_FNAME)
        save_config_file_from_dict(config_path, config_dict)
    repo = GitRepo(remote=self._settings.git_remote)
    if is_wandb_init:
        # Only check resume status on `wandb.init`
        error = self._maybe_setup_resume(run)
    if error is not None:
        # Resume setup failed: report via the response queue when the
        # caller is waiting, otherwise just log, and bail out early.
        if data.control.req_resp:
            resp = wandb_internal_pb2.Result(uuid=data.uuid)
            resp.run_result.run.CopyFrom(run)
            resp.run_result.error.CopyFrom(error)
            self._resp_q.put(resp)
        else:
            logger.error("Got error in async mode: %s", error.message)
        return
    # TODO: we don't check inserted currently, ultimately we should make
    # the upsert know the resume state and fail transactionally
    ups, inserted = self._api.upsert_run(
        name=run.run_id,
        entity=run.entity or None,
        project=run.project or None,
        group=run.run_group or None,
        job_type=run.job_type or None,
        display_name=run.display_name or None,
        notes=run.notes or None,
        tags=run_tags or None,
        config=config_dict or None,
        sweep_name=run.sweep_id or None,
        host=run.host or None,
        program_path=self._settings.program or None,
        repo=repo.remote_url,
        commit=repo.last_commit,
    )
    # We subtract the previous runs runtime when resuming
    start_time = run.start_time.ToSeconds() - self._offsets["runtime"]
    self._run = run
    self._run.starting_step = self._offsets["step"]
    self._run.start_time.FromSeconds(start_time)
    # Copy server-assigned identifiers from the upsert response onto the
    # run proto; each field may be absent in the response.
    storage_id = ups.get("id")
    if storage_id:
        self._run.storage_id = storage_id
    display_name = ups.get("displayName")
    if display_name:
        self._run.display_name = display_name
    project = ups.get("project")
    if project:
        project_name = project.get("name")
        if project_name:
            self._run.project = project_name
            self._project = project_name
        entity = project.get("entity")
        if entity:
            entity_name = entity.get("name")
            if entity_name:
                self._run.entity = entity_name
                self._entity = entity_name
    if data.control.req_resp:
        # Caller is blocked waiting on the result of this run message.
        resp = wandb_internal_pb2.Result(uuid=data.uuid)
        resp.run_result.run.CopyFrom(self._run)
        self._resp_q.put(resp)
    if self._entity is not None:
        self._api_settings["entity"] = self._entity
    if self._project is not None:
        self._api_settings["project"] = self._project
    # Only spin up our threads on the first run message
    if is_wandb_init:
        self._fs = file_stream.FileStreamApi(
            self._api, run.run_id, start_time, settings=self._api_settings)
        # Ensure the streaming polices have the proper offsets
        self._fs.set_file_policy("wandb-summary.json",
                                 file_stream.SummaryFilePolicy())
        self._fs.set_file_policy(
            "wandb-history.jsonl",
            file_stream.JsonlFilePolicy(
                start_chunk_id=self._offsets["history"]),
        )
        self._fs.set_file_policy(
            "wandb-events.jsonl",
            file_stream.JsonlFilePolicy(
                start_chunk_id=self._offsets["events"]),
        )
        self._fs.set_file_policy(
            "output.log",
            file_stream.CRDedupeFilePolicy(
                start_chunk_id=self._offsets["output"]),
        )
        self._fs.start()
        self._pusher = FilePusher(self._api)
        self._dir_watcher = DirWatcher(self._settings, self._api, self._pusher)
        self._tb_watcher = tb_watcher.TBWatcher(self._settings, sender=self)
        if self._run_meta:
            self._run_meta.write()
        sentry_set_scope("internal", run.entity, run.project)
        logger.info("run started: %s with start time %s",
                    self._run.run_id, start_time)
    else:
        logger.info("updated run: %s", self._run.run_id)