def upload_dataset(dataset_name):
    """Uploads a dataset from the local Prodigy database to Weights & Biases.

    Args:
        dataset_name: The name of the dataset in the Prodigy database.
    """
    # Check if wandb.init has been called
    if wandb.run is None:
        raise ValueError("You must call wandb.init() before upload_dataset()")

    with wb_telemetry.context(run=wandb.run) as tel:
        tel.feature.prodigy = True

    prodigy_db = util.get_module(
        "prodigy.components.db",
        required="`prodigy` library is required but not installed. Please see https://prodi.gy/docs/install",
    )

    # Retrieve and upload the Prodigy dataset
    database = prodigy_db.connect()
    data = database.get_dataset(dataset_name)

    array_dict_types = []
    schema = get_schema(data, {}, array_dict_types)
    for i, _d in enumerate(data):
        standardize(data[i], schema, array_dict_types)

    table = create_table(data)
    wandb.log({dataset_name: table})
    print(f"Prodigy dataset `{dataset_name}` uploaded.")
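# A hedged usage sketch for `upload_dataset` above. The project and dataset
# names are hypothetical placeholders; any dataset saved in the local Prodigy
# database works the same way, and the table is logged under the dataset name.
import wandb

wandb.init(project="prodigy-datasets")  # must be called before upload_dataset()
upload_dataset("ner_news_headlines")    # logged as a wandb.Table under this key
wandb.finish()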
def __init__(self, metric_period: int = 1):
    if wandb.run is None:
        raise wandb.Error("You must call `wandb.init()` before `WandbCallback()`")

    with wb_telemetry.context() as tel:
        tel.feature.catboost_wandb_callback = True

    self.metric_period: int = metric_period
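# A hedged usage sketch for the CatBoost callback whose `__init__` is shown
# above, assuming the surrounding class is the integration's `WandbCallback`.
# `metric_period` should match the `metric_period` given to the model so every
# reported iteration is logged. The training data here is synthetic.
import numpy as np
import wandb
from catboost import CatBoostClassifier

wandb.init(project="catboost-demo")  # required before constructing the callback
X, y = np.random.rand(100, 4), np.random.randint(0, 2, 100)
model = CatBoostClassifier(iterations=20, metric_period=5)
model.fit(X, y, callbacks=[WandbCallback(metric_period=5)])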
def _init(env: "CallbackEnv") -> None:
    with wb_telemetry.context() as tel:
        tel.feature.lightgbm_wandb_callback = True

    wandb.config.update(env.params)
    log_params_list[0] = False

    if define_metric_list[0]:
        for i in range(len(env.evaluation_result_list)):
            data_type = env.evaluation_result_list[i][0]
            metric_name = env.evaluation_result_list[i][1]
            _define_metric(data_type, metric_name)
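# Context for `_init` above: it runs once, on the first invocation of the
# LightGBM `wandb_callback`. Each entry of `env.evaluation_result_list` is a
# tuple like ("valid_0", "l2", 0.25, False), i.e. (dataset name, metric name,
# value, is_higher_better), so the loop registers one W&B metric per
# dataset/metric pair. A hedged end-to-end sketch with synthetic data:
import lightgbm as lgb
import numpy as np
import wandb

wandb.init(project="lgbm-demo")
X, y = np.random.rand(200, 5), np.random.rand(200)
lgb_train = lgb.Dataset(X[:150], y[:150])
lgb_eval = lgb.Dataset(X[150:], y[150:], reference=lgb_train)
gbm = lgb.train({"objective": "regression"}, lgb_train,
                num_boost_round=10, valid_sets=[lgb_eval],
                callbacks=[wandb_callback()])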
def log_summary(
    model: Booster,
    feature_importance: bool = True,
    save_model_checkpoint: bool = False,
) -> None:
    """Logs useful metrics about the lightgbm model after training is done.

    Arguments:
        model: (Booster) is an instance of lightgbm.basic.Booster.
        feature_importance: (boolean) if True (default), logs the feature
            importance plot.
        save_model_checkpoint: (boolean) if True, saves the best model and
            uploads it as a W&B artifact.

    Using this along with `wandb_callback` will:

    - log `best_iteration` and `best_score` as `wandb.summary`.
    - log the feature importance plot.
    - save and upload your best trained model to Weights & Biases Artifacts
      (when `save_model_checkpoint = True`)

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            # ...
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=[lgb_eval],
                        valid_names=['validation'],
                        callbacks=[wandb_callback()])

        log_summary(gbm)
        ```
    """
    if wandb.run is None:
        raise wandb.Error("You must call wandb.init() before log_summary()")

    if not isinstance(model, Booster):
        raise wandb.Error("Model should be an instance of lightgbm.basic.Booster")

    wandb.run.summary["best_iteration"] = model.best_iteration
    wandb.run.summary["best_score"] = model.best_score

    # Log feature importance
    if feature_importance:
        _log_feature_importance(model)

    if save_model_checkpoint:
        _checkpoint_artifact(model, model.best_iteration, aliases=["best"])

    with wb_telemetry.context() as tel:
        tel.feature.lightgbm_log_summary = True
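# A brief follow-on sketch to the docstring example above: with
# `save_model_checkpoint=True` the best booster is also uploaded as a W&B
# artifact (aliased "best"), alongside the summary metrics and the
# feature-importance bar chart. `gbm` is the Booster trained above.
log_summary(gbm, feature_importance=True, save_model_checkpoint=True)
print(wandb.run.summary["best_iteration"], wandb.run.summary["best_score"])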
def wrapper(self, *args, settings=settings, **kwargs):
    if not isinstance(settings, wandb.sdk.wandb_settings.Settings):
        settings = wandb.Settings()

    settings.update(
        run_group=coalesce(
            settings.run_group, f"{current.flow_name}/{current.run_id}"
        ),
        source=wandb.sdk.wandb_settings.Source.INIT,
    )
    settings.update(
        run_job_type=coalesce(settings.run_job_type, current.step_name),
        source=wandb.sdk.wandb_settings.Source.INIT,
    )

    with wandb.init(settings=settings) as run:
        with wb_telemetry.context(run=run) as tel:
            tel.feature.metaflow = True

        proxy = ArtifactProxy(self)
        run.config.update(proxy.params)
        func(proxy, *args, **kwargs)

        for name, data in proxy.inputs.items():
            wandb_use(
                name,
                data,
                datasets=datasets,
                models=models,
                others=others,
                run=run,
            )

        for name, data in proxy.outputs.items():
            wandb_track(
                name,
                data,
                datasets=datasets,
                models=models,
                others=others,
                run=run,
            )
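# A hedged sketch of how this wrapper is reached in practice, assuming the
# public entry point is the `wandb_log` step decorator (matching the
# `datasets`/`models`/`others` closure variables above); `load_data` is a
# hypothetical helper. Step inputs flow through `wandb_use`, step outputs
# through `wandb_track`.
from metaflow import FlowSpec, step
from wandb.integration.metaflow import wandb_log

class TrainFlow(FlowSpec):
    @wandb_log(datasets=True, models=True)
    @step
    def start(self):
        self.raw_df = load_data()  # hypothetical; tracked as a dataset output
        self.next(self.end)

    @step
    def end(self):
        pass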
def __init__(
    self,
    verbose: int = 0,
    model_save_path: Optional[str] = None,
    model_save_freq: int = 0,
    gradient_save_freq: int = 0,
):
    super().__init__(verbose)
    if wandb.run is None:
        raise wandb.Error("You must call wandb.init() before WandbCallback()")

    with wb_telemetry.context() as tel:
        tel.feature.sb3 = True

    self.model_save_freq = model_save_freq
    self.model_save_path = model_save_path
    self.gradient_save_freq = gradient_save_freq

    # Create the save folder if needed
    if self.model_save_path is not None:
        os.makedirs(self.model_save_path, exist_ok=True)
        self.path = os.path.join(self.model_save_path, "model.zip")
    else:
        assert (
            self.model_save_freq == 0
        ), "to use `model_save_freq` you have to set the `model_save_path` parameter"
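# A minimal usage sketch for the SB3 callback above, following the standard
# stable-baselines3 PPO API; the project name and paths are illustrative.
# `sync_tensorboard=True` mirrors SB3's TensorBoard scalars into the same run.
import gym
import wandb
from stable_baselines3 import PPO

run = wandb.init(project="sb3-demo", sync_tensorboard=True)
model = PPO("MlpPolicy", gym.make("CartPole-v1"), tensorboard_log=f"runs/{run.id}")
model.learn(
    total_timesteps=1000,
    callback=WandbCallback(model_save_path=f"models/{run.id}",
                           model_save_freq=500, verbose=2),
)
run.finish()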
def log_summary(
    model: Union[CatBoostClassifier, CatBoostRegressor],
    log_all_params: bool = True,
    save_model_checkpoint: bool = False,
    log_feature_importance: bool = True,
) -> None:
    """`log_summary` logs useful metrics about the catboost model after training is done.

    Arguments:
        model: an instance of CatBoostClassifier or CatBoostRegressor.
        log_all_params: (boolean) if True (default), logs the model
            hyperparameters as W&B config.
        save_model_checkpoint: (boolean) if True, saves the model and uploads
            it as a W&B artifact.
        log_feature_importance: (boolean) if True (default), logs feature
            importance as a W&B bar chart using the default settings of
            `get_feature_importance`.

    Using this along with `wandb_callback` will:

    - save the hyperparameters as W&B config,
    - log `best_iteration` and `best_score` as `wandb.summary`,
    - save and upload your trained model to Weights & Biases Artifacts
      (when `save_model_checkpoint = True`),
    - log the feature importance plot.

    Example:
        ```python
        train_pool = Pool(train[features], label=train['label'], cat_features=cat_features)
        test_pool = Pool(test[features], label=test['label'], cat_features=cat_features)

        model = CatBoostRegressor(
            iterations=100,
            loss_function='Cox',
            eval_metric='Cox',
        )

        model.fit(
            train_pool,
            eval_set=test_pool,
            callbacks=[WandbCallback()],
        )

        log_summary(model)
        ```
    """
    if wandb.run is None:
        raise wandb.Error("You must call `wandb.init()` before `log_summary()`")

    if not isinstance(model, (CatBoostClassifier, CatBoostRegressor)):
        raise wandb.Error(
            "Model should be an instance of CatBoostClassifier or CatBoostRegressor"
        )

    with wb_telemetry.context() as tel:
        tel.feature.catboost_log_summary = True

    # Log configs
    params = model.get_all_params()
    if log_all_params:
        wandb.config.update(params)

    # Log best score and iteration
    wandb.run.summary["best_iteration"] = model.get_best_iteration()
    wandb.run.summary["best_score"] = model.get_best_score()

    # Log model
    if save_model_checkpoint:
        aliases = ["best"] if params["use_best_model"] else ["last"]
        _checkpoint_artifact(model, aliases=aliases)

    # Feature importance
    if log_feature_importance:
        _log_feature_importance(model)
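# A brief follow-on sketch to the docstring example above. Note that CatBoost's
# `get_best_score()` returns a nested dict keyed by eval set, so the logged
# summary value looks roughly like {'learn': {'Cox': ...}, 'validation': {'Cox': ...}}.
log_summary(model, save_model_checkpoint=True)
print(wandb.run.summary["best_iteration"])
print(wandb.run.summary["best_score"])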
def wandb_save(
    glob_str: Optional[str] = None,
    base_path: Optional[str] = None,
    policy: str = "live",
) -> Union[bool, List[str]]:
    """
    NOTE: This reimplements wandb.save, but copies files instead of symlinking.
    The symlinks have caused many issues on Windows and Google Colab.

    ORIGINAL DOCS:

    Ensure all files matching `glob_str` are synced to wandb with the policy specified.

    Arguments:
        glob_str: (string) a relative or absolute path to a unix glob or regular
            path. If this isn't specified the method is a noop.
        base_path: (string) the base path to run the glob relative to
        policy: (string) one of `live`, `now`, or `end`
            - live: upload the file as it changes, overwriting the previous version
            - now: upload the file once now
            - end: only upload the file when the run ends
    """
    if glob_str is None:
        # noop for historical reasons, run.save() may be called in legacy code
        wandb.termwarn(
            "Calling run.save without any arguments is deprecated. "
            "Changes to attributes are automatically persisted."
        )
        return True

    if policy not in ("live", "end", "now"):
        raise ValueError(
            'Only "live", "end" and "now" policies are currently supported.'
        )

    if isinstance(glob_str, bytes):
        glob_str = glob_str.decode("utf-8")
    if not isinstance(glob_str, string_types):
        raise ValueError("Must call wandb.save(glob_str) with glob_str a str")

    if base_path is None:
        if os.path.isabs(glob_str):
            base_path = os.path.dirname(glob_str)
            wandb.termwarn(
                "Saving files without folders. If you want to preserve "
                "sub directories pass base_path to wandb.save, i.e. "
                'wandb.save("/mnt/folder/file.h5", base_path="/mnt")'
            )
        else:
            base_path = ""

    wandb_glob_str = os.path.relpath(glob_str, base_path)
    if ".." + os.sep in wandb_glob_str:
        raise ValueError("globs can't walk above base_path")

    with telemetry.context(run=wandb.run) as tel:
        tel.feature.save = True

    if glob_str.startswith("gs://") or glob_str.startswith("s3://"):
        wandb.termlog(
            "%s is a cloud storage url, can't save file to wandb." % glob_str
        )
        return []

    files = glob.glob(os.path.join(wandb.run.dir, wandb_glob_str))
    warn = False
    if len(files) == 0 and "*" in wandb_glob_str:
        warn = True

    for path in glob.glob(glob_str):
        file_name = os.path.relpath(path, base_path)
        abs_path = os.path.abspath(path)
        wandb_path = os.path.join(wandb.run.dir, file_name)
        wandb.util.mkdir_exists_ok(os.path.dirname(wandb_path))
        # We overwrite symlinks because namespaces can change in Tensorboard
        if os.path.islink(wandb_path) and abs_path != os.readlink(wandb_path):
            os.remove(wandb_path)
            shutil.copy(abs_path, wandb.run.dir)  # os.symlink(abs_path, wandb_path)
        elif not os.path.exists(wandb_path):
            shutil.copy(abs_path, wandb.run.dir)  # os.symlink(abs_path, wandb_path)
        files.append(wandb_path)

    if warn:
        file_str = "%i file" % len(files)
        if len(files) > 1:
            file_str += "s"
        wandb.termwarn(
            "Copied %s into the W&B run directory, "
            "call wandb.save again to sync new files." % file_str
        )

    files_dict = dict(files=[(wandb_glob_str, policy)])
    if wandb.run._backend:
        wandb.run._backend.interface.publish_files(files_dict)
    return files
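# A hedged usage sketch for `wandb_save` above; the paths are illustrative.
# Passing base_path preserves the checkpoints/ prefix inside the run files,
# per the docstring's own warning about paths without folders.
import wandb

wandb.init(project="save-demo")
wandb_save("checkpoints/*.h5", base_path=".", policy="end")  # copies, no symlinks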
def torch_trace_handler():
    """Creates a trace handler for traces generated by the profiler.

    Provide as an argument to `torch.profiler.profile`:

    ```python
    torch.profiler.profile(..., on_trace_ready=wandb.profiler.torch_trace_handler())
    ```

    Calling this function ensures that profiler charts & tables can be viewed
    in your run dashboard on wandb.ai.

    Please note that `wandb.init()` must be called before this function is
    invoked. The PyTorch (torch) version must also be at least 1.9, in order
    to ensure stability of their Profiler API.

    Args:
        None

    Returns:
        A TensorBoard trace handler that writes traces into the run directory.

    Raises:
        UsageError if wandb.init() hasn't been called before profiling.
        Error if the torch version is less than 1.9.0.

    Examples:
    ```python
    run = wandb.init()
    run.config.id = "profile_code"

    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=wandb.profiler.torch_trace_handler(),
        record_shapes=True,
        with_stack=True,
    ) as prof:
        for i, batch in enumerate(dataloader):
            if i >= 5:
                break
            train(batch)
            prof.step()
    ```
    """
    torch = wandb.util.get_module(PYTORCH_MODULE, required=True)
    torch_profiler = wandb.util.get_module(PYTORCH_PROFILER_MODULE, required=True)

    # Strip any local build suffix (e.g. "+cpu", "+cu113") before parsing
    version = tuple(map(int, torch.__version__.split("+")[0].split(".")))
    if version < (1, 9, 0):
        raise Error(
            f"torch version must be at least 1.9 in order to use the PyTorch Profiler API.\n"
            f"Version of torch currently installed: {torch.__version__}"
        )

    try:
        logdir = os.path.join(wandb.run.dir, "pytorch_traces")  # type: ignore
        os.mkdir(logdir)
    except AttributeError:
        raise UsageError(
            "Please call `wandb.init()` before `wandb.profiler.torch_trace_handler()`"
        ) from None

    with telemetry.context() as tel:
        tel.feature.torch_profiler_trace = True

    return torch_profiler.tensorboard_trace_handler(logdir)
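# A short sketch of the two guards above: torch < 1.9 raises Error before any
# directory is created, and calling the handler before `wandb.init()` hits the
# AttributeError branch (`wandb.run` is None, so `wandb.run.dir` fails) and is
# re-raised as UsageError. Assumes the handler is exposed as
# `wandb.profiler.torch_trace_handler`, as the docstring shows.
import wandb

run = wandb.init(project="profiler-demo")
handler = wandb.profiler.torch_trace_handler()  # safe: init() already ran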