Example #1
def upload_dataset(dataset_name):
    """ Uploads dataset from local database to Weights & Biases.

    Args:
        dataset_name: The name of the dataset in the Prodigy database.
    """
    # Check if wandb.init has been called
    if wandb.run is None:
        raise ValueError("You must call wandb.init() before upload_dataset()")

    with wb_telemetry.context(run=wandb.run) as tel:
        tel.feature.prodigy = True

    prodigy_db = util.get_module(
        "prodigy.components.db",
        required="`prodigy` library is required but not installed. Please see https://prodi.gy/docs/install",
    )
    # Retrieve and upload prodigy dataset
    database = prodigy_db.connect()
    data = database.get_dataset(dataset_name)

    array_dict_types = []
    schema = get_schema(data, {}, array_dict_types)

    for i, _d in enumerate(data):
        standardize(data[i], schema, array_dict_types)
    table = create_table(data)
    wandb.log({dataset_name: table})
    print("Prodigy dataset `" + dataset_name + "` uploaded.")
Example #2
    def __init__(self, metric_period: int = 1):
        if wandb.run is None:
            raise wandb.Error(
                "You must call `wandb.init()` before `WandbCallback()`")

        with wb_telemetry.context() as tel:
            tel.feature.catboost_wandb_callback = True

        self.metric_period: int = metric_period
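A sketch of how this constructor is typically used with CatBoost's `fit`; the import path, project name, and toy data are illustrative. `metric_period` on the callback is presumably meant to match the model's `metric_period` so the callback sees metrics at the cadence CatBoost emits them.

```python
import numpy as np
import wandb
from catboost import CatBoostClassifier
from wandb.integration.catboost import WandbCallback  # import path assumed

wandb.init(project="catboost-demo")  # hypothetical project name

X, y = np.random.rand(100, 4), np.random.randint(0, 2, 100)
model = CatBoostClassifier(iterations=50, metric_period=5, verbose=False)
# Pass the same metric_period to the callback as to the model.
model.fit(X, y, callbacks=[WandbCallback(metric_period=5)])
```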
Example #3
    def _init(env: "CallbackEnv") -> None:
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        wandb.config.update(env.params)
        log_params_list[0] = False

        if define_metric_list[0]:
            for i in range(len(env.evaluation_result_list)):
                data_type = env.evaluation_result_list[i][0]
                metric_name = env.evaluation_result_list[i][1]
                _define_metric(data_type, metric_name)
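`_init` runs lazily on the first callback invocation; here is a sketch of the surrounding usage, consistent with the `log_summary` docstring below. The import path, project name, and toy data are illustrative.

```python
import lightgbm as lgb
import numpy as np
import wandb
from wandb.integration.lightgbm import wandb_callback  # import path assumed

wandb.init(project="lgbm-demo")  # hypothetical project name

X, y = np.random.rand(200, 5), np.random.rand(200)
lgb_train = lgb.Dataset(X[:150], y[:150])
lgb_eval = lgb.Dataset(X[150:], y[150:], reference=lgb_train)

# The first callback call triggers _init, logging env.params to wandb.config
# and defining a metric for each evaluation series.
gbm = lgb.train({"objective": "regression"}, lgb_train, num_boost_round=10,
                valid_sets=[lgb_eval], valid_names=["validation"],
                callbacks=[wandb_callback()])
```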
Example #4
def log_summary(model: Booster,
                feature_importance: bool = True,
                save_model_checkpoint: bool = False) -> None:
    """Logs useful metrics about lightgbm model after training is done.

    Arguments:
        model: (Booster) is an instance of lightgbm.basic.Booster.
        feature_importance: (boolean) if True (default), logs the feature importance plot.
        save_model_checkpoint: (boolean) if True, saves the best model and uploads it as a W&B artifact.

    Using this along with `wandb_callback` will:

    - log `best_iteration` and `best_score` as `wandb.summary`.
    - log feature importance plot.
    - save and upload your best trained model to Weights & Biases Artifacts (when `save_model_checkpoint = True`)

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            ...
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=["validation"],
                        callbacks=[wandb_callback()])

        log_summary(gbm)
        ```
    """
    if wandb.run is None:
        raise wandb.Error("You must call wandb.init() before WandbCallback()")

    if not isinstance(model, Booster):
        raise wandb.Error(
            "Model should be an instance of lightgbm.basic.Booster")

    wandb.run.summary["best_iteration"] = model.best_iteration
    wandb.run.summary["best_score"] = model.best_score

    # Log feature importance
    if feature_importance:
        _log_feature_importance(model)

    if save_model_checkpoint:
        _checkpoint_artifact(model, model.best_iteration, aliases=["best"])

    with wb_telemetry.context() as tel:
        tel.feature.lightgbm_log_summary = True
Example #5
        def wrapper(self, *args, settings=settings, **kwargs):
            if not isinstance(settings, wandb.sdk.wandb_settings.Settings):
                settings = wandb.Settings()

            settings.update(
                run_group=coalesce(settings.run_group,
                                   f"{current.flow_name}/{current.run_id}"),
                source=wandb.sdk.wandb_settings.Source.INIT,
            )
            settings.update(
                run_job_type=coalesce(settings.run_job_type,
                                      current.step_name),
                source=wandb.sdk.wandb_settings.Source.INIT,
            )

            with wandb.init(settings=settings) as run:
                with wb_telemetry.context(run=run) as tel:
                    tel.feature.metaflow = True
                proxy = ArtifactProxy(self)
                run.config.update(proxy.params)
                func(proxy, *args, **kwargs)

                for name, data in proxy.inputs.items():
                    wandb_use(
                        name,
                        data,
                        datasets=datasets,
                        models=models,
                        others=others,
                        run=run,
                    )

                for name, data in proxy.outputs.items():
                    wandb_track(
                        name,
                        data,
                        datasets=datasets,
                        models=models,
                        others=others,
                        run=run,
                    )
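The wrapper above is applied per Metaflow step; here is a sketch of the decorator from the calling side, assuming the public entry point is `wandb_log` in `wandb.integration.metaflow`. The flow itself is illustrative.

```python
from metaflow import FlowSpec, step
from wandb.integration.metaflow import wandb_log  # import path assumed


class TrainFlow(FlowSpec):
    # Each decorated step gets its own run, grouped as flow_name/run_id
    # via the run_group setting seen in the wrapper above.
    @wandb_log(datasets=True, models=True)
    @step
    def start(self):
        self.learning_rate = 1e-3  # captured into run.config via ArtifactProxy
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    TrainFlow()
```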
Example #6
    def __init__(
        self,
        verbose: int = 0,
        model_save_path: Optional[str] = None,
        model_save_freq: int = 0,
        gradient_save_freq: int = 0,
    ):
        super().__init__(verbose)
        if wandb.run is None:
            raise wandb.Error("You must call wandb.init() before WandbCallback()")
        with wb_telemetry.context() as tel:
            tel.feature.sb3 = True
        self.model_save_freq = model_save_freq
        self.model_save_path = model_save_path
        self.gradient_save_freq = gradient_save_freq
        # Create the save folder if needed
        if self.model_save_path is not None:
            os.makedirs(self.model_save_path, exist_ok=True)
            self.path = os.path.join(self.model_save_path, "model.zip")
        else:
            assert (
                self.model_save_freq == 0
            ), "to use `model_save_freq` you must also set the `model_save_path` parameter"
Example #7
def log_summary(
    model: Union[CatBoostClassifier, CatBoostRegressor],
    log_all_params: bool = True,
    save_model_checkpoint: bool = False,
    log_feature_importance: bool = True,
) -> None:
    """`log_summary` logs useful metrics about catboost model after training is done

    Arguments:
        model: it can be CatBoostClassifier or CatBoostRegressor.
        log_all_params: (boolean) if True (default) log the model hyperparameters as W&B config.
        save_model_checkpoint: (boolean) if True, saves the model and uploads it as a W&B artifact.
        log_feature_importance: (boolean) if True (default) logs feature importance as W&B bar chart using the default setting of `get_feature_importance`.

    Using this along with `wandb_callback` will:

    - save the hyperparameters as W&B config,
    - log `best_iteration` and `best_score` as `wandb.summary`,
    - save and upload your trained model to Weights & Biases Artifacts (when `save_model_checkpoint = True`)
    - log feature importance plot.

    Example:
        ```python
        train_pool = Pool(train[features], label=train['label'], cat_features=cat_features)
        test_pool = Pool(test[features], label=test['label'], cat_features=cat_features)

        model = CatBoostRegressor(
            iterations=100,
            loss_function='Cox',
            eval_metric='Cox',
        )

        model.fit(
            train_pool,
            eval_set=test_pool,
            callbacks=[WandbCallback()],
        )

        log_summary(model)
        ```
    """
    if wandb.run is None:
        raise wandb.Error(
            "You must call `wandb.init()` before `log_summary()`")

    if not isinstance(model, (CatBoostClassifier, CatBoostRegressor)):
        raise wandb.Error(
            "Model should be an instance of CatBoostClassifier or CatBoostRegressor"
        )

    with wb_telemetry.context() as tel:
        tel.feature.catboost_log_summary = True

    # log configs
    params = model.get_all_params()
    if log_all_params:
        wandb.config.update(params)

    # log best score and iteration
    wandb.run.summary["best_iteration"] = model.get_best_iteration()
    wandb.run.summary["best_score"] = model.get_best_score()

    # log model
    if save_model_checkpoint:
        aliases = ["best"] if params["use_best_model"] else ["last"]
        _checkpoint_artifact(model, aliases=aliases)

    # Feature importance
    if log_feature_importance:
        _log_feature_importance(model)
Example #8
    def wandb_save(
        glob_str: Optional[str] = None,
        base_path: Optional[str] = None,
        policy: str = "live",
    ) -> Union[bool, List[str]]:
        """
        NOTE: This reimplements wandb.save, but copies files instead of symlinking.
        The symlinks have caused many issues on Windows and Google Colab.

        ORIGINAL DOCS:
        Ensure all files matching `glob_str` are synced to wandb with the policy specified.

        Arguments:
            glob_str: (string) a relative or absolute path to a unix glob or regular
                path.  If this isn't specified the method is a noop.
            base_path: (string) the base path to run the glob relative to
            policy: (string) one of `live`, `now`, or `end`
                - live: upload the file as it changes, overwriting the previous version
                - now: upload the file once now
                - end: only upload file when the run ends
        """
        if glob_str is None:
            # noop for historical reasons, run.save() may be called in legacy code
            wandb.termwarn(
                ("Calling run.save without any arguments is deprecated."
                 "Changes to attributes are automatically persisted."))
            return True
        if policy not in ("live", "end", "now"):
            raise ValueError(
                'Only "live" "end" and "now" policies are currently supported.'
            )
        if isinstance(glob_str, bytes):
            glob_str = glob_str.decode("utf-8")
        if not isinstance(glob_str, string_types):
            raise ValueError(
                "Must call wandb.save(glob_str) with glob_str a str")

        if base_path is None:
            if os.path.isabs(glob_str):
                base_path = os.path.dirname(glob_str)
                wandb.termwarn(
                    ("Saving files without folders. If you want to preserve "
                     "sub directories pass base_path to wandb.save, i.e. "
                     'wandb.save("/mnt/folder/file.h5", base_path="/mnt")'))
            else:
                base_path = ""
        wandb_glob_str = os.path.relpath(glob_str, base_path)
        if ".." + os.sep in wandb_glob_str:
            raise ValueError("globs can't walk above base_path")

        with telemetry.context(run=wandb.run) as tel:
            tel.feature.save = True

        if glob_str.startswith("gs://") or glob_str.startswith("s3://"):
            wandb.termlog(
                "%s is a cloud storage url, can't save file to wandb." %
                glob_str)
            return []
        files = glob.glob(os.path.join(wandb.run.dir, wandb_glob_str))
        warn = False
        if len(files) == 0 and "*" in wandb_glob_str:
            warn = True
        for path in glob.glob(glob_str):
            file_name = os.path.relpath(path, base_path)
            abs_path = os.path.abspath(path)
            wandb_path = os.path.join(wandb.run.dir, file_name)
            wandb.util.mkdir_exists_ok(os.path.dirname(wandb_path))
            # We overwrite symlinks because namespaces can change in Tensorboard
            if os.path.islink(
                    wandb_path) and abs_path != os.readlink(wandb_path):
                os.remove(wandb_path)
                shutil.copy(abs_path,
                            wandb.run.dir)  # os.symlink(abs_path, wandb_path)
            elif not os.path.exists(wandb_path):
                shutil.copy(abs_path,
                            wandb.run.dir)  # os.symlink(abs_path, wandb_path)
            files.append(wandb_path)
        if warn:
            file_str = "%i file" % len(files)
            if len(files) > 1:
                file_str += "s"
            wandb.termwarn(
                ("Symlinked %s into the W&B run directory, "
                 "call wandb.save again to sync new files.") % file_str)
        files_dict = dict(files=[(wandb_glob_str, policy)])
        if wandb.run._backend:
            wandb.run._backend.interface.publish_files(files_dict)
        return files
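A brief usage sketch for this copy-based `wandb_save`, assuming it is in scope and a run is active (it reads `wandb.run.dir`); the glob and policy are illustrative.

```python
import wandb

wandb.init(project="save-demo")  # hypothetical project name

# Copy matching checkpoints into the run directory; with policy="end"
# they are uploaded only when the run finishes. Returns the copied paths.
saved = wandb_save("checkpoints/*.h5", base_path=".", policy="end")
print(saved)
```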
Example #9
def torch_trace_handler():
    """Creates a trace handler for traces generated by the profiler.

    Provide as an argument to `torch.profiler.profile`:
    ```python
    torch.profiler.profile(..., on_trace_ready=wandb.profiler.torch_trace_handler())
    ```

    Calling this function ensures that profiler charts & tables can be viewed in your run dashboard
    on wandb.ai.

    Please note that `wandb.init()` must be called before this function is invoked.
    The PyTorch (torch) version must also be at least 1.9 to ensure
    stability of the Profiler API.

    Args:
        None

    Returns:
        A trace handler from `torch.profiler.tensorboard_trace_handler`, writing
        traces to a `pytorch_traces` subdirectory of the run directory.

    Raises:
        UsageError if wandb.init() hasn't been called before profiling.
        Error if torch version is less than 1.9.0.

    Examples:
    ```python
    run = wandb.init()
    run.config.id = "profile_code"

    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=wandb.profiler.torch_trace_handler(),
        record_shapes=True,
        with_stack=True,
    ) as prof:
        for i, batch in enumerate(dataloader):
            if i >= 5:
                break
            train(batch)
            prof.step()
    ```
    """
    torch = wandb.util.get_module(PYTORCH_MODULE, required=True)
    torch_profiler = wandb.util.get_module(PYTORCH_PROFILER_MODULE,
                                           required=True)
    version = tuple(
        int(x) for x in torch.__version__.replace("+cpu", "").split("."))

    if version < (1, 9, 0):
        raise Error(
            "torch version must be at least 1.9 in order to use the PyTorch Profiler API.\n"
            f"Version of torch currently installed: {torch.__version__}")

    try:
        logdir = os.path.join(wandb.run.dir, "pytorch_traces")  # type: ignore
        os.mkdir(logdir)
    except AttributeError:
        raise UsageError(
            "Please call `wandb.init()` before `wandb.profiler.torch_trace_handler()`"
        ) from None

    with telemetry.context() as tel:
        tel.feature.torch_profiler_trace = True

    return torch_profiler.tensorboard_trace_handler(logdir)