Exemple #1
0
    def __init__(
        self,
        data_dir: Optional[Path] = None,
        load_existing: bool = False,
        use_gpu: bool = False,
        nvidia_visible_devices: str = "all",
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Set up the model's data directory, parameters, logger, and Docker client.

        Args:
          data_dir: Directory used to store model data.  When omitted, a new directory
            named by ``generate_uuid()`` is created under ``self.model_class_dir()``.
          load_existing: When True and a metadata file already exists in the data
            directory, the parameters stored there are used and any ``**kwargs`` are
            ignored (a warning is emitted if any were passed).  In every other case the
            data directory must be empty, and ``**kwargs`` are written out as the
            model's parameters.
          use_gpu: If True, the nvidia-docker runtime
            (https://github.com/NVIDIA/nvidia-docker) is to be used to expose NVIDIA
            GPU(s) to the container.  Expect errors on machines without an NVIDIA GPU
            and/or without the nvidia-docker runtime.  Only stored here.
          nvidia_visible_devices: Which GPUs to make available to the container; ignored
            if ``use_gpu`` is False.  Either ``'all'`` or a comma-separated string such
            as ``1,2``.  Only stored here.
          logger: Logger to use in place of the module-level ``LOGGER``.
          **kwargs: Model-specific parameters handed to :meth:`init`.

        Raises:
          ValueError: If a new model is being created and the data directory is
            not empty.
        """
        # Fall back to a freshly named directory only when the caller gave none.
        self._data_dir = (
            data_dir
            if data_dir is not None
            else self.model_class_dir() / generate_uuid()
        )
        self._data_dir.mkdir(parents=True, exist_ok=True)

        reuse_stored_params = load_existing and self.metadata_path.exists()
        if not reuse_stored_params:
            # Creating a new model: refuse to clobber whatever is already there.
            if not is_dir_empty(self._data_dir):
                raise ValueError(
                    f"data_dir '{self._data_dir}' is non-empty;"
                    " it must be empty to avoid overwriting data.")
            params = kwargs
            write_metadata(params, self.metadata_path)
        else:
            # Stored parameters win over anything the caller passed.
            params = read_metadata(self.metadata_path)
            if kwargs:
                warnings.warn(
                    "User-passed params ignored due to existing model being "
                    f"loaded: {kwargs}")

        self.use_gpu = use_gpu
        self.nvidia_visible_devices = nvidia_visible_devices

        self._logger = LOGGER if logger is None else logger

        self.docker_client = docker.from_env()

        # Model-specific initialization runs last, with the final parameter set.
        self.init(params)
Exemple #2
0
def persist_estimator(estimator: BaseEstimator) -> Path:
    """
    Dump an estimator into a newly created, uniquely named directory.

    The directory is created under ``SKLearnClassifier.model_class_dir()`` in a
    ``user_estimators`` subdirectory, and the estimator is written there via
    ``SKLearnClassifier._dump_estimator`` using the classifier's checkpoint
    filename.  Handy when you want to use an estimator without managing its
    on-disk location yourself.

    Args:
      estimator: The estimator to save.

    Returns:
      The path of the file the estimator was written to.
    """
    user_estimators_root = SKLearnClassifier.model_class_dir() / "user_estimators"

    # A fresh uuid per call keeps separately persisted estimators apart.
    target_dir = user_estimators_root / generate_uuid()
    target_dir.mkdir(exist_ok=True, parents=True)

    checkpoint_path = target_dir / SKLearnClassifier._TRAIN_OUTPUT_CHECKPOINT
    SKLearnClassifier._dump_estimator(estimator, checkpoint_path)
    return checkpoint_path
Exemple #3
0
def _run_task(
    task_func: Callable[[Any, ContainerTaskContext], Any],
    task_input: gobbli.io.TaskIO,
    root_dir: Path,
    dir_name: Optional[str] = None,
) -> gobbli.io.TaskIO:
    """
    Execute a task function inside its own task directory and record metadata.

    The task directory is ``root_dir / dir_name``; when ``dir_name`` is not given,
    a name from ``generate_uuid()`` is used instead.  A
    :class:`ContainerTaskContext` rooted at that directory is passed to
    ``task_func`` together with ``task_input``.  Metadata for the input is written
    before the task runs, and metadata for the output after it returns.

    Args:
      task_func: Callable invoked as ``task_func(task_input, context)``.
      task_input: Input object for the task; must provide ``metadata()``.
      root_dir: Parent directory under which the task directory lives.
      dir_name: Optional explicit name for the task directory.

    Returns:
      Whatever ``task_func`` returned (typed as a ``TaskIO``).

    Raises:
      ValueError: If the task directory already exists.
    """
    subdir_name = generate_uuid() if dir_name is None else dir_name
    task_root_dir = root_dir / subdir_name

    if task_root_dir.exists():
        raise ValueError(
            f"Directory '{task_root_dir}' already exists.  Supply a different `dir_name`."
        )

    context = ContainerTaskContext(task_root_dir=task_root_dir)
    metadata_filename = gobbli.io.TaskIO._METADATA_FILENAME

    # Record what went in before running the task.
    input_metadata = task_input.metadata()
    write_metadata(input_metadata, context.host_input_dir / metadata_filename)

    result = cast(gobbli.io.TaskIO, task_func(task_input, context))

    # Record what came out once the task has finished.
    output_metadata = result.metadata()
    write_metadata(output_metadata, context.host_output_dir / metadata_filename)

    return result
Exemple #4
0
    def __init__(
        self,
        data_dir: Optional[Path] = None,
        load_existing: bool = False,
        use_gpu: bool = False,
        nvidia_visible_devices: str = "all",
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Set up the model's logger, data directory, parameters, and Docker client.

        Args:
          data_dir: Directory used to store model data.  When omitted, a new directory
            named by ``generate_uuid()`` is created under ``self.model_class_dir()``.
            Either way the path is resolved to an absolute one before use.
          load_existing: When True and a metadata file already exists in the data
            directory, the parameters stored there are used and any ``**kwargs`` are
            ignored (a warning is emitted if any were passed).  In every other case the
            data directory must be empty; ``**kwargs`` are written out as the model's
            parameters along with an info file recording the class name and gobbli
            version.
          use_gpu: If True, the nvidia-docker runtime
            (https://github.com/NVIDIA/nvidia-docker) is to be used to expose NVIDIA
            GPU(s) to the container.  Expect errors on machines without an NVIDIA GPU
            and/or without the nvidia-docker runtime.  Only stored here.
          nvidia_visible_devices: Which GPUs to make available to the container; ignored
            if ``use_gpu`` is False.  Either ``'all'`` or a comma-separated string such
            as ``1,2``.  Only stored here.
          logger: Logger to use in place of the module-level ``LOGGER``.
          **kwargs: Model-specific parameters handed to :meth:`init`.

        Raises:
          ValueError: If an existing info file names a different model class, or if a
            new model is being created and the data directory is not empty.
        """
        self._logger = logger if logger is not None else LOGGER

        chosen_dir = (
            self.model_class_dir() / generate_uuid() if data_dir is None else data_dir
        )
        # Work with an absolute path so paths derived from it (and stored in
        # metadata files) are unambiguous.
        self._data_dir = chosen_dir.resolve()
        self._data_dir.mkdir(parents=True, exist_ok=True)

        class_name = self.__class__.__name__
        cur_gobbli_version = gobbli_version()

        # If the directory already describes a model, check it is compatible.
        if self.info_path.exists():
            info = read_metadata(self.info_path)
            if info["class"] != class_name:
                raise ValueError(
                    f"Model class mismatch: the model stored in {data_dir} is of "
                    f"class '{info['class']}'.  Expected '{class_name}'.")
            if info["gobbli_version"] != cur_gobbli_version:
                # A version difference is only worth a warning, not a failure.
                warnings.warn(
                    f"The model stored in {data_dir} was created with gobbli version "
                    f"{info['gobbli_version']}, but you're running version {cur_gobbli_version}. "
                    "You may encounter compatibility issues.")

        reuse_stored_params = load_existing and self.metadata_path.exists()
        if not reuse_stored_params:
            # Creating a new model: refuse to clobber whatever is already there.
            if not is_dir_empty(self._data_dir):
                raise ValueError(
                    f"data_dir '{self._data_dir}' is non-empty;"
                    " it must be empty to avoid overwriting data.")
            params = kwargs
            write_metadata(params, self.metadata_path)
            model_info = {
                "class": class_name,
                "gobbli_version": cur_gobbli_version
            }
            write_metadata(model_info, self.info_path)
        else:
            # Stored parameters win over anything the caller passed.
            params = read_metadata(self.metadata_path)
            if kwargs:
                warnings.warn(
                    "User-passed params ignored due to existing model being "
                    f"loaded: {kwargs}")

        self.use_gpu = use_gpu
        self.nvidia_visible_devices = nvidia_visible_devices

        self.docker_client = docker.from_env()

        # Model-specific initialization runs with the final parameter set.
        self.init(params)

        self._logger.info(
            f"{class_name} initialized with data directory '{self._data_dir}'")
Exemple #5
0
    def __init__(
        self,
        model_cls: Any,
        dataset: Union[Tuple[List[str], List[str]], BaseDataset],
        test_dataset: Optional[Tuple[List[str], List[str]]] = None,
        data_dir: Optional[Path] = None,
        name: Optional[str] = None,
        param_grid: Optional[Dict[str, List[Any]]] = None,
        task_num_cpus: int = 1,
        task_num_gpus: int = 0,
        worker_gobbli_dir: Optional[Path] = None,
        worker_log_level: Union[int, str] = logging.WARNING,
        limit: Optional[int] = None,
        overwrite_existing: bool = False,
        ignore_ray_initialized_error: bool = False,
        distributed: bool = False,
        ray_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Construct an experiment.

        Args:
          model_cls: The class of model to be used for the experiment.
          dataset: Dataset to be used for the experiment.  Can be either a 2-tuple
            containing a list of texts and a corresponding list of labels
            or a :class:`gobbli.dataset.base.BaseDataset` (in which case its train and
            test portions are concatenated).
          test_dataset: An optional separate dataset to be used for calculating test metrics.
            If passed, should be a 2-tuple containing a list of texts and corresponding list
            of labels.  If not passed, a test dataset will be automatically split out of
            the `dataset`.
          data_dir: Optional path to a directory used to store data for the experiment.
            If not given, a directory named after the experiment class and experiment
            name is created under ``experiment_dir()`` and used.
          name: A descriptive name for the experiment, used to label directories
            in the filesystem.  If not passed, a random name will be generated and used.
            The name must be unique (i.e., there should not be another experiment with the
            same name).
          param_grid: Optional grid of parameters.  If passed, it should be a dictionary
            with keys being valid parameter names for the passed model and values being lists
            of parameter values.  Every combination of parameter values will be tried in the
            experiment, and the results for the best combination will be returned.  If not passed,
            only the model's default parameters will be used.
          task_num_cpus: Number of CPUs to reserve per task.
          task_num_gpus: Number of GPUs to reserve per task.
          worker_gobbli_dir: Directory to use for gobbli file storage by workers.
          worker_log_level: Logging level to use for logs output by workers running
            training tasks.
          limit: Read up to this many rows from the passed dataset.  Useful for debugging.
            Only `dataset` is truncated; `test_dataset` is left as passed.
          overwrite_existing: If True, don't fail if there's an existing experiment in
            the same directory.
          ignore_ray_initialized_error: If True, don't error when a ray connection is already
            initialized; instead, shut it down and restart it with the passed `ray_kwargs`.
          distributed: If True, run the ray cluster assuming workers are distributed over
            multiple nodes.  This requires model weights for all trials to fit in the ray
            object store, which requires a lot of memory.  If False, run the ray cluster
            assuming all workers are on the master node, and weights will be passed around as
            filepaths; an error will be thrown if a remote worker tries to run a task.
          ray_kwargs: Dictionary containing keyword arguments to be passed directly to
            :func:`ray.init`.  By default, a new ray cluster will be initialized on the current
            node using all available CPUs and no GPUs, but these arguments can be used to connect
            to a remote cluster, limit resource usage, and much more.

        Raises:
          ValueError: If the experiment directory is non-empty and `overwrite_existing`
            is False, if `dataset` is a tuple of the wrong length, if `test_dataset` is
            not a 2-tuple, or if the scikit-learn classifier is combined with
            `distributed=True`.
          TypeError: If `dataset` is neither a tuple nor a ``BaseDataset``.
          RuntimeError: If a ray connection is already initialized and
            `ignore_ray_initialized_error` is False.
        """
        self.model_cls = model_cls
        self.worker_gobbli_dir = worker_gobbli_dir

        self.name = name
        if self.name is None:
            self.name = generate_uuid()

        if data_dir is None:
            self._data_dir = experiment_dir() / self.__class__.__name__ / self.name
        else:
            self._data_dir = data_dir
        self._data_dir.mkdir(parents=True, exist_ok=True)

        # Leftover files mean an experiment already used this directory.
        if not overwrite_existing and not is_dir_empty(self._data_dir):
            raise ValueError(f"Experiment already exists for name '{self.name}'")

        if isinstance(dataset, BaseDataset):
            # Pool the dataset's own train/test portions into one collection.
            self.X = dataset.X_train() + dataset.X_test()
            self.y = dataset.y_train() + dataset.y_test()
        elif isinstance(dataset, tuple):
            if len(dataset) != 2:
                raise ValueError(
                    f"`dataset` must be a 2-tuple, got length {len(dataset)}"
                )
            self.X, self.y = dataset
        else:
            raise TypeError(f"Invalid type for dataset: {type(dataset)}")

        self.X_test = None  # type: Optional[List[str]]
        self.y_test = None  # type: Optional[List[str]]
        if test_dataset is not None:
            # Validate `test_dataset` itself.  Checking `dataset` here wrongly rejected
            # a BaseDataset paired with a valid test tuple and let a malformed
            # `test_dataset` through to the unpacking below.
            if not (isinstance(test_dataset, tuple) and len(test_dataset) == 2):
                raise ValueError("`test_dataset` must be a 2-tuple")
            self.X_test, self.y_test = test_dataset

        if limit is not None:
            self.X = self.X[:limit]
            self.y = self.y[:limit]

        self.param_grid = param_grid
        if param_grid is None:
            self.param_grid = {}

        self.task_num_cpus = task_num_cpus
        self.task_num_gpus = task_num_gpus
        self.worker_log_level = worker_log_level
        self.distributed = distributed

        if self.model_cls is SKLearnClassifier and distributed:
            raise ValueError(
                "The scikit-learn classifier is not supported for distributed "
                "experiments, since it needs to load a pickle from a file path "
                "which may not be on a given worker node."
            )

        _ray_kwargs = ray_kwargs
        if _ray_kwargs is None:
            _ray_kwargs = {}

        self.is_ray_local_mode = _ray_kwargs.get("local_mode", False)

        # We may have an existing ray connection active -- throw an error or
        # clear it out to ensure it's re-initialized with the passed params
        if ray.is_initialized():
            if ignore_ray_initialized_error:
                ray.shutdown()
            else:
                raise RuntimeError(
                    "A ray connection is already initialized. To ignore this error"
                    " and shut down the existing connection, pass"
                    " `ignore_ray_initialized_error=True`."
                )

        ray.init(**_ray_kwargs)

        # Summary of the experiment setup, written to the experiment's metadata file.
        metadata = {
            "model": model_cls.__name__,
            "len_X": len(self.X),
            "len_y": len(self.y),
            "param_grid": self.param_grid,
        }

        write_metadata(metadata, self.metadata_path)