Example 1
        def train(
            X_train: Any,
            y_train: Any,
            X_valid: Any,
            y_valid: Any,
            train_batch_size: int,
            valid_batch_size: int,
            num_train_epochs: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> RemoteTrainResult:

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            clf.build()

            train_input = gobbli.io.TrainInput(
                X_train=X_train,
                y_train=y_train,
                X_valid=X_valid,
                y_valid=y_valid,
                train_batch_size=train_batch_size,
                valid_batch_size=valid_batch_size,
                num_train_epochs=num_train_epochs,
            )
            train_output = clf.train(train_input)
            checkpoint = train_output.checkpoint
            checkpoint_name = getattr(checkpoint, "name", None)

            if distributed:
                # Copy weights into the object store, since we don't share a filesystem
                # with the master node
                checkpoint = (dir_to_blob(checkpoint.parent)
                              if checkpoint is not None else None)

            if not is_ray_local_mode():
                checkpoint = ray.put(checkpoint)

            return RemoteTrainResult(
                metadata=train_output.metadata(),
                labels=train_output.labels,
                checkpoint_name=checkpoint_name,
                checkpoint_id=checkpoint,
                model_params=model_params,
                ip_address=worker_ip,
            )
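
The RemoteTrainResult container returned above is not defined in this listing. A minimal sketch consistent with the fields populated here might look like the following; the actual gobbli definition may differ.

    # Sketch only: inferred from the return statement above, not gobbli's actual definition.
    from dataclasses import dataclass
    from typing import Any, Dict, List, Optional

    @dataclass
    class RemoteTrainResult:
        metadata: Dict[str, Any]          # training metrics, e.g. valid_loss
        labels: List[str]                 # label set seen during training
        checkpoint_name: Optional[str]    # file name of the checkpoint, if any
        checkpoint_id: Any                # ray object ID, blob bytes, or Path
        model_params: Dict[str, Any]      # hyperparameters used for this run
        ip_address: str                   # worker that produced the result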
Example 2
        def predict(
            X_test: List[str],
            test_batch_size: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            labels: List[str],
            checkpoint: Optional[Union[bytes, Path]],
            checkpoint_name: Optional[str],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> pd.DataFrame:

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            # The build step isn't always strictly necessary (e.g. when it only downloads
            # pretrained weights that the checkpoint below replaces), but some models need
            # it; BERT, for example, downloads its vocabulary files and config here.
            clf.build()

            # Use the current working directory (CWD) as the base for the tempdir, on the
            # assumption that the CWD is included in any bind mounts/volumes the user created
            # if this is running inside a Docker container.  If the tempdir isn't part of a
            # host mount, its files won't be visible inside the model's container.
            with tempfile.TemporaryDirectory(dir=".") as tempdir:
                tempdir_path = Path(tempdir)

                checkpoint_path = None  # type: Optional[Path]
                if isinstance(checkpoint, bytes):
                    if checkpoint_name is not None:
                        blob_to_dir(checkpoint, tempdir_path)
                        checkpoint_path = tempdir_path / checkpoint_name
                elif isinstance(checkpoint, Path):
                    checkpoint_path = checkpoint
                elif checkpoint is None:
                    pass
                else:
                    raise TypeError(
                        f"invalid checkpoint type: '{type(checkpoint)}'")

                predict_input = gobbli.io.PredictInput(
                    X=X_test,
                    labels=labels,
                    checkpoint=checkpoint_path,
                    predict_batch_size=test_batch_size,
                )
                predict_output = clf.predict(predict_input)

            return predict_output.y_pred_proba
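
The helpers dir_to_blob and blob_to_dir used to move checkpoints through Ray's object store are not shown in this listing. A rough sketch of what such a pair could look like, assuming a tar-based round trip (an assumption, not gobbli's implementation):

    # Hypothetical sketch of the blob helpers; gobbli's actual implementation may differ.
    import io
    import tarfile
    from pathlib import Path

    def dir_to_blob(directory: Path) -> bytes:
        """Pack a directory into an in-memory gzipped tar archive."""
        buf = io.BytesIO()
        with tarfile.open(fileobj=buf, mode="w:gz") as tar:
            tar.add(str(directory), arcname=".")
        return buf.getvalue()

    def blob_to_dir(blob: bytes, target_dir: Path) -> None:
        """Unpack an archive produced by dir_to_blob into target_dir."""
        with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as tar:
            tar.extractall(str(target_dir))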
Example 3
    def run(
        self,
        dataset_split: Optional[Union[Tuple[float, float],
                                      Tuple[float, float, float]]] = None,
        seed: int = 1,
        train_batch_size: int = 32,
        valid_batch_size: int = 32,
        test_batch_size: int = 32,
        num_train_epochs: int = 5,
    ) -> ClassificationExperimentResults:
        """
        Run the experiment.

        Args:
          dataset_split: A tuple describing the proportion of the dataset
            to be added to the train/validation/test splits.  If the experiment uses an explicit
            test set (passes :paramref:`BaseExperiment.params.test_dataset`), this should be a
            2-tuple describing the train/validation split.  Otherwise, it should be a 3-tuple
            describing the train/validation/test split. The tuple must sum to 1.
          seed: Random seed to be used for dataset splitting for reproducibility.
          train_batch_size: Number of observations per batch on the training dataset.
          valid_batch_size: Number of observations per batch on the validation dataset.
          test_batch_size: Number of observations per batch on the test dataset.
          num_train_epochs: Number of epochs to use for training.

        Returns:
          The results of the experiment.
        """
        _dataset_split = dataset_split

        # If the user didn't pass an explicit test set, create one
        # using a split
        if self.X_test is None:
            if _dataset_split is None:
                _dataset_split = (
                    ClassificationExperiment._DEFAULT_TRAIN_VALID_TEST_SPLIT)

            ClassificationExperiment._validate_split(_dataset_split,
                                                     expected_len=3)

            # cast needed to satisfy mypy
            train_prop, valid_prop, test_prop = cast(
                Tuple[float, float, float], _dataset_split)
            train_valid_prop = train_prop + valid_prop

            X_train_valid, X_test, y_train_valid, y_test = train_test_split(
                self.X,
                self.y,
                train_size=train_valid_prop,
                test_size=test_prop)

        else:
            if _dataset_split is None:
                _dataset_split = ClassificationExperiment._DEFAULT_TRAIN_VALID_SPLIT

            ClassificationExperiment._validate_split(_dataset_split,
                                                     expected_len=2)

            # cast needed to satisfy mypy
            train_prop, valid_prop = cast(Tuple[float, float], _dataset_split)
            train_valid_prop = 1

            X_train_valid, y_train_valid = self.X, self.y
            X_test, y_test = self.X_test, self.y_test

        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_valid,
            y_train_valid,
            # Round to prevent floating point imprecision errors
            train_size=round(train_prop / train_valid_prop, 4),
            test_size=round(valid_prop / train_valid_prop, 4),
        )
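        # Worked example for the call above (illustration only): with
        # dataset_split=(0.7, 0.15, 0.15), train_valid_prop is 0.85, so the split uses
        # train_size=round(0.7/0.85, 4)=0.8235 and test_size=round(0.15/0.85, 4)=0.1765
        # of the combined train/validation data.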

        if self.param_grid is not None:
            for param, values in self.param_grid.items():
                if isinstance(values, str):
                    raise TypeError(
                        f"String detected in parameter grid values for parameter '{param}'. "
                        "This will be treated as a list of character parameter values, "
                        "which probably isn't what you want.  If you're really sure, "
                        "convert the string to a list of characters and try again."
                    )
        grid = ParameterGrid(self.param_grid)
        if len(grid) == 0:
            raise ValueError("empty parameter grid")
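        # For instance (hypothetical parameter names), param_grid={"lr": [1e-3, 1e-4],
        # "dropout": [0.1, 0.2]} expands to 4 parameter combinations, each trained as a
        # separate Ray task below.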

        # Transfer datasets to the Ray distributed object store
        # if not running in local mode
        # In local mode, this causes problems: https://github.com/ray-project/ray/issues/5379
        if is_ray_local_mode():
            dataset_ids = [X_train, y_train, X_valid, y_valid]
        else:
            dataset_ids = [
                ray.put(d) for d in (X_train, y_train, X_valid, y_valid)
            ]

        # Store the checkpoint in the object store as its own entry (rather than embedding
        # it in the train result) so it doesn't have to be copied to the object store again
        # when the predict task uses it
        @ray.remote(num_cpus=self.task_num_cpus, num_gpus=self.task_num_gpus)
        def train(
            X_train: Any,
            y_train: Any,
            X_valid: Any,
            y_valid: Any,
            train_batch_size: int,
            valid_batch_size: int,
            num_train_epochs: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> RemoteTrainResult:

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            clf.build()

            train_input = gobbli.io.TrainInput(
                X_train=X_train,
                y_train=y_train,
                X_valid=X_valid,
                y_valid=y_valid,
                train_batch_size=train_batch_size,
                valid_batch_size=valid_batch_size,
                num_train_epochs=num_train_epochs,
            )
            train_output = clf.train(train_input)
            checkpoint = train_output.checkpoint
            checkpoint_name = getattr(checkpoint, "name", None)

            if distributed:
                # Copy weights into the object store, since we don't share a filesystem
                # with the master node
                checkpoint = (dir_to_blob(checkpoint.parent)
                              if checkpoint is not None else None)

            if not is_ray_local_mode():
                checkpoint = ray.put(checkpoint)

            return RemoteTrainResult(
                metadata=train_output.metadata(),
                labels=train_output.labels,
                checkpoint_name=checkpoint_name,
                checkpoint_id=checkpoint,
                model_params=model_params,
                ip_address=worker_ip,
            )

        @ray.remote(num_cpus=self.task_num_cpus, num_gpus=self.task_num_gpus)
        def predict(
            X_test: List[str],
            test_batch_size: int,
            model_cls: Any,
            model_params: Dict[str, Any],
            labels: List[str],
            checkpoint: Optional[Union[bytes, Path]],
            checkpoint_name: Optional[str],
            master_ip: str,
            gobbli_dir: Optional[Path] = None,
            log_level: Union[int, str] = logging.WARNING,
            distributed: bool = False,
        ) -> pd.DataFrame:

            logger = init_worker_env(gobbli_dir=gobbli_dir,
                                     log_level=log_level)
            use_gpu, nvidia_visible_devices = init_gpu_config()

            worker_ip = get_worker_ip()
            if not distributed and worker_ip != master_ip:
                raise RuntimeError(
                    "Experiments must be started with distributed = True to run "
                    "tasks on remote workers.")

            clf = model_cls(
                **model_params,
                use_gpu=use_gpu,
                nvidia_visible_devices=nvidia_visible_devices,
                logger=logger,
            )

            # The build step isn't always strictly necessary (e.g. when it only downloads
            # pretrained weights that the checkpoint below replaces), but some models need
            # it; BERT, for example, downloads its vocabulary files and config here.
            clf.build()

            # Use the current working directory (CWD) as the base for the tempdir, on the
            # assumption that the CWD is included in any bind mounts/volumes the user created
            # if this is running inside a Docker container.  If the tempdir isn't part of a
            # host mount, its files won't be visible inside the model's container.
            with tempfile.TemporaryDirectory(dir=".") as tempdir:
                tempdir_path = Path(tempdir)

                checkpoint_path = None  # type: Optional[Path]
                if isinstance(checkpoint, bytes):
                    if checkpoint_name is not None:
                        blob_to_dir(checkpoint, tempdir_path)
                        checkpoint_path = tempdir_path / checkpoint_name
                elif isinstance(checkpoint, Path):
                    checkpoint_path = checkpoint
                elif checkpoint is None:
                    pass
                else:
                    raise TypeError(
                        f"invalid checkpoint type: '{type(checkpoint)}'")

                predict_input = gobbli.io.PredictInput(
                    X=X_test,
                    labels=labels,
                    checkpoint=checkpoint_path,
                    predict_batch_size=test_batch_size,
                )
                predict_output = clf.predict(predict_input)

            return predict_output.y_pred_proba

        # Record the IP address of the master node so workers can detect when they're
        # running on a remote host without distributed mode enabled and raise an error
        master_ip = get_worker_ip()

        # Run training in parallel using the Ray cluster
        raw_results = ray.get([
            train.remote(
                *dataset_ids,
                train_batch_size,
                valid_batch_size,
                num_train_epochs,
                self.model_cls,
                params,
                master_ip,
                self.worker_gobbli_dir,
                self.worker_log_level,
                self.distributed,
            ) for params in grid
        ])

        training_results = []  # type: List[Dict[str, Any]]
        best_valid_loss = math.inf
        best_result = None  # type: Optional[RemoteTrainResult]
        best_checkpoint_id = None  # type: Optional[ray.ObjectID]

        for train_results in raw_results:
            result = {
                **train_results.metadata,
                "node_ip_address": train_results.ip_address,
                "model_params": train_results.model_params,
            }
            if result["valid_loss"] < best_valid_loss:
                best_result = train_results
                best_checkpoint_id = train_results.checkpoint_id
                best_valid_loss = result["valid_loss"]

            training_results.append(result)

        if best_result is None:
            raise ValueError(
                "failed to find parameter combination with finite validation loss"
            )

        # Evaluate the best model on the test set
        if is_ray_local_mode():
            X_test_id = X_test
        else:
            X_test_id = ray.put(X_test)
        y_pred_proba = ray.get(
            predict.remote(
                X_test_id,
                test_batch_size,
                self.model_cls,
                best_result.model_params,
                best_result.labels,
                best_checkpoint_id,
                best_result.checkpoint_name,
                master_ip,
                self.worker_gobbli_dir,
                self.worker_log_level,
                self.distributed,
            ))

        best_checkpoint = best_checkpoint_id
        if not is_ray_local_mode():
            best_checkpoint = ray.get(best_checkpoint_id)

        return ClassificationExperimentResults(
            X=X_test,
            labels=best_result.labels,  # type: ignore
            y_true=y_test,
            y_pred_proba=y_pred_proba,
            training_results=training_results,
            best_model_checkpoint=cast(Union[bytes, Path], best_checkpoint),
            best_model_checkpoint_name=best_result.checkpoint_name,
        )
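
As a usage note, a short sketch of consuming the ClassificationExperimentResults returned by run. The experiment construction is omitted, and the assumption that y_pred_proba holds one probability column per label is not confirmed by this listing.

    # Hypothetical downstream usage; `experiment` is an already-constructed
    # ClassificationExperiment (its constructor is not part of this listing).
    import pandas as pd

    results = experiment.run(
        dataset_split=(0.7, 0.15, 0.15),
        num_train_epochs=3,
    )

    # Assumes y_pred_proba is a DataFrame with one probability column per label
    y_pred = results.y_pred_proba.idxmax(axis=1)
    accuracy = (y_pred.to_numpy() == pd.Series(results.y_true).to_numpy()).mean()
    print(f"best model: {results.best_model_checkpoint_name}, accuracy: {accuracy:.3f}")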