def training_loop(self) -> None:
    """Fit the sklearn estimator on the training dataset and report results to Tune.

    Flow: pull datasets, split off optional ``cv_groups`` column, configure CPU
    threading env vars, fit under the Ray joblib backend, checkpoint the fitted
    estimator (and preprocessor), then score on validation sets and via CV and
    report all metrics (plus ``fit_time``) through ``tune.report``.
    """
    register_ray()

    self.estimator.set_params(**self.params)

    datasets = self._get_datasets()
    X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)

    # An optional "cv_groups" column in the training features carries the
    # group labels for grouped cross-validation; it is removed from X_train
    # so it is not used as a feature during fitting.
    groups = None
    if "cv_groups" in X_train.columns:
        groups = X_train["cv_groups"]
        X_train = X_train.drop("cv_groups", axis=1)

    scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
        self.scaling_config
    )

    num_workers = scaling_config_dataclass.num_workers or 0
    assert num_workers == 0  # num_workers is not in scaling config allowed_keys

    # Default to a single CPU for the trainer when no resources are specified.
    trainer_resources = scaling_config_dataclass.trainer_resources or {"CPU": 1}
    has_gpus = bool(trainer_resources.get("GPU", 0))
    num_cpus = int(trainer_resources.get("CPU", 1))

    # Cap native BLAS/OpenMP thread pools to the allotted CPUs so sklearn's
    # underlying libraries don't oversubscribe the node.
    # see https://scikit-learn.org/stable/computing/parallelism.html
    os.environ["OMP_NUM_THREADS"] = str(num_cpus)
    os.environ["MKL_NUM_THREADS"] = str(num_cpus)
    os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
    os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

    parallelize_cv = self._get_cv_parallelism(has_gpus)
    if self.set_estimator_cpus:
        # If CV folds run in parallel, keep the estimator single-threaded to
        # avoid nested parallelism; otherwise give it all allotted CPUs.
        num_estimator_cpus = 1 if parallelize_cv else num_cpus
        set_cpu_params(self.estimator, num_estimator_cpus)

    # Route joblib-based parallelism inside sklearn through Ray.
    with parallel_backend("ray", n_jobs=num_cpus):
        start_time = time()
        self.estimator.fit(X_train, y_train, **self.fit_params)
        fit_time = time() - start_time

        # Checkpoint the fitted estimator (and the preprocessor, if any) so
        # it can be restored for prediction later.
        with tune.checkpoint_dir(step=1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                cpickle.dump(self.estimator, f)

            if self.preprocessor:
                save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

        if self.label_column:
            validation_set_scores = self._score_on_validation_sets(
                self.estimator, datasets
            )
            cv_scores = self._score_cv(
                self.estimator,
                X_train,
                y_train,
                groups,
                # If the estimator has internal parallelism, use that.
                # Otherwise, parallelize CV across the allotted CPUs.
                n_jobs=1 if not parallelize_cv else num_cpus,
            )
        else:
            # Without a label column there is nothing to score against.
            validation_set_scores = {}
            cv_scores = {}

        # cv_scores will not override validation_set_scores as we
        # check for that during initialization
        results = {
            **validation_set_scores,
            **cv_scores,
            "fit_time": fit_time,
        }
        tune.report(**results)
def predict(
    self,
    data: DataBatchType,
    feature_columns: Optional[Union[List[str], List[int]]] = None,
    num_estimator_cpus: Optional[int] = 1,
    **predict_kwargs,
) -> pd.DataFrame:
    """Run inference on data batch.

    Args:
        data: A batch of input data. Either a pandas DataFrame or numpy
            array.
        feature_columns: The names or indices of the columns in the
            data to use as features to predict on. If None, then use
            all columns in ``data``.
        num_estimator_cpus: If set to a value other than None, will set
            the values of all ``n_jobs`` and ``thread_count`` parameters
            in the estimator (including in nested objects) to the given
            value.
        **predict_kwargs: Keyword arguments passed to
            ``estimator.predict``.

    Examples:
        .. code-block:: python

            import numpy as np
            from sklearn.ensemble import RandomForestClassifier
            from ray.ml.predictors.sklearn import SklearnPredictor

            train_X = np.array([[1, 2], [3, 4]])
            train_y = np.array([0, 1])

            model = RandomForestClassifier().fit(train_X, train_y)
            predictor = SklearnPredictor(model=model)

            data = np.array([[1, 2], [3, 4]])
            predictions = predictor.predict(data)

            # Only use first and second column as the feature
            data = np.array([[1, 2, 8], [3, 4, 9]])
            predictions = predictor.predict(data, feature_columns=[0, 1])

        .. code-block:: python

            import pandas as pd
            from sklearn.ensemble import RandomForestClassifier
            from ray.ml.predictors.sklearn import SklearnPredictor

            train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
            train_y = pd.Series([0, 1])

            model = RandomForestClassifier().fit(train_X, train_y)
            predictor = SklearnPredictor(model=model)

            # Pandas dataframe.
            data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
            predictions = predictor.predict(data)

            # Only use first and second column as the feature
            data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"])
            predictions = predictor.predict(data, feature_columns=["A", "B"])

    Returns:
        pd.DataFrame: Prediction result.
    """
    register_ray()

    # Apply the fitted preprocessor (if any) before feature selection.
    if self.preprocessor:
        data = self.preprocessor.transform_batch(data)

    # Propagate the CPU budget into every n_jobs / thread_count
    # parameter of the (possibly nested) estimator.
    if num_estimator_cpus:
        set_cpu_params(self.estimator, num_estimator_cpus)

    # Restrict to the requested features: positional indexing for numpy
    # arrays, label/positional indexing for DataFrames.
    if feature_columns:
        data = (
            data[:, feature_columns]
            if isinstance(data, np.ndarray)
            else data[feature_columns]
        )

    # Any joblib parallelism inside the estimator is routed through Ray.
    with parallel_backend("ray", n_jobs=num_estimator_cpus):
        raw_output = self.estimator.predict(data, **predict_kwargs)
        result = pd.DataFrame(raw_output)

    num_outputs = len(result.columns)
    if num_outputs == 1:
        result.columns = ["predictions"]
    else:
        result.columns = [f"predictions_{i}" for i in range(num_outputs)]
    return result