def commit(self, path: Optional[Path] = None) -> None:
    if (
        self.storage_mode == CheckpointStorage.MEMORY
        or not path
        or not isinstance(self.dir_or_data, dict)
    ):
        return

    source_ip = self.dir_or_data[NODE_IP_KEY]
    source_path = self.dir_or_data[CHECKPOINT_PATH_ON_NODE_KEY]
    target_ip = get_node_ip_address()

    if source_ip == target_ip:
        # Move contents of source_path, but not source_path
        # itself. shutil.move is already recursive.
        for inner in Path(source_path).iterdir():
            shutil.move(str(inner.absolute()), str(path))
        shutil.rmtree(source_path, ignore_errors=True)
    else:
        sync_dir_between_nodes(
            source_ip=source_ip,
            source_path=source_path,
            target_ip=target_ip,
            target_path=str(path),
            return_futures=False,
            max_size_bytes=None,
        )
        delete_on_node(node_ip=source_ip, path=source_path)

    save_preprocessor_to_dir(self.dir_or_data.pop(PREPROCESSOR_KEY, None), path)

    # Add the Tune checkpoint id.
    with open(path.joinpath(TUNE_CHECKPOINT_ID), "w") as f:
        f.write(str(self.id))
def save_checkpoint(self, tmp_checkpoint_dir: str = ""):
    checkpoint_path = super().save_checkpoint()
    parent_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path)
    preprocessor = self._merged_config.get("preprocessor", None)
    if parent_dir and preprocessor:
        save_preprocessor_to_dir(preprocessor, parent_dir)
    return checkpoint_path
def save_checkpoint(self, checkpoint_dir: str):
    checkpoint_path = super(AIRRLTrainer, self).save_checkpoint(checkpoint_dir)

    trainer_class_path = os.path.join(checkpoint_dir, RL_TRAINER_CLASS_FILE)
    with open(trainer_class_path, "wb") as fp:
        cpickle.dump(self.__class__, fp)

    config_path = os.path.join(checkpoint_dir, RL_CONFIG_FILE)
    with open(config_path, "wb") as fp:
        cpickle.dump(self.config, fp)

    # Assumption: the preprocessor comes from the merged config, as in
    # save_checkpoint above.
    preprocessor = self._merged_config.get("preprocessor", None)
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, checkpoint_dir)

    return checkpoint_path
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = XGBoostPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving the model to disk instead of
        # directly to the dict as bytes is that all callbacks follow
        # save-to-disk logic. GBDT models are small enough that IO
        # should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = XGBoostPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
@classmethod
def from_estimator(
    cls,
    estimator: BaseEstimator,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "SklearnCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores an
    sklearn ``Estimator``.

    Args:
        estimator: The ``Estimator`` to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`SklearnCheckpoint` containing the specified ``Estimator``.

    Examples:
        >>> from ray.train.sklearn import SklearnCheckpoint
        >>> from sklearn.ensemble import RandomForestClassifier
        >>>
        >>> estimator = RandomForestClassifier()
        >>> checkpoint = SklearnCheckpoint.from_estimator(estimator, path=".")

        You can use a :py:class:`SklearnCheckpoint` to create an
        :py:class:`~ray.train.sklearn.SklearnPredictor` and perform inference.

        >>> from ray.train.sklearn import SklearnPredictor
        >>>
        >>> predictor = SklearnPredictor.from_checkpoint(checkpoint)
    """
    with open(os.path.join(path, MODEL_KEY), "wb") as f:
        cpickle.dump(estimator, f)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = cls.from_directory(path)

    return checkpoint
@classmethod
def from_model(
    cls,
    booster: xgboost.Booster,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "XGBoostCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores an
    XGBoost model.

    Args:
        booster: The XGBoost model to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`XGBoostCheckpoint` containing the specified ``booster``.

    Examples:
        >>> from ray.train.xgboost import XGBoostCheckpoint
        >>> import xgboost
        >>>
        >>> booster = xgboost.Booster()
        >>> checkpoint = XGBoostCheckpoint.from_model(booster, path=".")  # doctest: +SKIP

        You can use an :py:class:`XGBoostCheckpoint` to create an
        :py:class:`~ray.train.xgboost.XGBoostPredictor` and perform inference.

        >>> from ray.train.xgboost import XGBoostPredictor
        >>>
        >>> predictor = XGBoostPredictor.from_checkpoint(checkpoint)  # doctest: +SKIP
    """
    booster.save_model(os.path.join(path, MODEL_KEY))

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = cls.from_directory(path)

    return checkpoint
def to_air_checkpoint(
    path: str,
    booster: lightgbm.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where the model and preprocessor are stored.
        booster: A pretrained LightGBM model.
        preprocessor: A fitted preprocessor. Its preprocessing logic will be
            applied before serving/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)

    return checkpoint
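# Hedged usage sketch for to_air_checkpoint above: the LightGBM booster and
# the synthetic training data are illustrative assumptions, not part of the
# original code.
import tempfile

import lightgbm
import numpy as np

X = np.random.rand(20, 3)
y = np.random.randint(0, 2, size=20)
booster = lightgbm.train(
    {"objective": "binary", "verbosity": -1},
    lightgbm.Dataset(X, label=y),
    num_boost_round=2,
)

with tempfile.TemporaryDirectory() as tmpdir:
    checkpoint = to_air_checkpoint(path=tmpdir, booster=booster)
    # The checkpoint only references tmpdir, so materialize it before the
    # temporary directory is cleaned up.
    checkpoint_data = checkpoint.to_dict()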
@classmethod
def from_model(
    cls,
    model: Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module],
    tokenizer: Optional[transformers.PreTrainedTokenizer] = None,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "HuggingFaceCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores a
    HuggingFace model.

    Args:
        model: The pretrained transformer or Torch model to store in the
            checkpoint.
        tokenizer: The Tokenizer to use in the Transformers pipeline for
            inference.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        A :py:class:`HuggingFaceCheckpoint` containing the specified model.
    """
    if not isinstance(model, transformers.modeling_utils.PreTrainedModel):
        # A plain Torch module: save only its state dict.
        state_dict = model.state_dict()
        torch.save(state_dict, os.path.join(path, WEIGHTS_NAME))
    else:
        model.save_pretrained(path)

    if tokenizer:
        tokenizer.save_pretrained(path)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = cls.from_directory(path)

    return checkpoint
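# Hedged sketch of the non-PreTrainedModel branch above: a plain
# torch.nn.Module is checkpointed via its state dict. The tiny linear model
# is an assumption made for illustration.
import tempfile

import torch

with tempfile.TemporaryDirectory() as tmpdir:
    model = torch.nn.Linear(4, 2)
    checkpoint = HuggingFaceCheckpoint.from_model(model, path=tmpdir)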
def to_air_checkpoint(
    path: str,
    estimator: BaseEstimator,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where the model and preprocessor are stored.
        estimator: A pretrained model.
        preprocessor: A fitted preprocessor. Its preprocessing logic will be
            applied before serving/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    with open(os.path.join(path, MODEL_KEY), "wb") as f:
        cpickle.dump(estimator, f)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)

    return checkpoint
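# Hedged usage sketch for the sklearn variant of to_air_checkpoint; the toy
# estimator and training data are illustrative assumptions.
import tempfile

from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=2).fit([[0], [1]], [0, 1])

with tempfile.TemporaryDirectory() as tmpdir:
    checkpoint = to_air_checkpoint(path=tmpdir, estimator=estimator)
    # Materialize before the temporary directory is removed.
    checkpoint_data = checkpoint.to_dict()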
def training_loop(self) -> None:
    register_ray()

    self.estimator.set_params(**self.params)

    datasets = self._get_datasets()
    X_train, y_train = datasets.pop(TRAIN_DATASET_KEY)
    groups = None
    if "cv_groups" in X_train.columns:
        groups = X_train["cv_groups"]
        X_train = X_train.drop("cv_groups", axis=1)

    scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
        self.scaling_config
    )

    num_workers = scaling_config_dataclass.num_workers or 0
    assert num_workers == 0  # num_workers is not in scaling config allowed_keys

    trainer_resources = scaling_config_dataclass.trainer_resources or {"CPU": 1}
    has_gpus = bool(trainer_resources.get("GPU", 0))
    num_cpus = int(trainer_resources.get("CPU", 1))

    # see https://scikit-learn.org/stable/computing/parallelism.html
    os.environ["OMP_NUM_THREADS"] = str(num_cpus)
    os.environ["MKL_NUM_THREADS"] = str(num_cpus)
    os.environ["OPENBLAS_NUM_THREADS"] = str(num_cpus)
    os.environ["BLIS_NUM_THREADS"] = str(num_cpus)

    parallelize_cv = self._get_cv_parallelism(has_gpus)
    if self.set_estimator_cpus:
        num_estimator_cpus = 1 if parallelize_cv else num_cpus
        _set_cpu_params(self.estimator, num_estimator_cpus)

    with parallel_backend("ray", n_jobs=num_cpus):
        start_time = time()
        self.estimator.fit(X_train, y_train, **self.fit_params)
        fit_time = time() - start_time

        with tune.checkpoint_dir(step=1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, MODEL_KEY), "wb") as f:
                cpickle.dump(self.estimator, f)

            if self.preprocessor:
                save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

        if self.label_column:
            validation_set_scores = self._score_on_validation_sets(
                self.estimator, datasets
            )
            cv_scores = self._score_cv(
                self.estimator,
                X_train,
                y_train,
                groups,
                # if estimator has parallelism, use that. Otherwise,
                # parallelize CV
                n_jobs=1 if not parallelize_cv else num_cpus,
            )
        else:
            validation_set_scores = {}
            cv_scores = {}

    # cv_scores will not override validation_set_scores as we
    # check for that during initialization
    results = {
        **validation_set_scores,
        **cv_scores,
        "fit_time": fit_time,
    }
    tune.report(**results)
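# Hedged end-to-end sketch of invoking the trainer that runs training_loop
# above, based on the Ray AIR 2.x SklearnTrainer API; the dataset and
# estimator are illustrative assumptions.
import ray
from ray.train.sklearn import SklearnTrainer
from sklearn.ensemble import RandomForestClassifier

train_dataset = ray.data.from_items([{"x": i, "y": i % 2} for i in range(32)])

trainer = SklearnTrainer(
    estimator=RandomForestClassifier(n_estimators=2),
    label_column="y",
    datasets={"train": train_dataset},
)
result = trainer.fit()  # metrics include fit_time, reported by the loop above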