def test_dict_checkpoint_fs(self):
    """Round-trip a dict checkpoint through a filesystem directory."""
    source = self._prepare_dict_checkpoint()

    # dict -> directory on disk
    directory = source.to_directory()
    self.assertIsInstance(directory, str)

    # directory -> checkpoint object
    restored = Checkpoint.from_directory(directory)
    self.assertTrue(restored._local_path)

    self._assert_dict_checkpoint(restored)
def test_init():
    """Check that direct construction and ``from_checkpoint`` yield equal predictors."""
    preprocessor = DummyPreprocessor()
    direct = TensorflowPredictor(
        model_definition=build_model,
        preprocessor=preprocessor,
        model_weights=weights,
    )

    # Same weights and preprocessor, but routed through a dict checkpoint.
    payload = {MODEL_KEY: weights, PREPROCESSOR_KEY: preprocessor}
    restored = TensorflowPredictor.from_checkpoint(
        Checkpoint.from_dict(payload), build_model
    )

    assert restored.model_definition == direct.model_definition
    assert restored.model_weights == direct.model_weights
    assert restored.preprocessor == direct.preprocessor
def test_dict_checkpoint_dict(self):
    """Round-trip a dict checkpoint through its plain ``dict`` form."""
    source = self._prepare_dict_checkpoint()

    # checkpoint -> dict
    as_dict = source.to_dict()
    self.assertIsInstance(as_dict, dict)

    # dict -> checkpoint
    restored = Checkpoint.from_dict(as_dict)
    self.assertTrue(restored._data_dict)

    self._assert_dict_checkpoint(restored)
def test_dict_checkpoint_bytes(self):
    """Round-trip a dict checkpoint through its serialized ``bytes`` form."""
    source = self._prepare_dict_checkpoint()

    # checkpoint -> bytes
    serialized = source.to_bytes()
    self.assertIsInstance(serialized, bytes)

    # bytes -> checkpoint
    restored = Checkpoint.from_bytes(serialized)
    self.assertTrue(restored._data_dict)

    self._assert_dict_checkpoint(restored)
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    """Train 5 rounds, relocate the checkpoint, then resume for 5 more rounds."""
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    def build_trainer(**extra_kwargs):
        # Both runs share the same configuration; only the resume
        # checkpoint differs. A fresh datasets dict is built per call.
        return XGBoostTrainer(
            scaling_config=scale_config,
            label_column="target",
            params=params,
            num_boost_round=5,
            datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
            **extra_kwargs,
        )

    first_result = build_trainer().fit()
    first_model, _ = load_checkpoint(first_result.checkpoint)
    assert get_num_trees(first_model) == 5

    # Move checkpoint to a different directory.
    as_dict = first_result.checkpoint.to_dict()
    relocated_path = Checkpoint.from_dict(as_dict).to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(relocated_path)

    second_result = build_trainer(resume_from_checkpoint=resume_from).fit()
    resumed_model, _ = load_checkpoint(second_result.checkpoint)
    assert get_num_trees(resumed_model) == 10
def test_batch_prediction_fs():
    """Run batch prediction with ``DummyPredictorFS`` and compare to fixed values."""
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0}), DummyPredictorFS
    )
    test_dataset = ray.data.from_items([1.0, 2.0, 3.0, 4.0] * 32).repartition(8)

    predictions = batch_predictor.predict(test_dataset, min_scoring_workers=4)
    actual = predictions.to_pandas().to_numpy().squeeze().tolist()

    expected = [4.0, 8.0, 12.0, 16.0] * 32
    assert actual == expected
def test_fs_checkpoint_uri(self):
    """Round-trip a filesystem checkpoint through a ``memory://`` URI."""
    source = self._prepare_fs_checkpoint()

    # fs checkpoint -> URI
    uri = source.to_uri(self.cloud_uri)
    self.assertIsInstance(uri, str)
    self.assertIn("memory://", uri)

    # URI -> checkpoint
    restored = Checkpoint.from_uri(uri)
    self.assertTrue(restored._uri)

    self._assert_fs_checkpoint(restored)
def test_obj_store_cp_as_directory(self):
    """Check ``as_directory`` on an object-ref checkpoint creates and removes a dir."""
    source = self._prepare_dict_checkpoint()

    # checkpoint -> object ref -> checkpoint
    ref = source.to_object_ref()
    restored = Checkpoint.from_object_ref(ref)

    with restored.as_directory() as materialized_dir:
        # The directory exists inside the context and is named after the ref.
        assert os.path.exists(materialized_dir)
        assert materialized_dir.endswith(ref.hex())

    # Leaving the context must remove the directory again.
    assert not os.path.exists(materialized_dir)
def from_checkpoint(
    cls,
    checkpoint: Checkpoint,
    *,
    pipeline_cls: Optional[Type[Pipeline]] = None,
    **pipeline_kwargs,
) -> "HuggingFacePredictor":
    """Instantiate the predictor from a Checkpoint.

    The checkpoint is expected to be a result of ``HuggingFaceTrainer``.

    Args:
        checkpoint: The checkpoint to load the model, tokenizer and
            preprocessor from. It is expected to be from the result of a
            ``HuggingFaceTrainer`` run.
        pipeline_cls: A ``transformers.pipelines.Pipeline`` class to use.
            If not specified, will use the ``pipeline`` abstraction
            wrapper.
        **pipeline_kwargs: Any kwargs to pass to the pipeline
            initialization. If ``pipeline_cls`` is None, this must contain
            the 'task' argument. Cannot contain 'model'. Can be used
            to override the tokenizer with 'tokenizer'.

    Raises:
        ValueError: If neither ``pipeline_cls`` nor a 'task' kwarg is given.
    """
    if not pipeline_cls:
        # Without an explicit class we fall back to the generic factory,
        # which needs 'task' to decide which pipeline to build.
        if "task" not in pipeline_kwargs:
            raise ValueError(
                "If `pipeline_cls` is not specified, 'task' must be passed as a kwarg."
            )
        pipeline_cls = pipeline_factory

    preprocessor = checkpoint.get_preprocessor()
    with checkpoint.as_directory() as checkpoint_path:
        # Tokenizer will be loaded automatically (no need to specify
        # `tokenizer=checkpoint_path`)
        loaded_pipeline = pipeline_cls(model=checkpoint_path, **pipeline_kwargs)

    return cls(pipeline=loaded_pipeline, preprocessor=preprocessor)
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    """Verify the fitted preprocessor survives a LightGBM checkpoint round trip."""
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            # Only records that fitting happened.
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    fit_result = trainer.fit()

    # Move checkpoint to a different directory.
    as_dict = fit_result.checkpoint.to_dict()
    relocated_path = Checkpoint.from_dict(as_dict).to_directory(tmpdir)
    relocated = Checkpoint.from_directory(relocated_path)

    model, loaded_preprocessor = load_checkpoint(relocated)
    assert get_num_trees(model) == 10
    assert loaded_preprocessor.is_same
    assert loaded_preprocessor.fitted_
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    """Verify the fitted preprocessor survives a Sklearn checkpoint round trip."""
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            # Only records that fitting happened.
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = SklearnTrainer(
        estimator=RandomForestClassifier(),
        scaling_config=scale_config,
        label_column="target",
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    fit_result = trainer.fit()

    # Move checkpoint to a different directory.
    as_dict = fit_result.checkpoint.to_dict()
    relocated_path = Checkpoint.from_dict(as_dict).to_directory(tmpdir)
    relocated = Checkpoint.from_directory(relocated_path)

    model, loaded_preprocessor = load_checkpoint(relocated)
    assert hasattr(model, "feature_importances_")
    assert loaded_preprocessor.is_same
    assert loaded_preprocessor.fitted_
def _trial_to_result(self, trial: Trial) -> Result:
    """Build a ``Result`` from a trial's checkpoint, last metrics and error.

    Args:
        trial: The trial to convert.

    Returns:
        A ``Result`` whose ``checkpoint`` is ``None`` when the trial's
        checkpoint has no ``dir_or_data``.
    """
    checkpoint = None
    if trial.checkpoint.dir_or_data:
        # Resolve the actual checkpoint directory before wrapping it.
        resolved_dir = TrainableUtil.find_checkpoint_dir(trial.checkpoint.dir_or_data)
        checkpoint = Checkpoint.from_directory(resolved_dir)

    return Result(
        checkpoint=checkpoint,
        metrics=trial.last_result.copy(),
        error=self._populate_exception(trial),
    )
def testDirCheckpointWithPreprocessor(self):
    """Load a pickled preprocessor back from a directory checkpoint."""
    with tempfile.TemporaryDirectory() as tmpdir:
        original = DummyPreprocessor(1)
        payload = {"metric": 5}

        checkpoint_dir = os.path.join(tmpdir, "existing_checkpoint")
        os.mkdir(checkpoint_dir, 0o755)

        # Write the data file and the preprocessor side by side.
        data_path = os.path.join(checkpoint_dir, "test_data.pkl")
        with open(data_path, "wb") as fp:
            pickle.dump(payload, fp)
        preprocessor_path = os.path.join(checkpoint_dir, PREPROCESSOR_KEY)
        with open(preprocessor_path, "wb") as fp:
            pickle.dump(original, fp)

        restored = Checkpoint.from_directory(checkpoint_dir).get_preprocessor()
        assert restored.multiplier == 1
def from_checkpoint(
    cls,
    checkpoint: Checkpoint,
    model: Optional[torch.nn.Module] = None,
    use_gpu: bool = False,
) -> "TorchPredictor":
    """Instantiate the predictor from a Checkpoint.

    The checkpoint is expected to be a result of ``TorchTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``TorchTrainer`` run.
        model: If the checkpoint contains a model state dict, and not
            the model itself, then the state dict will be loaded to this
            ``model``.
        use_gpu: If set, the model will be moved to GPU on instantiation and
            prediction happens on GPU.

    Returns:
        A predictor built from the checkpoint's model and preprocessor.
    """
    torch_checkpoint = TorchCheckpoint.from_checkpoint(checkpoint)
    return cls(
        model=torch_checkpoint.get_model(model),
        preprocessor=torch_checkpoint.get_preprocessor(),
        use_gpu=use_gpu,
    )
def test_fs_checkpoint_obj_store(self):
    """Round-trip a filesystem checkpoint through an object reference."""
    # Start Ray on demand if it is not already running.
    if not ray.is_initialized():
        ray.init()

    source = self._prepare_fs_checkpoint()

    # fs checkpoint -> object ref
    ref = source.to_object_ref()

    # object ref -> checkpoint
    restored = Checkpoint.from_object_ref(ref)
    self.assertIsInstance(restored._obj_ref, ray.ObjectRef)

    self._assert_fs_checkpoint(restored)
def save_to_object(self):
    """Saves the current model state to a Python object.

    It also saves to disk but does not return the checkpoint path.
    The state is written into a temporary directory under ``self.logdir``,
    serialized via ``Checkpoint.to_bytes()``, and the temporary directory
    is removed afterwards, including when saving or serialization fails.

    Returns:
        Object holding checkpoint data (the result of ``to_bytes()``).
    """
    # "save_to_object" is passed as the first positional argument of
    # ``mkdtemp``, i.e. as the directory name suffix.
    temp_container_dir = tempfile.mkdtemp("save_to_object", dir=self.logdir)
    try:
        checkpoint_dir = self.save(temp_container_dir)
        return Checkpoint.from_directory(checkpoint_dir).to_bytes()
    finally:
        # Always clean up so a failing save does not leak the directory.
        shutil.rmtree(temp_container_dir)
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    """Run batch prediction with explicit per-worker and estimator CPU counts."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the module-level model into the directory as the checkpoint.
        model_path = os.path.join(tmpdir, MODEL_KEY)
        with open(model_path, "wb") as f:
            cpickle.dump(model, f)

        batch_predictor = BatchPredictor.from_checkpoint(
            Checkpoint.from_directory(tmpdir), SklearnPredictor
        )

        frame = pd.DataFrame(dummy_data, columns=["A", "B"])
        test_dataset = ray.data.from_pandas(frame)

        # No assertion on the output: the call itself must succeed.
        batch_predictor.predict(
            test_dataset, num_cpus_per_worker=2, num_estimator_cpus=2
        )
def to_air_checkpoint(
    model: torch.nn.Module, preprocessor: Optional["Preprocessor"] = None
) -> Checkpoint:
    """Convert a pretrained model to AIR checkpoint for serve or inference.

    Args:
        model: A pretrained model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.

    Returns:
        A Ray Air checkpoint built from a dict holding the preprocessor
        and the model.
    """
    payload = {PREPROCESSOR_KEY: preprocessor, MODEL_KEY: model}
    return Checkpoint.from_dict(payload)
def train_func(config):
    """Report a score and a step checkpoint for each step up to 100.

    Reads ``width`` and ``height`` from ``config``. If the session has a
    checkpoint, the loop continues from the step after the stored one.
    """
    step = 0
    width = config["width"]
    height = config["height"]

    if session.get_checkpoint():
        # Continue one past the last completed step.
        previous = session.get_checkpoint()
        step = previous.to_dict()["step"] + 1

    for step in range(step, 100):
        score = evaluation_fn(step, width, height)
        metrics = {"iterations": step, "mean_loss": score}
        session.report(metrics, checkpoint=Checkpoint.from_dict({"step": step}))
def train_func(config):
    """Train a linear model per epoch, reporting a result and checkpoint each time.

    Args:
        config: Optional keys ``batch_size`` (default 32), ``hidden_size``
            (default 1), ``lr`` (default 1e-2) and ``epochs`` (default 3).

    Returns:
        The list of per-epoch results. Only world rank 0 runs validation;
        other ranks append an empty dict.
    """
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = session.get_dataset_shard("train")
    validation_dataset = session.get_dataset_shard("validation")

    model = train.torch.prepare_model(nn.Linear(1, hidden_size))
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    def as_torch(shard):
        # Train and validation shards are converted with identical settings.
        return shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

    results = []
    for _ in range(epochs):
        train_torch_dataset = as_torch(train_dataset_shard)
        validation_torch_dataset = as_torch(validation_dataset)

        device = train.torch.get_device()
        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)

        result = (
            validate_epoch(validation_torch_dataset, model, loss_fn, device)
            if session.get_world_rank() == 0
            else {}
        )
        results.append(result)

        session.report(result, checkpoint=Checkpoint.from_dict(dict(model=model)))

    return results
def test_batch_prediction_feature_cols():
    """Predict on column ``a`` only and compare against fixed values."""
    checkpoint = Checkpoint.from_dict(
        {"factor": 2.0, PREPROCESSOR_KEY: DummyPreprocessor()}
    )
    batch_predictor = BatchPredictor.from_checkpoint(checkpoint, DummyPredictor)

    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    test_dataset = ray.data.from_pandas(frame)

    predictions = batch_predictor.predict(test_dataset, feature_columns=["a"])
    actual = predictions.to_pandas().to_numpy().squeeze().tolist()
    assert actual == [4.0, 8.0, 12.0]
def _maybe_save_to_cloud(self, checkpoint_dir: str) -> bool:
    """Upload ``checkpoint_dir`` to cloud storage if cloud checkpointing is on.

    Args:
        checkpoint_dir: Local checkpoint directory to upload.

    Returns:
        ``False`` when cloud checkpointing is not in use, ``True`` after
        the upload was handed to the custom syncer or done via
        ``Checkpoint.to_uri``.
    """
    if self.uses_cloud_checkpointing:
        if self.custom_syncer:
            # A custom syncer takes precedence over the built-in upload.
            self.custom_syncer.sync_up(
                checkpoint_dir, self._storage_path(checkpoint_dir)
            )
            self.custom_syncer.wait_or_retry()
        else:
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            retry_fn(
                lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
                subprocess.CalledProcessError,
                num_retries=3,
                sleep_time=1,
            )
        return True

    return False
def test_init():
    """Check that a directory checkpoint reproduces a directly built SklearnPredictor."""
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    direct = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Persist the estimator and the preprocessor into the directory.
        model_path = os.path.join(tmpdir, MODEL_KEY)
        with open(model_path, "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        restored = SklearnPredictor.from_checkpoint(
            Checkpoint.from_directory(tmpdir)
        )

    assert np.allclose(
        restored.estimator.feature_importances_,
        direct.estimator.feature_importances_,
    )
    assert restored.preprocessor.attr == direct.preprocessor.attr
def test_fs_checkpoint_uri_pa(self):
    """Round-trip a filesystem checkpoint through a ``mock://`` URI."""
    source = self._prepare_fs_checkpoint()

    # Clean up mock bucket
    delete_at_uri(self.cloud_uri_pa)
    _ensure_directory(self.cloud_uri_pa)

    # fs checkpoint -> URI
    uri = source.to_uri(self.cloud_uri_pa)
    self.assertIsInstance(uri, str)
    self.assertIn("mock://", uri)

    # URI -> checkpoint
    restored = Checkpoint.from_uri(uri)
    self.assertTrue(restored._uri)

    self._assert_fs_checkpoint(restored)
def _maybe_save_to_cloud(self, checkpoint_dir: str):
    """Upload ``checkpoint_dir`` to cloud storage if cloud checkpointing is on.

    Does nothing when ``self.uses_cloud_checkpointing`` is falsy.

    Args:
        checkpoint_dir: Local checkpoint directory to upload.
    """
    # Derived classes like the FunctionRunner might call this
    if not self.uses_cloud_checkpointing:
        return

    if self.storage_client:
        # Keep for backwards compatibility, remove after deprecation
        remote_path = self._storage_path(checkpoint_dir)
        self.storage_client.sync_up(checkpoint_dir, remote_path)
        self.storage_client.wait_or_retry()
        return

    checkpoint = Checkpoint.from_directory(checkpoint_dir)

    def upload():
        # The target path is resolved again on every attempt.
        return checkpoint.to_uri(self._storage_path(checkpoint_dir))

    retry_fn(
        upload,
        subprocess.CalledProcessError,
        num_retries=3,
        sleep_time=1,
    )
def load_checkpoint(
    checkpoint: Checkpoint,
) -> Tuple[xgboost.Booster, Optional["Preprocessor"]]:
    """Load a Checkpoint from ``XGBoostTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``XGBoostTrainer`` run.

    Returns:
        The model and AIR preprocessor contained within.
    """
    with checkpoint.as_directory() as checkpoint_path:
        # The booster is created empty and then filled from the model file.
        model_file = os.path.join(checkpoint_path, MODEL_KEY)
        booster = xgboost.Booster()
        booster.load_model(model_file)
        preprocessor = load_preprocessor_from_dir(checkpoint_path)

    return booster, preprocessor
def load_checkpoint(
    checkpoint: Checkpoint,
) -> Tuple[lightgbm.Booster, Optional["Preprocessor"]]:
    """Load a Checkpoint from ``LightGBMTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``LightGBMTrainer`` run.

    Returns:
        The model and AIR preprocessor contained within.
    """
    with checkpoint.as_directory() as checkpoint_path:
        model_file = os.path.join(checkpoint_path, MODEL_KEY)
        booster = lightgbm.Booster(model_file=model_file)
        preprocessor = load_preprocessor_from_dir(checkpoint_path)

    return booster, preprocessor
def test_init():
    """Check that a directory checkpoint reproduces a directly built XGBoostPredictor."""
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    direct = XGBoostPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving model to disk instead
        # of directly to the dict as bytes is due to all callbacks
        # following save to disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model_path = os.path.join(tmpdir, MODEL_KEY)
        model.save_model(model_path)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        restored = XGBoostPredictor.from_checkpoint(
            Checkpoint.from_directory(tmpdir)
        )

    assert get_num_trees(restored.model) == get_num_trees(direct.model)
    assert restored.preprocessor.attr == direct.preprocessor.attr
def test_automatic_enable_gpu_from_num_gpus_per_worker(shutdown_only):
    """
    Test we automatically set underlying Predictor creation use_gpu to True if
    we found num_gpus_per_worker > 0 in BatchPredictor's predict() call.

    ``DummyPredictor`` is expected to reject GPU prediction with a
    ``ValueError``, which is what this test looks for.
    """
    ray.init(num_gpus=1)

    checkpoint = Checkpoint.from_dict(
        {"factor": 2.0, PREPROCESSOR_KEY: DummyPreprocessor()}
    )
    batch_predictor = BatchPredictor.from_checkpoint(checkpoint, DummyPredictor)
    test_dataset = ray.data.range_table(4)

    expected_message = "DummyPredictor does not support GPU prediction"
    with pytest.raises(ValueError, match=expected_message):
        batch_predictor.predict(test_dataset, num_gpus_per_worker=1)
def train_convnet(config):
    """Train a ``ConvNet`` indefinitely, reporting accuracy after every step.

    Every 5th step the model state and step counter are written to
    ``my_model/checkpoint.pt`` and attached to the report as a checkpoint.

    Args:
        config: Optional keys ``lr`` (default 0.01) and ``momentum``
            (default 0.9) for the SGD optimizer.
    """
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        resumed = session.get_checkpoint()
        with resumed.as_directory() as resumed_dir:
            saved_state = torch.load(os.path.join(resumed_dir, "checkpoint.pt"))
            model.load_state_dict(saved_state["model_state_dict"])
            step = saved_state["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)

        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # Need to create a directory under current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            # Only the step counter and the model weights are stored.
            state_to_save = {
                "step": step,
                "model_state_dict": model.state_dict(),
            }
            torch.save(state_to_save, "my_model/checkpoint.pt")
            checkpoint = Checkpoint.from_directory("my_model")

        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)