def test_fs_checkpoint_additional_fields(self):
    checkpoint = self._prepare_fs_checkpoint()

    # Convert to dict
    checkpoint_dict = checkpoint.to_dict()

    # Add field to dict
    checkpoint_dict["additional_field"] = "data"

    # Create new checkpoint object
    checkpoint = Checkpoint.from_dict(checkpoint_dict)

    # Turn into FS
    checkpoint_dir = checkpoint.to_directory()

    assert os.path.exists(os.path.join(checkpoint_dir, "test_data.pkl"))
    assert os.path.exists(
        os.path.join(checkpoint_dir, "additional_field.meta.pkl"))

    # Add new file
    with open(os.path.join(checkpoint_dir, "even_more.txt"), "w") as f:
        f.write("More\n")

    # Turn into dict
    new_dict = Checkpoint.from_directory(checkpoint_dir).to_dict()
    assert new_dict["additional_field"] == "data"

    # Turn into fs
    new_dir = Checkpoint.from_dict(new_dict).to_directory()

    assert os.path.exists(os.path.join(new_dir, "test_data.pkl"))
    assert os.path.exists(
        os.path.join(new_dir, "additional_field.meta.pkl"))
    assert os.path.exists(os.path.join(new_dir, "even_more.txt"))
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    def train_func():
        checkpoint = session.get_checkpoint()
        if checkpoint:
            epoch = checkpoint.to_dict()["epoch"]
        else:
            epoch = 0
        for i in range(epoch, epoch + 2):
            session.report(
                {"epoch": i}, checkpoint=Checkpoint.from_dict({"epoch": i})
            )

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=scale_config
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 1

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scale_config,
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 2
def test_metadata(self):
    """Test conversion with metadata involved.

    a. Convert from fs to dict checkpoint;
    b. Add a marker to the dict checkpoint;
    c. Convert back to fs checkpoint;
    d. Convert back to dict checkpoint.

    Assert that the marker is still there.
    """
    checkpoint = self._prepare_fs_checkpoint()

    # Convert into dict checkpoint
    data_dict = checkpoint.to_dict()
    self.assertIsInstance(data_dict, dict)

    data_dict["my_marker"] = "marked"

    # Create from dict
    checkpoint = Checkpoint.from_dict(data_dict)
    self.assertTrue(checkpoint._data_dict)

    self._assert_fs_checkpoint(checkpoint)

    # Convert back to dict
    data_dict_2 = Checkpoint.from_directory(
        checkpoint.to_directory()).to_dict()
    assert data_dict_2["my_marker"] == "marked"
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_
def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    xgb_model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(xgb_model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10
def _handle(self, logs: Dict, when: str = None):
    self._counter[when] += 1

    if isinstance(self._frequency, list):
        index = self._on.index(when)
        freq = self._frequency[index]
    else:
        freq = self._frequency

    checkpoint = None
    if freq > 0 and self._counter[when] % freq == 0:
        self.model.save("my_model", overwrite=True)
        checkpoint = Checkpoint.from_directory("my_model")

    if not self._metrics:
        report_dict = logs
    else:
        report_dict = {}
        for key in self._metrics:
            if isinstance(self._metrics, dict):
                metric = self._metrics[key]
            else:
                metric = key
            report_dict[key] = logs[metric]

    session.report(report_dict, checkpoint=checkpoint)
def testDictCheckpointWithPreprocessorAsDir(self):
    preprocessor = DummyPreprocessor(1)
    data = {"metric": 5, PREPROCESSOR_KEY: preprocessor}
    checkpoint = Checkpoint.from_dict(data)
    checkpoint_path = checkpoint.to_directory()
    checkpoint = Checkpoint.from_directory(checkpoint_path)

    preprocessor = checkpoint.get_preprocessor()
    assert preprocessor.multiplier == 1
def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    with source_checkpoint.as_directory() as tmpdir:
        checkpoint = Checkpoint.from_directory(tmpdir)
        self._testCheckpointSerde(
            checkpoint, *source_checkpoint.get_internal_representation())
def _prepare_fs_checkpoint(self) -> Checkpoint:
    # Create checkpoint from fs
    checkpoint = Checkpoint.from_directory(self.checkpoint_dir)

    self.assertIsInstance(checkpoint, Checkpoint)
    self.assertIsInstance(checkpoint._local_path, str)
    self.assertEqual(checkpoint._local_path, self.checkpoint_dir)

    return checkpoint
def get_best_checkpoint(
    self,
    trial: Trial,
    metric: Optional[str] = None,
    mode: Optional[str] = None,
) -> Optional[Checkpoint]:
    """Gets the best persistent checkpoint path of the provided trial.

    Any checkpoints with an associated metric value of ``nan`` will be
    filtered out.

    Args:
        trial: The log directory of a trial, or a trial instance.
        metric: Key of trial info to return, e.g. "mean_accuracy".
            "training_iteration" is used by default if no value was passed
            to ``self.default_metric``.
        mode: One of [min, max]. Defaults to ``self.default_mode``.

    Returns:
        :class:`Checkpoint <ray.air.Checkpoint>` object.
    """
    metric = metric or self.default_metric or TRAINING_ITERATION
    mode = self._validate_mode(mode)

    checkpoint_paths = self.get_trial_checkpoints_paths(trial, metric)

    # Filter out nan. Sorting nan values leads to undefined behavior.
    checkpoint_paths = [(path, metric) for path, metric in checkpoint_paths
                        if not is_nan(metric)]

    if not checkpoint_paths:
        logger.error(f"No checkpoints have been found for trial {trial}.")
        return None

    a = -1 if mode == "max" else 1
    best_path_metrics = sorted(checkpoint_paths, key=lambda x: a * x[1])

    best_path, best_metric = best_path_metrics[0]
    cloud_path = self._parse_cloud_path(best_path)

    if self._legacy_checkpoint:
        return TrialCheckpoint(local_path=best_path, cloud_path=cloud_path)

    if cloud_path:
        # Prefer cloud path over local path for downstream processing
        return Checkpoint.from_uri(cloud_path)
    elif os.path.exists(best_path):
        return Checkpoint.from_directory(best_path)
    else:
        logger.error(
            f"No checkpoint locations for {trial} available on "
            f"this node. To avoid this, you should enable checkpoint "
            f"synchronization with the `sync_config` argument in Ray Tune. "
            f"The checkpoint may be available on a different node - "
            f"please check this location on worker nodes: {best_path}")
        return None
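# Hedged usage sketch for get_best_checkpoint() above (not taken from the
# snippet): a tiny Tune run whose best checkpoint is then fetched. The toy
# trainable, config values, and metric name are illustrative assumptions.
from ray import tune
from ray.air import session, Checkpoint


def _toy_trainable(config):
    for step in range(3):
        session.report(
            {"mean_accuracy": step * config["lr"]},
            checkpoint=Checkpoint.from_dict({"step": step}),
        )


analysis = tune.run(_toy_trainable, config={"lr": 0.1})
trial = analysis.trials[0]
best_checkpoint = analysis.get_best_checkpoint(
    trial, metric="mean_accuracy", mode="max"
)
if best_checkpoint:
    local_dir = best_checkpoint.to_directory()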
def testDirCheckpointWithoutPreprocessor(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        data = {"metric": 5}
        checkpoint_dir = os.path.join(tmpdir, "existing_checkpoint")
        os.mkdir(checkpoint_dir, 0o755)
        with open(os.path.join(checkpoint_dir, "test_data.pkl"), "wb") as fp:
            pickle.dump(data, fp)
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
        preprocessor = checkpoint.get_preprocessor()

        assert preprocessor is None
def test_dict_checkpoint_additional_files(self):
    checkpoint = self._prepare_dict_checkpoint()

    # Convert to directory
    checkpoint_dir = checkpoint.to_directory()

    # Add file into checkpoint directory
    with open(os.path.join(checkpoint_dir, "additional_file.txt"), "w") as f:
        f.write("Additional data\n")
    os.mkdir(os.path.join(checkpoint_dir, "subdir"))
    with open(os.path.join(checkpoint_dir, "subdir", "another.txt"), "w") as f:
        f.write("Another additional file\n")

    # Create new checkpoint object
    checkpoint = Checkpoint.from_directory(checkpoint_dir)

    new_dir = checkpoint.to_directory()

    assert os.path.exists(os.path.join(new_dir, "additional_file.txt"))
    with open(os.path.join(new_dir, "additional_file.txt"), "r") as f:
        assert f.read() == "Additional data\n"

    assert os.path.exists(os.path.join(new_dir, "subdir", "another.txt"))
    with open(os.path.join(new_dir, "subdir", "another.txt"), "r") as f:
        assert f.read() == "Another additional file\n"

    checkpoint_dict = checkpoint.to_dict()
    for k, v in self.checkpoint_dict_data.items():
        assert checkpoint_dict[k] == v

    assert _DICT_CHECKPOINT_ADDITIONAL_FILE_KEY in checkpoint_dict

    # Add another field
    checkpoint_dict["new_field"] = "Data"

    another_dict = Checkpoint.from_directory(
        Checkpoint.from_dict(checkpoint_dict).to_directory()).to_dict()
    assert _DICT_CHECKPOINT_ADDITIONAL_FILE_KEY in another_dict
    assert another_dict["new_field"] == "Data"
def test_predict_no_preprocessor():
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        checkpoint = Checkpoint.from_directory(tmpdir)
        predictor = SklearnPredictor.from_checkpoint(checkpoint)

    data_batch = np.array([[1, 2], [3, 4], [5, 6]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
def train_func():
    if session.get_checkpoint():
        with session.get_checkpoint().as_directory() as checkpoint_dir:
            import tensorflow as tf

            model = tf.keras.models.load_model(checkpoint_dir)
    else:
        model = build_model()

    model.save("my_model", overwrite=True)
    session.report(
        metrics={"iter": 1},
        checkpoint=Checkpoint.from_directory("my_model"),
    )
def train_func(config, checkpoint_dir=None):
    # config already contains merged values.
    # Instantiate new Trainer in Trainable.
    trainer = trainer_cls(**config)

    if checkpoint_dir:
        trainer.resume_from_checkpoint = Checkpoint.from_directory(
            checkpoint_dir)

    trainer.setup()
    trainer.preprocess_datasets()
    trainer.training_loop()
def test_dict_checkpoint_fs(self):
    """Test conversion from dict to FS checkpoint and back."""
    checkpoint = self._prepare_dict_checkpoint()

    # Convert into fs checkpoint
    path = checkpoint.to_directory()
    self.assertIsInstance(path, str)

    # Create from path
    checkpoint = Checkpoint.from_directory(path)
    self.assertTrue(checkpoint._local_path)

    self._assert_dict_checkpoint(checkpoint)
def save_to_object(self):
    """Saves the current model state to a Python object.

    It also saves to disk but does not return the checkpoint path.

    Returns:
        Object holding checkpoint data.
    """
    temp_container_dir = tempfile.mkdtemp("save_to_object", dir=self.logdir)
    checkpoint_dir = self.save(temp_container_dir)

    obj_ref = Checkpoint.from_directory(checkpoint_dir).to_bytes()

    shutil.rmtree(temp_container_dir)
    return obj_ref
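# A minimal, self-contained sketch of the byte round-trip used by
# save_to_object() above: Checkpoint.to_bytes() and Checkpoint.from_bytes()
# are inverses, so the returned object can be turned back into a checkpoint
# (or a directory) on the receiving side. The dict contents are illustrative.
from ray.air import Checkpoint

blob = Checkpoint.from_dict({"step": 1}).to_bytes()
restored = Checkpoint.from_bytes(blob)
assert restored.to_dict()["step"] == 1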
def testDirCheckpointWithPreprocessor(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        preprocessor = DummyPreprocessor(1)
        data = {"metric": 5}
        checkpoint_dir = os.path.join(tmpdir, "existing_checkpoint")
        os.mkdir(checkpoint_dir, 0o755)
        with open(os.path.join(checkpoint_dir, "test_data.pkl"), "wb") as fp:
            pickle.dump(data, fp)
        with open(os.path.join(checkpoint_dir, PREPROCESSOR_KEY), "wb") as fp:
            pickle.dump(preprocessor, fp)
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
        preprocessor = checkpoint.get_preprocessor()

        assert preprocessor.multiplier == 1
def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)

        batch_predictor = BatchPredictor.from_checkpoint(checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"])
        )
        batch_predictor.predict(
            test_dataset, num_cpus_per_worker=2, num_estimator_cpus=2
        )
def _trial_to_result(self, trial: Trial) -> Result:
    if trial.checkpoint.dir_or_data:
        checkpoint_dir = TrainableUtil.find_checkpoint_dir(
            trial.checkpoint.dir_or_data
        )
        checkpoint = Checkpoint.from_directory(checkpoint_dir)
    else:
        checkpoint = None

    result = Result(
        checkpoint=checkpoint,
        metrics=trial.last_result.copy(),
        error=self._populate_exception(trial),
    )
    return result
def create_checkpoint(
    preprocessor: Optional[Preprocessor] = None, config: Optional[dict] = None
) -> Checkpoint:
    rl_trainer = RLTrainer(
        algorithm=_DummyAlgo,
        config=config or {},
        preprocessor=preprocessor,
    )
    rl_trainable_cls = rl_trainer.as_trainable()
    rl_trainable = rl_trainable_cls()

    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_file = rl_trainable.save(checkpoint_dir)
        checkpoint_path = TrainableUtil.find_checkpoint_dir(checkpoint_file)
        checkpoint_data = Checkpoint.from_directory(checkpoint_path).to_dict()

    return Checkpoint.from_dict(checkpoint_data)
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def _maybe_save_to_cloud(self, checkpoint_dir: str) -> bool:
    if not self.uses_cloud_checkpointing:
        return False

    if self.custom_syncer:
        self.custom_syncer.sync_up(
            checkpoint_dir, self._storage_path(checkpoint_dir)
        )
        self.custom_syncer.wait_or_retry()
        return True

    checkpoint = Checkpoint.from_directory(checkpoint_dir)
    retry_fn(
        lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
        subprocess.CalledProcessError,
        num_retries=3,
        sleep_time=1,
    )
    return True
def _maybe_save_to_cloud(self, checkpoint_dir: str):
    # Derived classes like the FunctionRunner might call this
    if self.uses_cloud_checkpointing:
        if self.storage_client:
            # Keep for backwards compatibility, remove after deprecation
            self.storage_client.sync_up(
                checkpoint_dir, self._storage_path(checkpoint_dir)
            )
            self.storage_client.wait_or_retry()
            return

        checkpoint = Checkpoint.from_directory(checkpoint_dir)
        retry_fn(
            lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
        )
def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = XGBoostPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the Trainers.
        # The model is saved to disk instead of directly into the dict as
        # bytes because all callbacks follow the save-to-disk logic. GBDT
        # models are small enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = XGBoostPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr
def train_convnet(config):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a
    # checkpoint. Load the model state and iteration step from the checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # We need to create a directory under the current working
            # directory to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")
        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
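# Hedged launch sketch for train_convnet above (illustrative, not part of the
# original snippet): run it with Ray Tune and stop after a few iterations,
# since the training loop itself runs forever. The param_space values are
# assumptions.
from ray import air, tune

tuner = tune.Tuner(
    train_convnet,
    param_space={"lr": 0.01, "momentum": 0.9},
    run_config=air.RunConfig(stop={"training_iteration": 10}),
)
results = tuner.fit()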
def test_batchpredictor_runtime(num_scoring_workers, ray_start_4_cpus, create_data, tmpdir):
    """Test BatchPredictor runtimes with different numbers of workers.

    Should always be below 15s on an m5 instance.
    """
    data = ray.data.range(1024).repartition(64)

    path = create_data
    checkpoint = Checkpoint.from_directory(str(path))

    results = []
    print("start test")
    for i in range(NUM_REPEATS):
        start_time = time.time()
        run_predictor(checkpoint, data, BatchPredictor, num_scoring_workers)
        runtime = time.time() - start_time
        results.append(runtime)
        gc.collect()

    print(f"results: {results} min: {min(results)}, stddev: {np.std(results)}")

    # Should take less than 15 seconds in the best case.
    assert min(results) < 15
def to_air_checkpoint(
    path: str,
    booster: lightgbm.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where the model and preprocessor steps are
            stored.
        booster: A pretrained lightgbm model.
        preprocessor: A fitted preprocessor. The preprocessing logic will be
            applied at serving/inference time.

    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)

    return checkpoint
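# Hedged usage sketch for the LightGBM helper above. The toy dataset and the
# temporary directory are illustrative; any fitted lightgbm.Booster works.
import tempfile

import lightgbm
import numpy as np

train_set = lightgbm.Dataset(np.random.rand(32, 4), label=np.random.randint(0, 2, 32))
booster = lightgbm.train({"objective": "binary"}, train_set, num_boost_round=2)

with tempfile.TemporaryDirectory() as tmpdir:
    ckpt = to_air_checkpoint(tmpdir, booster)
    # The directory backs the checkpoint, so consume it before it is removed.
    ckpt_dict = ckpt.to_dict()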
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir): train_dataset = ray.data.from_pandas(train_df) valid_dataset = ray.data.from_pandas(test_df) class DummyPreprocessor(Preprocessor): def __init__(self): super().__init__() self.is_same = True def fit(self, dataset): self.fitted_ = True def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame": return df trainer = SklearnTrainer( estimator=RandomForestClassifier(), scaling_config=scale_config, label_column="target", datasets={ TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset }, preprocessor=DummyPreprocessor(), ) result = trainer.fit() # Move checkpoint to a different directory. checkpoint_dict = result.checkpoint.to_dict() checkpoint = Checkpoint.from_dict(checkpoint_dict) checkpoint_path = checkpoint.to_directory(tmpdir) resume_from = Checkpoint.from_directory(checkpoint_path) model, preprocessor = load_checkpoint(resume_from) assert hasattr(model, "feature_importances_") assert preprocessor.is_same assert preprocessor.fitted_
def to_air_checkpoint(
    path: str,
    estimator: BaseEstimator,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where the model and preprocessor steps are
            stored.
        estimator: A pretrained model.
        preprocessor: A fitted preprocessor. The preprocessing logic will be
            applied at serving/inference time.

    Returns:
        A Ray AIR checkpoint.
    """
    with open(os.path.join(path, MODEL_KEY), "wb") as f:
        cpickle.dump(estimator, f)

    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)

    checkpoint = Checkpoint.from_directory(path)

    return checkpoint
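# Hedged usage sketch for the sklearn variant above; the estimator and the
# random training data are illustrative assumptions.
import tempfile

import numpy as np
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(n_estimators=5).fit(
    np.random.rand(32, 4), np.random.randint(0, 2, 32)
)

with tempfile.TemporaryDirectory() as tmpdir:
    ckpt = to_air_checkpoint(tmpdir, estimator)
    # Consume the checkpoint while the backing directory still exists.
    ckpt_dict = ckpt.to_dict()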