Python Checkpoint Examples, ray.air.checkpoint.Checkpoint Python Examples

Example #1

0

Show file

File: test_checkpoints.py Project: vishalbelsare/ray

    def test_dict_checkpoint_fs(self):
        """Test conversion from dict to FS checkpoint and back."""
        checkpoint = self._prepare_dict_checkpoint()

        # Convert into fs checkpoint
        path = checkpoint.to_directory()
        self.assertIsInstance(path, str)

        # Create from path
        checkpoint = Checkpoint.from_directory(path)
        self.assertTrue(checkpoint._local_path)

        self._assert_dict_checkpoint(checkpoint)

Example #2

0

Show file

def test_init():
    preprocessor = DummyPreprocessor()
    predictor = TensorflowPredictor(model_definition=build_model,
                                    preprocessor=preprocessor,
                                    model_weights=weights)

    checkpoint = {MODEL_KEY: weights, PREPROCESSOR_KEY: preprocessor}
    checkpoint_predictor = TensorflowPredictor.from_checkpoint(
        Checkpoint.from_dict(checkpoint), build_model)

    assert checkpoint_predictor.model_definition == predictor.model_definition
    assert checkpoint_predictor.model_weights == predictor.model_weights
    assert checkpoint_predictor.preprocessor == predictor.preprocessor

Example #3

0

Show file

File: test_checkpoints.py Project: vishalbelsare/ray

    def test_dict_checkpoint_dict(self):
        """Test conversion from dict to dict checkpoint and back."""
        checkpoint = self._prepare_dict_checkpoint()

        # Convert into dict checkpoint
        data_dict = checkpoint.to_dict()
        self.assertIsInstance(data_dict, dict)

        # Create from dict
        checkpoint = Checkpoint.from_dict(data_dict)
        self.assertTrue(checkpoint._data_dict)

        self._assert_dict_checkpoint(checkpoint)

Example #4

0

Show file

File: test_checkpoints.py Project: vishalbelsare/ray

    def test_dict_checkpoint_bytes(self):
        """Test conversion from dict to bytes checkpoint and back."""
        checkpoint = self._prepare_dict_checkpoint()

        # Convert into bytes checkpoint
        blob = checkpoint.to_bytes()
        self.assertIsInstance(blob, bytes)

        # Create from bytes
        checkpoint = Checkpoint.from_bytes(blob)
        self.assertTrue(checkpoint._data_dict)

        self._assert_dict_checkpoint(checkpoint)

Example #5

0

Show file

def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={
            TRAIN_DATASET_KEY: train_dataset,
            "valid": valid_dataset
        },
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    xgb_model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(xgb_model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={
            TRAIN_DATASET_KEY: train_dataset,
            "valid": valid_dataset
        },
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(model) == 10

Example #6

0

Show file

def test_batch_prediction_fs():
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0}), DummyPredictorFS)

    test_dataset = ray.data.from_items([1.0, 2.0, 3.0, 4.0] *
                                       32).repartition(8)
    assert (batch_predictor.predict(
        test_dataset,
        min_scoring_workers=4).to_pandas().to_numpy().squeeze().tolist() == [
            4.0,
            8.0,
            12.0,
            16.0,
        ] * 32)

Example #7

0

Show file

    def test_fs_checkpoint_uri(self):
        """Test conversion from fs to cloud checkpoint and back."""
        checkpoint = self._prepare_fs_checkpoint()

        # Convert into dict checkpoint
        location = checkpoint.to_uri(self.cloud_uri)
        self.assertIsInstance(location, str)
        self.assertIn("memory://", location)

        # Create from dict
        checkpoint = Checkpoint.from_uri(location)
        self.assertTrue(checkpoint._uri)

        self._assert_fs_checkpoint(checkpoint)

Example #8

0

Show file

    def test_obj_store_cp_as_directory(self):
        checkpoint = self._prepare_dict_checkpoint()

        # Convert into obj ref checkpoint
        obj_ref = checkpoint.to_object_ref()

        # Create from object ref
        checkpoint = Checkpoint.from_object_ref(obj_ref)

        with checkpoint.as_directory() as checkpoint_dir:
            assert os.path.exists(checkpoint_dir)
            assert checkpoint_dir.endswith(obj_ref.hex())

        assert not os.path.exists(checkpoint_dir)

Example #9

0

Show file

File: huggingface_predictor.py Project: parasj/ray

    def from_checkpoint(
        cls,
        checkpoint: Checkpoint,
        *,
        pipeline_cls: Optional[Type[Pipeline]] = None,
        **pipeline_kwargs,
    ) -> "HuggingFacePredictor":
        """Instantiate the predictor from a Checkpoint.

        The checkpoint is expected to be a result of ``HuggingFaceTrainer``.

        Args:
            checkpoint: The checkpoint to load the model, tokenizer and
                preprocessor from. It is expected to be from the result of a
                ``HuggingFaceTrainer`` run.
            pipeline_cls: A ``transformers.pipelines.Pipeline`` class to use.
                If not specified, will use the ``pipeline`` abstraction
                wrapper.
            **pipeline_kwargs: Any kwargs to pass to the pipeline
                initialization. If ``pipeline`` is None, this must contain
                the 'task' argument. Cannot contain 'model'. Can be used
                to override the tokenizer with 'tokenizer'.
        """
        if not pipeline_cls and "task" not in pipeline_kwargs:
            raise ValueError(
                "If `pipeline_cls` is not specified, 'task' must be passed as a kwarg."
            )
        pipeline_cls = pipeline_cls or pipeline_factory
        preprocessor = checkpoint.get_preprocessor()
        with checkpoint.as_directory() as checkpoint_path:
            # Tokenizer will be loaded automatically (no need to specify
            # `tokenizer=checkpoint_path`)
            pipeline = pipeline_cls(model=checkpoint_path, **pipeline_kwargs)
        return cls(
            pipeline=pipeline,
            preprocessor=preprocessor,
        )

Example #10

0

Show file

def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = LightGBMTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={
            TRAIN_DATASET_KEY: train_dataset,
            "valid": valid_dataset
        },
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_

Example #11

0

Show file

def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = SklearnTrainer(
        estimator=RandomForestClassifier(),
        scaling_config=scale_config,
        label_column="target",
        datasets={
            TRAIN_DATASET_KEY: train_dataset,
            "valid": valid_dataset
        },
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert hasattr(model, "feature_importances_")
    assert preprocessor.is_same
    assert preprocessor.fitted_

Example #12

0

Show file

    def _trial_to_result(self, trial: Trial) -> Result:
        if trial.checkpoint.dir_or_data:
            checkpoint_dir = TrainableUtil.find_checkpoint_dir(
                trial.checkpoint.dir_or_data
            )
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
        else:
            checkpoint = None

        result = Result(
            checkpoint=checkpoint,
            metrics=trial.last_result.copy(),
            error=self._populate_exception(trial),
        )
        return result

Example #13

0

Show file

File: test_checkpoints.py Project: parasj/ray

 def testDirCheckpointWithPreprocessor(self):
     with tempfile.TemporaryDirectory() as tmpdir:
         preprocessor = DummyPreprocessor(1)
         data = {"metric": 5}
         checkpoint_dir = os.path.join(tmpdir, "existing_checkpoint")
         os.mkdir(checkpoint_dir, 0o755)
         with open(os.path.join(checkpoint_dir, "test_data.pkl"),
                   "wb") as fp:
             pickle.dump(data, fp)
         with open(os.path.join(checkpoint_dir, PREPROCESSOR_KEY),
                   "wb") as fp:
             pickle.dump(preprocessor, fp)
         checkpoint = Checkpoint.from_directory(checkpoint_dir)
         preprocessor = checkpoint.get_preprocessor()
         assert preprocessor.multiplier == 1

Example #14

0

Show file

File: torch_predictor.py Project: parasj/ray

    def from_checkpoint(
        cls,
        checkpoint: Checkpoint,
        model: Optional[torch.nn.Module] = None,
        use_gpu: bool = False,
    ) -> "TorchPredictor":
        """Instantiate the predictor from a Checkpoint.

        The checkpoint is expected to be a result of ``TorchTrainer``.

        Args:
            checkpoint: The checkpoint to load the model and
                preprocessor from. It is expected to be from the result of a
                ``TorchTrainer`` run.
            model: If the checkpoint contains a model state dict, and not
                the model itself, then the state dict will be loaded to this
                ``model``.
            use_gpu: If set, the model will be moved to GPU on instantiation and
                prediction happens on GPU.
        """
        checkpoint = TorchCheckpoint.from_checkpoint(checkpoint)
        model = checkpoint.get_model(model)
        preprocessor = checkpoint.get_preprocessor()
        return cls(model=model, preprocessor=preprocessor, use_gpu=use_gpu)

Example #15

0

Show file

    def test_fs_checkpoint_obj_store(self):
        """Test conversion from fs to obj store checkpoint and back."""
        if not ray.is_initialized():
            ray.init()

        checkpoint = self._prepare_fs_checkpoint()

        # Convert into obj ref checkpoint
        obj_ref = checkpoint.to_object_ref()

        # Create from object ref
        checkpoint = Checkpoint.from_object_ref(obj_ref)
        self.assertIsInstance(checkpoint._obj_ref, ray.ObjectRef)

        self._assert_fs_checkpoint(checkpoint)

Example #16

0

Show file

File: trainable.py Project: parasj/ray

    def save_to_object(self):
        """Saves the current model state to a Python object.

        It also saves to disk but does not return the checkpoint path.

        Returns:
            Object holding checkpoint data.
        """
        temp_container_dir = tempfile.mkdtemp("save_to_object",
                                              dir=self.logdir)
        checkpoint_dir = self.save(temp_container_dir)

        obj_ref = Checkpoint.from_directory(checkpoint_dir).to_bytes()
        shutil.rmtree(temp_container_dir)
        return obj_ref

Example #17

0

Show file

File: test_sklearn_predictor.py Project: parasj/ray

def test_batch_prediction_with_set_cpus(ray_start_4_cpus):
    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)

        checkpoint = Checkpoint.from_directory(tmpdir)

        batch_predictor = BatchPredictor.from_checkpoint(checkpoint, SklearnPredictor)

        test_dataset = ray.data.from_pandas(
            pd.DataFrame(dummy_data, columns=["A", "B"])
        )
        batch_predictor.predict(
            test_dataset, num_cpus_per_worker=2, num_estimator_cpus=2
        )

Example #18

0

Show file

def to_air_checkpoint(
    model: torch.nn.Module, preprocessor: Optional["Preprocessor"] = None
) -> Checkpoint:
    """Convert a pretrained model to AIR checkpoint for serve or inference.

    Args:
        model: A pretrained model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.
    Returns:
        A Ray Air checkpoint.
    """
    checkpoint = Checkpoint.from_dict(
        {PREPROCESSOR_KEY: preprocessor, MODEL_KEY: model}
    )
    return checkpoint

Example #19

0

Show file

def train_func(config):
    step = 0
    width, height = config["width"], config["height"]

    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        step = loaded_checkpoint.to_dict()["step"] + 1

    for step in range(step, 100):
        intermediate_score = evaluation_fn(step, width, height)
        checkpoint = Checkpoint.from_dict({"step": step})
        session.report({
            "iterations": step,
            "mean_loss": intermediate_score
        },
                       checkpoint=checkpoint)

Example #20

0

Show file

def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = session.get_dataset_shard("train")
    validation_dataset = session.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []

    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if session.get_world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        results.append(result)
        session.report(result,
                       checkpoint=Checkpoint.from_dict(dict(model=model)))

    return results

Example #21

0

Show file

File: test_batch_predictor.py Project: parasj/ray

def test_batch_prediction_feature_cols():
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({
            "factor": 2.0,
            PREPROCESSOR_KEY: DummyPreprocessor()
        }),
        DummyPredictor,
    )

    test_dataset = ray.data.from_pandas(
        pd.DataFrame({
            "a": [1, 2, 3],
            "b": [4, 5, 6]
        }))

    assert batch_predictor.predict(test_dataset, feature_columns=[
        "a"
    ]).to_pandas().to_numpy().squeeze().tolist() == [4.0, 8.0, 12.0]

Example #22

0

Show file

File: trainable.py Project: parasj/ray

    def _maybe_save_to_cloud(self, checkpoint_dir: str) -> bool:
        if not self.uses_cloud_checkpointing:
            return False

        if self.custom_syncer:
            self.custom_syncer.sync_up(checkpoint_dir,
                                       self._storage_path(checkpoint_dir))
            self.custom_syncer.wait_or_retry()
            return True

        checkpoint = Checkpoint.from_directory(checkpoint_dir)
        retry_fn(
            lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
            subprocess.CalledProcessError,
            num_retries=3,
            sleep_time=1,
        )
        return True

Example #23

0

Show file

def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        with open(os.path.join(tmpdir, MODEL_KEY), "wb") as f:
            cpickle.dump(model, f)
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = SklearnPredictor.from_checkpoint(checkpoint)

    assert np.allclose(
        checkpoint_predictor.estimator.feature_importances_,
        predictor.estimator.feature_importances_,
    )
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr

Example #24

0

Show file

    def test_fs_checkpoint_uri_pa(self):
        """Test conversion from fs to cloud checkpoint and back."""
        checkpoint = self._prepare_fs_checkpoint()

        # Clean up mock bucket
        delete_at_uri(self.cloud_uri_pa)
        _ensure_directory(self.cloud_uri_pa)

        # Convert into dict checkpoint
        location = checkpoint.to_uri(self.cloud_uri_pa)
        self.assertIsInstance(location, str)
        self.assertIn("mock://", location)

        # Create from dict
        checkpoint = Checkpoint.from_uri(location)
        self.assertTrue(checkpoint._uri)

        self._assert_fs_checkpoint(checkpoint)

Example #25

0

Show file

    def _maybe_save_to_cloud(self, checkpoint_dir: str):
        # Derived classes like the FunctionRunner might call this
        if self.uses_cloud_checkpointing:
            if self.storage_client:
                # Keep for backwards compatibility, remove after deprecation
                self.storage_client.sync_up(
                    checkpoint_dir, self._storage_path(checkpoint_dir)
                )
                self.storage_client.wait_or_retry()
                return

            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            retry_fn(
                lambda: checkpoint.to_uri(self._storage_path(checkpoint_dir)),
                subprocess.CalledProcessError,
                num_retries=3,
                sleep_time=1,
            )

Example #26

0

Show file

def load_checkpoint(
    checkpoint: Checkpoint,
) -> Tuple[xgboost.Booster, Optional["Preprocessor"]]:
    """Load a Checkpoint from ``XGBoostTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``XGBoostTrainer`` run.

    Returns:
        The model and AIR preprocessor contained within.
    """
    with checkpoint.as_directory() as checkpoint_path:
        xgb_model = xgboost.Booster()
        xgb_model.load_model(os.path.join(checkpoint_path, MODEL_KEY))
        preprocessor = load_preprocessor_from_dir(checkpoint_path)

    return xgb_model, preprocessor

Example #27

0

Show file

def load_checkpoint(
    checkpoint: Checkpoint,
) -> Tuple[lightgbm.Booster, Optional["Preprocessor"]]:
    """Load a Checkpoint from ``LightGBMTrainer``.

    Args:
        checkpoint: The checkpoint to load the model and
            preprocessor from. It is expected to be from the result of a
            ``LightGBMTrainer`` run.

    Returns:
        The model and AIR preprocessor contained within.
    """
    with checkpoint.as_directory() as checkpoint_path:
        lgbm_model = lightgbm.Booster(
            model_file=os.path.join(checkpoint_path, MODEL_KEY))
        preprocessor = load_preprocessor_from_dir(checkpoint_path)

    return lgbm_model, preprocessor

Example #28

0

Show file

def test_init():
    preprocessor = DummyPreprocessor()
    preprocessor.attr = 1
    predictor = XGBoostPredictor(model=model, preprocessor=preprocessor)

    with tempfile.TemporaryDirectory() as tmpdir:
        # This somewhat convoluted procedure is the same as in the
        # Trainers. The reason for saving model to disk instead
        # of directly to the dict as bytes is due to all callbacks
        # following save to disk logic. GBDT models are small
        # enough that IO should not be an issue.
        model.save_model(os.path.join(tmpdir, MODEL_KEY))
        save_preprocessor_to_dir(preprocessor, tmpdir)

        checkpoint = Checkpoint.from_directory(tmpdir)
        checkpoint_predictor = XGBoostPredictor.from_checkpoint(checkpoint)

    assert get_num_trees(checkpoint_predictor.model) == get_num_trees(predictor.model)
    assert checkpoint_predictor.preprocessor.attr == predictor.preprocessor.attr

Example #29

0

Show file

File: test_batch_predictor.py Project: parasj/ray

def test_automatic_enable_gpu_from_num_gpus_per_worker(shutdown_only):
    """
    Test we automatically set underlying Predictor creation use_gpu to True if
    we found num_gpus_per_worker > 0 in BatchPredictor's predict() call.
    """
    ray.init(num_gpus=1)

    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({
            "factor": 2.0,
            PREPROCESSOR_KEY: DummyPreprocessor()
        }),
        DummyPredictor,
    )
    test_dataset = ray.data.range_table(4)

    with pytest.raises(ValueError,
                       match="DummyPredictor does not support GPU prediction"):
        _ = batch_predictor.predict(test_dataset, num_gpus_per_worker=1)

Example #30

0

Show file

File: pbt_convnet_function_example.py Project: parasj/ray

def train_convnet(config):
    # Create our data loaders, model, and optmizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            # Need to create a directory under current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")

        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)