def test_fs_checkpoint_additional_fields(self):
    checkpoint = self._prepare_fs_checkpoint()

    # Convert to dict
    checkpoint_dict = checkpoint.to_dict()

    # Add field to dict
    checkpoint_dict["additional_field"] = "data"

    # Create new checkpoint object
    checkpoint = Checkpoint.from_dict(checkpoint_dict)

    # Turn into FS
    checkpoint_dir = checkpoint.to_directory()

    assert os.path.exists(os.path.join(checkpoint_dir, "test_data.pkl"))
    assert os.path.exists(os.path.join(checkpoint_dir, "additional_field.meta.pkl"))

    # Add new file
    with open(os.path.join(checkpoint_dir, "even_more.txt"), "w") as f:
        f.write("More\n")

    # Turn into dict
    new_dict = Checkpoint.from_directory(checkpoint_dir).to_dict()
    assert new_dict["additional_field"] == "data"

    # Turn into fs
    new_dir = Checkpoint.from_dict(new_dict).to_directory()

    assert os.path.exists(os.path.join(new_dir, "test_data.pkl"))
    assert os.path.exists(os.path.join(new_dir, "additional_field.meta.pkl"))
    assert os.path.exists(os.path.join(new_dir, "even_more.txt"))

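# Note (inferred from the assertions above, not a documented guarantee): extra
# top-level fields added to a dict checkpoint appear to be persisted as pickled
# `<field>.meta.pkl` files when the checkpoint is written out as a directory,
# so they survive repeated dict <-> directory round trips alongside any
# user-created files in the checkpoint directory.
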
def test_air_integrations_reconfigure(serve_instance):
    path = tempfile.mkdtemp()
    uri = f"file://{path}/test_uri"
    Checkpoint.from_dict({"increment": 2}).to_uri(uri)

    predictor_cls = "ray.serve.tests.test_air_integrations.AdderPredictor"
    additional_config = {
        "checkpoint": {"increment": 5},
        "predictor_cls": "ray.serve.tests.test_air_integrations.AdderPredictor",
    }

    with InputNode() as dag_input:
        m1 = PredictorDeployment.options(user_config=additional_config).bind(
            predictor_cls=predictor_cls,
            checkpoint=uri,
        )
        dag = m1.predict.bind(dag_input)

    deployments = build(Ingress.bind(dag))
    for d in deployments:
        d.deploy()

    resp = requests.post("http://127.0.0.1:8000/ingress", json={"array": [40]})
    print(resp.text)
    resp.raise_for_status()
    # Use an assertion rather than a return value: pytest ignores return
    # values, so a bare comparison could never fail the test.
    assert resp.json() == {"value": [45], "batch_size": 1}

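# Note how the two increments interact above: the deployment is bound with the
# URI checkpoint (increment=2), but `user_config` carries increment=5 and is
# applied through the deployment's `reconfigure` hook (see the `reconfigure`
# implementation further down), so the response is 40 + 5 = 45.
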
def test_preprocessor_in_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)

    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

        def fit(self, dataset):
            self.fitted_ = True

        def _transform_pandas(self, df: "pd.DataFrame") -> "pd.DataFrame":
            return df

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    model, preprocessor = load_checkpoint(resume_from)
    assert get_num_trees(model) == 10
    assert preprocessor.is_same
    assert preprocessor.fitted_

def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    xgb_model, _ = load_checkpoint(checkpoint)
    assert get_num_trees(xgb_model) == 5

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = XGBoostTrainer(
        scaling_config=scale_config,
        label_column="target",
        params=params,
        num_boost_round=5,
        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    checkpoint = result.checkpoint
    model, _ = load_checkpoint(checkpoint)
    # 5 more boosting rounds on resume, on top of the initial 5.
    assert get_num_trees(model) == 10

def test_metadata(self):
    """Test conversion with metadata involved.

    a. convert from fs to dict checkpoint;
    b. add a marker to the dict checkpoint;
    c. convert back to fs checkpoint;
    d. convert back to dict checkpoint.

    Assert that the marker is still there.
    """
    checkpoint = self._prepare_fs_checkpoint()

    # Convert into dict checkpoint
    data_dict = checkpoint.to_dict()
    self.assertIsInstance(data_dict, dict)

    data_dict["my_marker"] = "marked"

    # Create from dict
    checkpoint = Checkpoint.from_dict(data_dict)
    self.assertTrue(checkpoint._data_dict)

    self._assert_fs_checkpoint(checkpoint)

    # Convert back to dict
    data_dict_2 = Checkpoint.from_directory(checkpoint.to_directory()).to_dict()
    assert data_dict_2["my_marker"] == "marked"

def _convert_directory_checkpoint_to_sync_if_needed(
    self, checkpoint: Checkpoint
) -> Checkpoint:
    """Replace the directory checkpoint with a node ip & path dict checkpoint.

    This dict checkpoint will be used to sync the directory. If we were to
    use a directory checkpoint directly, it would get deepcopied & serialized
    unnecessarily.
    """
    with checkpoint.as_directory() as checkpoint_path:
        # Load checkpoint from path.
        checkpoint_path = Path(checkpoint_path).expanduser().absolute()
        if not checkpoint_path.joinpath(TUNE_CHECKPOINT_ID).exists():
            # If the ID file is missing, we assume that this is already
            # a sync checkpoint.
            dict_checkpoint = checkpoint.to_dict()
            if (
                NODE_IP_KEY not in dict_checkpoint
                or CHECKPOINT_PATH_ON_NODE_KEY not in dict_checkpoint
            ):
                raise ValueError(
                    "Wrong checkpoint format. Ensure the checkpoint is a "
                    "result of `HuggingFaceTrainer`."
                )
            return checkpoint
        with open(checkpoint_path.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
            tune_checkpoint_id = int(f.read())

        return Checkpoint.from_dict(
            {
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
                TUNE_CHECKPOINT_ID: tune_checkpoint_id,
            }
        )

def testDataCheckpointSerde(self):
    # Data checkpoints keep the same internal representation, including
    # their data.
    checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    self._testCheckpointSerde(checkpoint, *checkpoint.get_internal_representation())

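# For context, a minimal sketch of what `get_internal_representation()` hands
# to `_testCheckpointSerde` above. This assumes the shape of Ray AIR's
# developer API, where the representation is a (type, data) pair; the exact
# tuple contents may differ between Ray versions:
#
#     checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
#     rep_type, rep_data = checkpoint.get_internal_representation()
#     # For a dict checkpoint, rep_type names the dict-backed variant
#     # (e.g. "data_dict") and rep_data is the original dict itself.
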
def test_simple_adder(serve_instance):
    ModelWrapperDeployment.options(name="Adder").deploy(
        predictor_cls=AdderPredictor,
        checkpoint=Checkpoint.from_dict({"increment": 2}),
    )
    resp = ray.get(send_request.remote(json={"array": [40]}))
    assert resp == {"value": [42], "batch_size": 1}

def reconfigure(self, config):
    """Reconfigure model from config checkpoint"""
    from ray.air.checkpoint import Checkpoint

    predictor_cls = _load_predictor_cls(config["predictor_cls"])
    self.model = predictor_cls.from_checkpoint(
        Checkpoint.from_dict(config["checkpoint"])
    )

def _handle(self, logs: Dict, when: str = None):
    self._counter[when] += 1

    if isinstance(self._frequency, list):
        index = self._on.index(when)
        freq = self._frequency[index]
    else:
        freq = self._frequency

    checkpoint = None
    if freq > 0 and self._counter[when] % freq == 0:
        checkpoint = Checkpoint.from_dict({MODEL_KEY: self.model.get_weights()})

    if not self._metrics:
        report_dict = logs
    else:
        report_dict = {}
        for key in self._metrics:
            if isinstance(self._metrics, dict):
                metric = self._metrics[key]
            else:
                metric = key
            report_dict[key] = logs[metric]

    session.report(report_dict, checkpoint=checkpoint)

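# Reading of the logic above: `_frequency` may be a single int applied to
# every hook, or a list aligned index-for-index with `_on`. As a hypothetical
# example, with `_on=["epoch_end", "batch_end"]` and `_frequency=[1, 0]`, a
# checkpoint is reported on every epoch end and never on batch end (freq == 0
# disables checkpointing for that hook); metrics are reported either way.
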
def test_run(ray_start_4_cpus):
    """Tests that Train can be run without any specific backends."""
    num_workers = 2
    key = "value"
    value = 1
    config = TestConfig()

    def train_func():
        checkpoint = session.get_checkpoint()
        session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
        return checkpoint.to_dict()[key]

    checkpoint = Checkpoint.from_dict(
        {
            # this would be set during checkpoint saving
            "_current_checkpoint_id": 1,
            key: value,
        }
    )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=config,
        resume_from_checkpoint=checkpoint,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()

    assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key]

def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    def train_func():
        checkpoint = session.get_checkpoint()
        if checkpoint:
            epoch = checkpoint.to_dict()["epoch"]
        else:
            epoch = 0
        for i in range(epoch, epoch + 2):
            session.report({"epoch": i}, checkpoint=Checkpoint.from_dict({"epoch": i}))

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=scale_config
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 1

    # Move checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scale_config,
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 2

def test_separate_gpu_stage(shutdown_only):
    ray.init(num_gpus=1)
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: DummyPreprocessor()}),
        DummyPredictor,
    )
    ds = batch_predictor.predict(
        ray.data.range_table(10),
        num_gpus_per_worker=1,
        separate_gpu_stage=True,
        allow_gpu=True,
    )
    stats = ds.stats()
    assert "Stage 1 read->map_batches:" in stats, stats
    assert "Stage 2 map_batches:" in stats, stats
    assert ds.max("value") == 36.0, ds

    ds = batch_predictor.predict(
        ray.data.range_table(10),
        num_gpus_per_worker=1,
        separate_gpu_stage=False,
        allow_gpu=True,
    )
    stats = ds.stats()
    assert "Stage 1 read:" in stats, stats
    assert "Stage 2 map_batches:" in stats, stats
    assert ds.max("value") == 36.0, ds

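# What the stats assertions above check: with `separate_gpu_stage=True`, the
# CPU-side preprocessing is fused into the read stage ("read->map_batches") so
# that only the prediction stage occupies the GPU; with
# `separate_gpu_stage=False`, the read stage stands alone and preprocessing
# runs together with prediction in the single downstream map_batches stage.
# Either way the predicted values are identical.
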
def test_get_and_set_preprocessor():
    """Test that the preprocessor can be set and retrieved."""
    preprocessor = DummyPreprocessor(1)
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: preprocessor}),
        DummyPredictor,
    )
    assert batch_predictor.get_preprocessor() == preprocessor

    test_dataset = ray.data.range(4)
    output_ds = batch_predictor.predict(test_dataset)
    assert output_ds.to_pandas().to_numpy().squeeze().tolist() == [
        0.0,
        2.0,
        4.0,
        6.0,
    ]

    preprocessor2 = DummyPreprocessor(2)
    batch_predictor.set_preprocessor(preprocessor2)
    assert batch_predictor.get_preprocessor() == preprocessor2

    output_ds = batch_predictor.predict(test_dataset)
    assert output_ds.to_pandas().to_numpy().squeeze().tolist() == [
        0.0,
        4.0,
        8.0,
        12.0,
    ]

def test_batch_prediction():
    batch_predictor = BatchPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: DummyPreprocessor()}),
        DummyPredictor,
    )

    test_dataset = ray.data.range(4)
    ds = batch_predictor.predict(test_dataset)
    # Check fusion occurred.
    assert "read->map_batches" in ds.stats(), ds.stats()
    assert ds.to_pandas().to_numpy().squeeze().tolist() == [
        0.0,
        4.0,
        8.0,
        12.0,
    ]

    test_dataset = ray.data.from_items([1.0, 2.0, 3.0, 4.0])
    assert next(
        batch_predictor.predict_pipelined(
            test_dataset, blocks_per_window=2
        ).iter_datasets()
    ).to_pandas().to_numpy().squeeze().tolist() == [
        4.0,
        8.0,
    ]

def test_get_and_set_preprocessor():
    """Test that the preprocessor can be set and retrieved."""
    preprocessor = DummyPreprocessor(1)
    predictor = DummyPredictor.from_checkpoint(
        Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: preprocessor}),
    )
    assert predictor.get_preprocessor() == preprocessor

    test_dataset = pd.DataFrame(range(4))
    output_df = predictor.predict(test_dataset)
    assert output_df.to_numpy().squeeze().tolist() == [
        0.0,
        2.0,
        4.0,
        6.0,
    ]

    preprocessor2 = DummyPreprocessor(2)
    predictor.set_preprocessor(preprocessor2)
    assert predictor.get_preprocessor() == preprocessor2

    output_df = predictor.predict(test_dataset)
    assert output_df.to_numpy().squeeze().tolist() == [
        0.0,
        4.0,
        8.0,
        12.0,
    ]

def train_func():
    checkpoint = session.get_checkpoint()
    if checkpoint:
        epoch = checkpoint.to_dict()["epoch"]
    else:
        epoch = 0
    for i in range(epoch, epoch + 2):
        session.report({"epoch": i}, checkpoint=Checkpoint.from_dict({"epoch": i}))

def testDictCheckpointWithPreprocessorAsDir(self):
    preprocessor = DummyPreprocessor(1)
    data = {"metric": 5, PREPROCESSOR_KEY: preprocessor}
    checkpoint = Checkpoint.from_dict(data)
    checkpoint_path = checkpoint.to_directory()
    checkpoint = Checkpoint.from_directory(checkpoint_path)
    preprocessor = checkpoint.get_preprocessor()
    assert preprocessor.multiplier == 1

def testLocalCheckpointSerde(self):
    # Local checkpoints are converted to bytes on serialization. Currently
    # this is a pickled dict, so we compare with a dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    with source_checkpoint.as_directory() as tmpdir:
        checkpoint = Checkpoint.from_directory(tmpdir)
        self._testCheckpointSerde(
            checkpoint, *source_checkpoint.get_internal_representation()
        )

def test_predictor_from_checkpoint_kwargs(serve_instance):
    PredictorDeployment.options(name="Adder").deploy(
        predictor_cls=AdderPredictor,
        checkpoint=Checkpoint.from_dict({"increment": 2}),
        do_double=True,
    )
    resp = ray.get(send_request.remote(json={"array": [40]}))
    assert resp == {"value": [84], "batch_size": 1}

def _prepare_dict_checkpoint(self) -> Checkpoint:
    # Create checkpoint from dict
    checkpoint = Checkpoint.from_dict(self.checkpoint_dict_data)
    self.assertIsInstance(checkpoint, Checkpoint)
    self.assertTrue(checkpoint._data_dict)
    self.assertEqual(
        checkpoint._data_dict["metric"], self.checkpoint_dict_data["metric"]
    )
    return checkpoint

def testObjRefCheckpointSerde(self):
    # Obj ref checkpoints are dict checkpoints put into the Ray object
    # store, but they have their own data representation (the obj ref).
    # We thus compare with the actual obj ref checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    obj_ref = source_checkpoint.to_object_ref()
    checkpoint = Checkpoint.from_object_ref(obj_ref)
    self._testCheckpointSerde(checkpoint, *checkpoint.get_internal_representation())

def test_init(model, preprocessor):
    predictor = TorchPredictor(model=model, preprocessor=preprocessor)

    checkpoint = {MODEL_KEY: model, PREPROCESSOR_KEY: preprocessor}
    checkpoint_predictor = TorchPredictor.from_checkpoint(
        Checkpoint.from_dict(checkpoint)
    )

    assert checkpoint_predictor.model == predictor.model
    assert checkpoint_predictor.preprocessor == predictor.preprocessor

def testBytesCheckpointSerde(self):
    # Bytes checkpoints are just dict checkpoints constructed
    # from pickled data, so we compare with the source dict checkpoint.
    source_checkpoint = Checkpoint.from_dict({"checkpoint_data": 5})
    blob = source_checkpoint.to_bytes()
    checkpoint = Checkpoint.from_bytes(blob)
    self._testCheckpointSerde(
        checkpoint, *source_checkpoint.get_internal_representation()
    )

def test_mixed_input_output_type_with_batching(serve_instance):
    ModelWrapperDeployment.options(name="Adder").deploy(
        predictor_cls=TakeArrayReturnDataFramePredictor,
        checkpoint=Checkpoint.from_dict({"increment": 2}),
        batching_params=dict(max_batch_size=2, batch_wait_timeout_s=1000),
    )

    refs = [send_request.remote(json={"array": [40, 45]}) for _ in range(2)]
    for resp in ray.get(refs):
        assert resp == [{"col_a": 42.0, "col_b": 47.0}]

def test_batching(serve_instance):
    ModelWrapperDeployment.options(name="Adder").deploy(
        predictor_cls=AdderPredictor,
        checkpoint=Checkpoint.from_dict({"increment": 2}),
        batching_params=dict(max_batch_size=2, batch_wait_timeout_s=1000),
    )

    refs = [send_request.remote(json={"array": [40]}) for _ in range(2)]
    for resp in ray.get(refs):
        assert resp == {"value": [42], "batch_size": 2}

def test_kwargs(predict_pandas_mock):
    checkpoint = Checkpoint.from_dict({"factor": 2.0})
    predictor = DummyPredictor.from_checkpoint(checkpoint)

    input = pd.DataFrame({"x": [1, 2, 3]})
    predictor.predict(input, extra_arg=1)

    # Second element in call_args is the kwargs.
    assert "extra_arg" in predict_pandas_mock.call_args[1]
    assert predict_pandas_mock.call_args[1]["extra_arg"] == 1

def test_predict_array():
    checkpoint = {MODEL_KEY: weights}
    predictor = TensorflowPredictor.from_checkpoint(
        Checkpoint.from_dict(checkpoint), build_model
    )

    data_batch = np.array([[1], [2], [3]])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
    assert predictions.to_numpy().flatten().tolist() == [1, 2, 3]

def test_predict(convert_from_pandas_mock, convert_to_pandas_mock):
    checkpoint = Checkpoint.from_dict({"factor": 2.0})
    predictor = DummyPredictor.from_checkpoint(checkpoint)

    input = pd.DataFrame({"x": [1, 2, 3]})
    expected_output = input * 4.0
    actual_output = predictor.predict(input)
    assert actual_output.equals(expected_output)

    # Ensure the proper conversion functions are called.
    convert_to_pandas_mock.assert_called_once()
    convert_from_pandas_mock.assert_called_once()

def test_model_wrappers_in_pipeline(serve_instance):
    path = tempfile.mkdtemp()
    uri = f"file://{path}/test_uri"
    Checkpoint.from_dict({"increment": 2}).to_uri(uri)

    predictor_cls = "ray.serve.tests.test_model_wrappers.AdderPredictor"

    with InputNode() as dag_input:
        m1 = ModelWrapperDeployment.bind(
            predictor_cls=predictor_cls,
            checkpoint=uri,
        )
        dag = m1.predict.bind(dag_input)

    deployments = build(Ingress.bind(dag))
    for d in deployments:
        d.deploy()

    resp = requests.post("http://127.0.0.1:8000/ingress", json={"array": [40]})
    print(resp.text)
    resp.raise_for_status()
    # Use an assertion rather than a return value: pytest ignores return
    # values, so a bare comparison could never fail the test.
    assert resp.json() == {"value": [42], "batch_size": 1}