def predict(self, data: DataBatchType, **kwargs) -> DataBatchType:
    """Run inference on one batch and return predictions in the input's type.

    The batch is converted to a pandas DataFrame, run through the
    preprocessor when one is set, and handed to ``_predict_pandas``. The
    resulting DataFrame is then converted back using the batch type of
    ``data``.

    Args:
        data: A batch of input data of type ``DataBatchType``.
        kwargs: Arguments specific to predictor implementations. They are
            forwarded unchanged to ``_predict_pandas``.

    Returns:
        DataBatchType: Prediction result, converted to the same batch type
        as ``data``.

    Raises:
        NotImplementedError: If the instance has no ``_preprocessor``
            attribute; the message directs subclasses to call
            ``Predictor.__init__(preprocessor)``.
    """
    frame = convert_batch_type_to_pandas(data)

    # A missing attribute (as opposed to a falsy one) is treated as an
    # incorrectly initialised subclass.
    if not hasattr(self, "_preprocessor"):
        raise NotImplementedError(
            "Subclasses of Predictor must call Predictor.__init__(preprocessor)."
        )

    preprocessor = self._preprocessor
    if preprocessor:
        frame = preprocessor.transform_batch(frame)

    predicted_frame = self._predict_pandas(frame, **kwargs)

    # Look up the output type only after prediction, mirroring the input type.
    target_type = TYPE_TO_ENUM[type(data)]
    return convert_pandas_to_batch_type(predicted_frame, type=target_type)
def test_predict(tmpdir, ray_start_runtime_env, batch_type):
    """Run HuggingFacePredictor.predict in a Ray task, with and without a preprocessor."""
    prompt_batch = convert_pandas_to_batch_type(
        prompts, type=TYPE_TO_ENUM[batch_type]
    )

    @ray.remote
    def test(use_preprocessor):
        os.chdir(tmpdir)
        preprocessor = DummyPreprocessor() if use_preprocessor else None

        # Build the model from its config, then assemble the generation pipeline.
        config = AutoConfig.from_pretrained(model_checkpoint)
        causal_lm = AutoModelForCausalLM.from_config(config)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        generation_pipeline = pipeline(
            task="text-generation",
            model=causal_lm,
            tokenizer=tokenizer,
        )
        predictor = HuggingFacePredictor(
            pipeline=generation_pipeline,
            preprocessor=preprocessor,
        )

        results = predictor.predict(prompt_batch)
        assert len(results) == 3

        if preprocessor:
            # The dummy preprocessor is expected to mark itself when used.
            assert hasattr(predictor.get_preprocessor(), "_batch_transformed")

    ray.get(test.remote(use_preprocessor=True))
    ray.get(test.remote(use_preprocessor=False))
def test_pandas_pandas():
    """A pandas batch compares equal after conversion to pandas and back."""
    frame = pd.DataFrame({"x": [1, 2, 3]})

    converted = convert_batch_type_to_pandas(frame)
    assert frame.equals(converted)

    restored = convert_pandas_to_batch_type(converted, type=DataType.PANDAS)
    assert restored.equals(frame)
def test_dict_pandas():
    """A single-entry dict of arrays becomes a TensorArray column and converts back."""
    column = np.array([1, 2, 3])
    batch = {"x": column}
    expected_frame = pd.DataFrame({"x": TensorArray(column)})

    converted = convert_batch_type_to_pandas(batch)
    assert expected_frame.equals(converted)

    # The NUMPY conversion result is compared directly against the "x" array.
    restored = convert_pandas_to_batch_type(converted, type=DataType.NUMPY)
    assert np.array_equal(restored, column)
def test_arrow_pandas():
    """An Arrow table converts to the DataFrame it was built from, and back."""
    frame = pd.DataFrame({"x": [1, 2, 3]})
    table = pa.Table.from_pandas(frame)

    converted = convert_batch_type_to_pandas(table)
    assert frame.equals(converted)

    restored = convert_pandas_to_batch_type(converted, type=DataType.ARROW)
    assert restored.equals(table)
def test_numpy_multi_dim_pandas():
    """A 3-D ndarray is stored under TENSOR_COLUMN_NAME and converts back intact."""
    tensor = np.arange(12).reshape((3, 2, 2))
    expected_frame = pd.DataFrame({TENSOR_COLUMN_NAME: TensorArray(tensor)})

    converted = convert_batch_type_to_pandas(tensor)
    assert expected_frame.equals(converted)

    restored = convert_pandas_to_batch_type(converted, type=DataType.NUMPY)
    assert np.array_equal(restored, tensor)
def test_predict(batch_type):
    """SklearnPredictor.predict returns one prediction per row and runs the preprocessor."""
    predictor = SklearnPredictor(estimator=model, preprocessor=DummyPreprocessor())

    rows = pd.DataFrame([[1, 2], [3, 4], [5, 6]])
    batch = convert_pandas_to_batch_type(rows, type=TYPE_TO_ENUM[batch_type])

    results = predictor.predict(batch)

    assert len(results) == 3
    # The dummy preprocessor is expected to mark itself when used.
    assert hasattr(predictor.get_preprocessor(), "_batch_transformed")
def test_numpy_object_pandas():
    """An object-dtype array of unequal-length lists converts to pandas and back."""
    ragged = np.array([[1, 2, 3], [1]], dtype=object)
    expected_frame = pd.DataFrame({TENSOR_COLUMN_NAME: TensorArray(ragged)})

    converted = convert_batch_type_to_pandas(ragged)
    assert expected_frame.equals(converted)

    restored = convert_pandas_to_batch_type(converted, type=DataType.NUMPY)
    assert np.array_equal(restored, ragged)
def test_dict_multi_dim_to_pandas():
    """A dict holding one 3-D array becomes a TensorArray column and converts back."""
    tensor = np.arange(12).reshape((3, 2, 2))
    batch = {"x": tensor}
    expected_frame = pd.DataFrame({"x": TensorArray(tensor)})

    converted = convert_batch_type_to_pandas(batch)
    assert expected_frame.equals(converted)

    # The NUMPY conversion result is compared directly against the "x" array.
    restored = convert_pandas_to_batch_type(converted, type=DataType.NUMPY)
    assert np.array_equal(restored, batch["x"])
def test_predict(batch_type):
    """TorchPredictor.predict on a two-column batch yields [1.0, 2.0, 3.0]."""
    predictor = TorchPredictor(model=DummyModelMultiInput())

    features = pd.DataFrame({"X0": [0.0, 0.0, 0.0], "X1": [1.0, 2.0, 3.0]})
    batch = convert_pandas_to_batch_type(features, type=TYPE_TO_ENUM[batch_type])

    # Normalise the result to pandas so the assertions do not depend on batch_type.
    result_frame = convert_batch_type_to_pandas(
        predictor.predict(batch, dtype=torch.float)
    )

    assert len(result_frame) == 3
    flattened = result_frame.to_numpy().flatten().tolist()
    assert flattened == [1.0, 2.0, 3.0]
def test_predict(batch_type):
    """TensorflowPredictor.predict on a two-column batch yields [1.0, 2.0, 3.0]."""
    predictor = TensorflowPredictor(model_definition=build_model_multi_input)

    features = pd.DataFrame({"A": [0.0, 0.0, 0.0], "B": [1.0, 2.0, 3.0]})
    batch = convert_pandas_to_batch_type(features, type=TYPE_TO_ENUM[batch_type])

    # Normalise the result to pandas so the assertions do not depend on batch_type.
    result_frame = convert_batch_type_to_pandas(predictor.predict(batch))

    assert len(result_frame) == 3
    flattened = result_frame.to_numpy().flatten().tolist()
    assert flattened == [1.0, 2.0, 3.0]
def test_arrow_tensor_pandas():
    """An Arrow table with a tensor column converts to a TensorArray column and back."""
    values = np.array([1, 2, 3])
    expected_frame = pd.DataFrame({"x": TensorArray(values)})
    table = pa.Table.from_arrays(
        [ArrowTensorArray.from_numpy(values)],
        names=["x"],
    )

    converted = convert_batch_type_to_pandas(table)
    assert expected_frame.equals(converted)

    restored = convert_pandas_to_batch_type(converted, type=DataType.ARROW)
    assert restored.equals(table)
def test_dict_pandas_multi_column():
    """A two-entry dict of arrays maps to two TensorArray columns and converts back."""
    arrays = {"x": np.array([1, 2, 3]), "y": np.array([4, 5, 6])}

    tensor_columns = {}
    for name, values in arrays.items():
        tensor_columns[name] = TensorArray(values)
    expected_frame = pd.DataFrame(tensor_columns)

    converted = convert_batch_type_to_pandas(arrays)
    assert expected_frame.equals(converted)

    # Each returned entry must match the array it was built from.
    restored = convert_pandas_to_batch_type(converted, type=DataType.NUMPY)
    for name in restored:
        assert np.array_equal(restored[name], arrays[name])
def _predict_pandas(
    self,
    data: pd.DataFrame,
    dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None,
) -> pd.DataFrame:
    """Run ``self.model`` on a pandas batch and return predictions as a DataFrame.

    The DataFrame is converted with
    ``convert_pandas_to_batch_type(data, DataType.NUMPY)``. An ndarray result
    is fed to the model as one tensor; otherwise the result is treated as a
    mapping of column name to array and fed to the model as a dict of tensors.

    Args:
        data: Input batch.
        dtype: Torch dtype to cast the inputs to. May be a single dtype
            applied to every column, or a dict keyed by column name. When a
            dict is given, it must contain an entry for every column used.

    Returns:
        A DataFrame of ``TensorArray`` columns:

        * model returns a dict -> one column per key;
        * model returns a list/tuple -> columns ``output_1``, ``output_2``, ...;
        * otherwise -> a single ``predictions`` column.
    """

    def tensorize(numpy_array, dtype):
        torch_tensor = torch.from_numpy(numpy_array).to(dtype)
        # Off-the-shelf torch Modules expect the input size to have at least 2
        # dimensions (batch_size, feature_size). If the tensor for the column
        # is flattened, then we unsqueeze it to add an extra dimension.
        if len(torch_tensor.size()) == 1:
            torch_tensor = torch_tensor.unsqueeze(dim=1)
        return torch_tensor

    def untensorize(torch_tensor):
        numpy_array = torch_tensor.cpu().detach().numpy()
        return TensorArray(numpy_array)

    tensors = convert_pandas_to_batch_type(data, DataType.NUMPY)

    # Single numpy array.
    if isinstance(tensors, np.ndarray):
        column_name = data.columns[0]
        if isinstance(dtype, dict):
            dtype = dtype[column_name]
        model_input = tensorize(tensors, dtype)
    else:
        model_input = {
            k: tensorize(v, dtype=dtype[k] if isinstance(dtype, dict) else dtype)
            for k, v in tensors.items()
        }

    with torch.no_grad():
        self.model.eval()
        output = self.model(model_input)

    # Handle model multi-output. For example if model outputs 2 images.
    if isinstance(output, dict):
        # Iterate over items(): iterating the dict itself yields only keys,
        # which cannot be unpacked into (name, tensor) pairs.
        return pd.DataFrame({k: untensorize(v) for k, v in output.items()})

    if isinstance(output, (list, tuple)):
        tensor_name = "output_"
        output_dict = {
            tensor_name + str(position): untensorize(tensor)
            for position, tensor in enumerate(output, start=1)
        }
        return pd.DataFrame(output_dict)

    return pd.DataFrame(
        {"predictions": untensorize(output)}, columns=["predictions"]
    )
def test_predict_no_preprocessor(batch_type, batch_size):
    """RLPredictor without a preprocessor returns one action in [1, 2) per row."""
    predictor = RLPredictor.from_checkpoint(create_checkpoint())

    # Observations: batch_size rows of ten 1.0 values.
    observations = pd.DataFrame([[1.0] * 10] * batch_size)
    batch = convert_pandas_to_batch_type(
        observations, type=TYPE_TO_ENUM[batch_type]
    )

    # Predictions, normalised to pandas for the assertions.
    actions = convert_batch_type_to_pandas(predictor.predict(batch))

    assert len(actions) == batch_size
    # We add [0., 1.) to 1.0, so actions should be in [1., 2.)
    for action in np.array(actions):
        assert 1.0 <= action.item() < 2.0
def test_predict_with_preprocessor(batch_type, batch_size):
    """RLPredictor with the dummy preprocessor returns one action in [2, 3) per row."""
    checkpoint = create_checkpoint(preprocessor=_DummyPreprocessor())
    predictor = RLPredictor.from_checkpoint(checkpoint)

    # Observations: batch_size rows of ten 1.0 values.
    observations = pd.DataFrame([[1.0] * 10] * batch_size)
    batch = convert_pandas_to_batch_type(
        observations, type=TYPE_TO_ENUM[batch_type]
    )

    # Predictions, normalised to pandas for the assertions.
    actions = convert_batch_type_to_pandas(predictor.predict(batch))

    assert len(actions) == batch_size
    # Preprocessor doubles observations to 2.0, then we add [0., 1.),
    # so actions should be in [2., 3.)
    for action in np.array(actions):
        assert 2.0 <= action.item() < 3.0
def predict(self, data: DataBatchType, **kwargs) -> DataBatchType:
    """Run inference on one batch and return predictions in the input's type.

    The batch is converted to a pandas DataFrame, transformed by
    ``self.preprocessor`` when that attribute exists and is truthy, and then
    passed to ``_predict_pandas``. The resulting DataFrame is converted back
    using the batch type of ``data``.

    Args:
        data: A batch of input data of type ``DataBatchType``.
        kwargs: Arguments specific to predictor implementations. They are
            forwarded unchanged to ``_predict_pandas``.

    Returns:
        DataBatchType: Prediction result.
    """
    frame = convert_batch_type_to_pandas(data)

    # A missing or falsy preprocessor means the batch is used as-is.
    preprocessor = getattr(self, "preprocessor", None)
    if preprocessor:
        frame = preprocessor.transform_batch(frame)

    predicted_frame = self._predict_pandas(frame, **kwargs)

    # Look up the output type only after prediction, mirroring the input type.
    target_type = TYPE_TO_ENUM[type(data)]
    return convert_pandas_to_batch_type(predicted_frame, type=target_type)
def _predict_pandas(
    self,
    data: pd.DataFrame,
    dtype: Union[TensorDtype, Dict[str, TensorDtype]],
) -> pd.DataFrame:
    """Run the model on a pandas batch and return predictions as a DataFrame.

    The DataFrame is converted with
    ``convert_pandas_to_batch_type(data, DataType.NUMPY)``. An ndarray result
    is turned into one tensor; otherwise the result is treated as a mapping
    of column name to array and turned into a dict of tensors. Conversion and
    inference are delegated to ``self._array_to_tensor``,
    ``self._model_predict`` and ``self._tensor_to_array``
    (presumably supplied by framework-specific subclasses -- confirm).

    Args:
        data: Input batch.
        dtype: Tensor dtype to cast the inputs to. May be a single dtype
            applied to every column, or a dict keyed by column name. When a
            dict is given, it must contain an entry for every column used.

    Returns:
        A DataFrame of ``TensorArray`` columns:

        * model returns a dict -> one column per key;
        * model returns a list/tuple -> columns ``output_00001``,
          ``output_00002``, ... (1-based index, zero-padded to 5 digits);
        * otherwise -> a single ``predictions`` column.
    """
    tensors = convert_pandas_to_batch_type(data, DataType.NUMPY)

    # Single numpy array.
    if isinstance(tensors, np.ndarray):
        column_name = data.columns[0]
        if isinstance(dtype, dict):
            dtype = dtype[column_name]
        model_input = self._array_to_tensor(tensors, dtype)
    else:
        model_input = {
            k: self._array_to_tensor(
                v, dtype=dtype[k] if isinstance(dtype, dict) else dtype
            )
            for k, v in tensors.items()
        }

    output = self._model_predict(model_input)

    # Handle model multi-output. For example if model outputs 2 images.
    if isinstance(output, dict):
        # Iterate over items(): iterating the dict itself yields only keys,
        # which cannot be unpacked into (name, tensor) pairs.
        return pd.DataFrame(
            {k: TensorArray(self._tensor_to_array(v)) for k, v in output.items()}
        )

    if isinstance(output, (list, tuple)):
        tensor_name = "output_"
        output_dict = {
            tensor_name + str(position).zfill(5): TensorArray(
                self._tensor_to_array(tensor)
            )
            for position, tensor in enumerate(output, start=1)
        }
        return pd.DataFrame(output_dict)

    return pd.DataFrame(
        {"predictions": TensorArray(self._tensor_to_array(output))},
        columns=["predictions"],
    )