Example #1
    def predict(self, data: DataBatchType, **kwargs) -> DataBatchType:
        """Perform inference on a batch of data.

        Args:
            data: A batch of input data of type ``DataBatchType``.
            kwargs: Arguments specific to predictor implementations. These are
                passed directly to ``_predict_pandas``.

        Returns:
            DataBatchType: Prediction result. The return type will be the same as the
                input type.
        """
        data_df = convert_batch_type_to_pandas(data)

        if not hasattr(self, "_preprocessor"):
            raise NotImplementedError(
                "Subclasses of Predictor must call Predictor.__init__(preprocessor)."
            )

        if self._preprocessor:
            data_df = self._preprocessor.transform_batch(data_df)

        predictions_df = self._predict_pandas(data_df, **kwargs)
        return convert_pandas_to_batch_type(
            predictions_df, type=TYPE_TO_ENUM[type(data)]
        )
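
The following is a minimal sketch of how a subclass is meant to plug into the predict() flow above. The ConstantPredictor name and the offset keyword argument are hypothetical, and the import of the Predictor base class is omitted because its module path depends on the Ray version; the point is only that keyword arguments passed to predict() are forwarded to _predict_pandas() and that the return type mirrors the input type.

import pandas as pd


class ConstantPredictor(Predictor):
    """Hypothetical predictor, shown only to illustrate the predict() flow."""

    def _predict_pandas(self, data: pd.DataFrame, offset: float = 0.0) -> pd.DataFrame:
        # One prediction per input row; `offset` arrives via **kwargs from predict().
        return pd.DataFrame({"predictions": [offset] * len(data)})


predictor = ConstantPredictor(preprocessor=None)
# A pandas DataFrame goes in, so a pandas DataFrame comes back out.
output = predictor.predict(pd.DataFrame({"x": [1, 2, 3]}), offset=1.0)
assert output["predictions"].tolist() == [1.0, 1.0, 1.0]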
Example #2
def test_predict(tmpdir, ray_start_runtime_env, batch_type):
    dtype_prompts = convert_pandas_to_batch_type(prompts,
                                                 type=TYPE_TO_ENUM[batch_type])

    @ray.remote
    def test(use_preprocessor):
        os.chdir(tmpdir)
        if use_preprocessor:
            preprocessor = DummyPreprocessor()
        else:
            preprocessor = None
        model_config = AutoConfig.from_pretrained(model_checkpoint)
        model = AutoModelForCausalLM.from_config(model_config)
        predictor = HuggingFacePredictor(
            pipeline=pipeline(
                task="text-generation",
                model=model,
                tokenizer=AutoTokenizer.from_pretrained(tokenizer_checkpoint),
            ),
            preprocessor=preprocessor,
        )

        predictions = predictor.predict(dtype_prompts)

        assert len(predictions) == 3
        if preprocessor:
            assert hasattr(predictor.get_preprocessor(), "_batch_transformed")

    ray.get(test.remote(use_preprocessor=True))
    ray.get(test.remote(use_preprocessor=False))
Example #3
def test_pandas_pandas():
    input_data = pd.DataFrame({"x": [1, 2, 3]})
    expected_output = input_data
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    assert convert_pandas_to_batch_type(
        actual_output, type=DataType.PANDAS).equals(input_data)
Example #4
def test_dict_pandas():
    input_data = {"x": np.array([1, 2, 3])}
    expected_output = pd.DataFrame({"x": TensorArray(input_data["x"])})
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    output_array = convert_pandas_to_batch_type(actual_output,
                                                type=DataType.NUMPY)
    assert np.array_equal(output_array, input_data["x"])
Example #5
def test_arrow_pandas():
    df = pd.DataFrame({"x": [1, 2, 3]})
    input_data = pa.Table.from_pandas(df)
    expected_output = df
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    assert convert_pandas_to_batch_type(actual_output,
                                        type=DataType.ARROW).equals(input_data)
Example #6
def test_numpy_multi_dim_pandas():
    input_data = np.arange(12).reshape((3, 2, 2))
    expected_output = pd.DataFrame(
        {TENSOR_COLUMN_NAME: TensorArray(input_data)})
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    assert np.array_equal(
        convert_pandas_to_batch_type(actual_output, type=DataType.NUMPY),
        input_data)
Example #7
def test_predict(batch_type):
    preprocessor = DummyPreprocessor()
    predictor = SklearnPredictor(estimator=model, preprocessor=preprocessor)

    raw_batch = pd.DataFrame([[1, 2], [3, 4], [5, 6]])
    data_batch = convert_pandas_to_batch_type(raw_batch, type=TYPE_TO_ENUM[batch_type])
    predictions = predictor.predict(data_batch)

    assert len(predictions) == 3
    assert hasattr(predictor.get_preprocessor(), "_batch_transformed")
Example #8
def test_numpy_object_pandas():
    input_data = np.array([[1, 2, 3], [1]], dtype=object)
    expected_output = pd.DataFrame(
        {TENSOR_COLUMN_NAME: TensorArray(input_data)})
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    assert np.array_equal(
        convert_pandas_to_batch_type(actual_output, type=DataType.NUMPY),
        input_data)
Example #9
def test_dict_multi_dim_to_pandas():
    tensor = np.arange(12).reshape((3, 2, 2))
    input_data = {"x": tensor}
    expected_output = pd.DataFrame({"x": TensorArray(tensor)})
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    output_array = convert_pandas_to_batch_type(actual_output,
                                                type=DataType.NUMPY)
    assert np.array_equal(output_array, input_data["x"])
Example #10
def test_predict(batch_type):
    predictor = TorchPredictor(model=DummyModelMultiInput())

    raw_batch = pd.DataFrame({"X0": [0.0, 0.0, 0.0], "X1": [1.0, 2.0, 3.0]})
    data_batch = convert_pandas_to_batch_type(raw_batch, type=TYPE_TO_ENUM[batch_type])
    raw_predictions = predictor.predict(data_batch, dtype=torch.float)
    predictions = convert_batch_type_to_pandas(raw_predictions)

    assert len(predictions) == 3
    assert predictions.to_numpy().flatten().tolist() == [1.0, 2.0, 3.0]
Example #11
def test_predict(batch_type):
    predictor = TensorflowPredictor(model_definition=build_model_multi_input)

    raw_batch = pd.DataFrame({"A": [0.0, 0.0, 0.0], "B": [1.0, 2.0, 3.0]})
    data_batch = convert_pandas_to_batch_type(raw_batch,
                                              type=TYPE_TO_ENUM[batch_type])
    raw_predictions = predictor.predict(data_batch)
    predictions = convert_batch_type_to_pandas(raw_predictions)

    assert len(predictions) == 3
    assert predictions.to_numpy().flatten().tolist() == [1.0, 2.0, 3.0]
Example #12
def test_arrow_tensor_pandas():
    np_array = np.array([1, 2, 3])
    df = pd.DataFrame({"x": TensorArray(np_array)})
    input_data = pa.Table.from_arrays([ArrowTensorArray.from_numpy(np_array)],
                                      names=["x"])
    expected_output = df
    actual_output = convert_batch_type_to_pandas(input_data)
    assert expected_output.equals(actual_output)

    assert convert_pandas_to_batch_type(actual_output,
                                        type=DataType.ARROW).equals(input_data)
Example #13
def test_dict_pandas_multi_column():
    array_dict = {"x": np.array([1, 2, 3]), "y": np.array([4, 5, 6])}
    expected_output = pd.DataFrame(
        {k: TensorArray(v)
         for k, v in array_dict.items()})
    actual_output = convert_batch_type_to_pandas(array_dict)
    assert expected_output.equals(actual_output)

    output_dict = convert_pandas_to_batch_type(actual_output,
                                               type=DataType.NUMPY)
    for k, v in output_dict.items():
        assert np.array_equal(v, array_dict[k])
Example #14
    def _predict_pandas(
        self,
        data: pd.DataFrame,
        dtype: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None,
    ) -> pd.DataFrame:
        def tensorize(numpy_array, dtype):
            torch_tensor = torch.from_numpy(numpy_array).to(dtype)

            # Off-the-shelf torch Modules expect the input to have at least 2
            # dimensions (batch_size, feature_size). If the tensor for the column
            # is flattened, then we unsqueeze it to add an extra dimension.
            if len(torch_tensor.size()) == 1:
                torch_tensor = torch_tensor.unsqueeze(dim=1)

            return torch_tensor

        tensors = convert_pandas_to_batch_type(data, DataType.NUMPY)

        # Single numpy array.
        if isinstance(tensors, np.ndarray):
            column_name = data.columns[0]
            if isinstance(dtype, dict):
                dtype = dtype[column_name]
            model_input = tensorize(tensors, dtype)

        else:
            model_input = {
                k: tensorize(v, dtype=dtype[k] if isinstance(dtype, dict) else dtype)
                for k, v in tensors.items()
            }

        with torch.no_grad():
            self.model.eval()
            output = self.model(model_input)

        def untensorize(torch_tensor):
            numpy_array = torch_tensor.cpu().detach().numpy()
            return TensorArray(numpy_array)

        # Handle multi-output models, e.g. a model that outputs two images.
        if isinstance(output, dict):
            return pd.DataFrame({k: untensorize(v) for k, v in output.items()})
        elif isinstance(output, (list, tuple)):
            output_dict = {
                f"output_{i + 1}": untensorize(tensor)
                for i, tensor in enumerate(output)
            }
            return pd.DataFrame(output_dict)
        else:
            return pd.DataFrame(
                {"predictions": untensorize(output)}, columns=["predictions"]
            )
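
A rough sketch of the multi-output branch above: a module that returns a tuple of two tensors should come back from TorchPredictor.predict() as a DataFrame with columns output_1 and output_2 when the input is a DataFrame. The TwoHeadedModule name is hypothetical, and the TorchPredictor import is omitted because its module path depends on the Ray version.

import pandas as pd
import torch
import torch.nn as nn


class TwoHeadedModule(nn.Module):
    """Hypothetical module returning a tuple of two tensors."""

    def forward(self, x):
        return x + 1.0, x * 2.0


predictor = TorchPredictor(model=TwoHeadedModule())
predictions = predictor.predict(pd.DataFrame({"x": [1.0, 2.0, 3.0]}), dtype=torch.float)
# One row per input row, with the tuple elements mapped to "output_1" and "output_2".
assert list(predictions.columns) == ["output_1", "output_2"]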
Example #15
def test_predict_no_preprocessor(batch_type, batch_size):
    checkpoint = create_checkpoint()
    predictor = RLPredictor.from_checkpoint(checkpoint)

    # Observations
    data = pd.DataFrame([[1.0] * 10] * batch_size)
    obs = convert_pandas_to_batch_type(data, type=TYPE_TO_ENUM[batch_type])

    # Predictions
    predictions = predictor.predict(obs)
    actions = convert_batch_type_to_pandas(predictions)

    assert len(actions) == batch_size
    # We add [0., 1.) to 1.0, so actions should be in [1., 2.)
    assert all(1.0 <= action.item() < 2.0 for action in np.array(actions))
Example #16
def test_predict_with_preprocessor(batch_type, batch_size):
    preprocessor = _DummyPreprocessor()
    checkpoint = create_checkpoint(preprocessor=preprocessor)
    predictor = RLPredictor.from_checkpoint(checkpoint)

    # Observations
    data = pd.DataFrame([[1.0] * 10] * batch_size)
    obs = convert_pandas_to_batch_type(data, type=TYPE_TO_ENUM[batch_type])

    # Predictions
    predictions = predictor.predict(obs)
    actions = convert_batch_type_to_pandas(predictions)

    assert len(actions) == batch_size
    # Preprocessor doubles observations to 2.0, then we add [0., 1.),
    # so actions should be in [2., 3.)
    assert all(2.0 <= action.item() < 3.0 for action in np.array(actions))
Example #17
    def predict(self, data: DataBatchType, **kwargs) -> DataBatchType:
        """Perform inference on a batch of data.

        Args:
            data: A batch of input data of type ``DataBatchType``.
            kwargs: Arguments specific to predictor implementations. These are
                passed directly to ``_predict_pandas``.

        Returns:
            DataBatchType: Prediction result.
        """
        data_df = convert_batch_type_to_pandas(data)

        if getattr(self, "preprocessor", None):
            data_df = self.preprocessor.transform_batch(data_df)

        predictions_df = self._predict_pandas(data_df, **kwargs)
        return convert_pandas_to_batch_type(predictions_df,
                                            type=TYPE_TO_ENUM[type(data)])
Example #18
    def _predict_pandas(
            self, data: pd.DataFrame,
            dtype: Union[TensorDtype, Dict[str, TensorDtype]]) -> pd.DataFrame:
        tensors = convert_pandas_to_batch_type(data, DataType.NUMPY)

        # Single numpy array.
        if isinstance(tensors, np.ndarray):
            column_name = data.columns[0]
            if isinstance(dtype, dict):
                dtype = dtype[column_name]
            model_input = self._array_to_tensor(tensors, dtype)

        else:
            model_input = {
                k: self._array_to_tensor(
                    v, dtype=dtype[k] if isinstance(dtype, dict) else dtype)
                for k, v in tensors.items()
            }

        output = self._model_predict(model_input)

        # Handle multi-output models, e.g. a model that outputs two images.
        if isinstance(output, dict):
            return pd.DataFrame(
                {k: TensorArray(self._tensor_to_array(v))
                 for k, v in output.items()})
        elif isinstance(output, (list, tuple)):
            output_dict = {
                "output_" + str(i + 1).zfill(5): TensorArray(
                    self._tensor_to_array(tensor))
                for i, tensor in enumerate(output)
            }
            return pd.DataFrame(output_dict)
        else:
            return pd.DataFrame(
                {"predictions": TensorArray(self._tensor_to_array(output))},
                columns=["predictions"],
            )