Example 1
    def run_inference(
        self,
        batch: Sequence[torch.Tensor],
        model: torch.nn.Module,
        inference_args: Optional[Dict[str, Any]] = None
    ) -> Iterable[PredictionResult]:
        """
    Runs inferences on a batch of Tensors and returns an Iterable of
    Tensor Predictions.

    This method stacks the list of Tensors in a vectorized format to optimize
    the inference call.

    Args:
      batch: A sequence of Tensors. These Tensors should be batchable, as this
        method will call `torch.stack()` and pass in batched Tensors with
        dimensions (batch_size, n_features, etc.) into the model's forward()
        function.
      model: A PyTorch model.
      inference_args: Non-batchable arguments required as inputs to the model's
        forward() function. Unlike Tensors in `batch`, these parameters will
        not be dynamically batched.

    Returns:
      An Iterable of type PredictionResult.
    """
        inference_args = {} if not inference_args else inference_args

        batched_tensors = torch.stack(batch)
        batched_tensors = _convert_to_device(batched_tensors, self._device)
        predictions = model(batched_tensors, **inference_args)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
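The stack-then-unbatch pattern above can be exercised standalone. A minimal sketch, assuming a toy DoubleModel module (hypothetical, not part of Beam):

    import torch

    class DoubleModel(torch.nn.Module):
        # Toy stand-in for any torch.nn.Module with a Tensor forward().
        def forward(self, x):
            return x * 2.0

    batch = [torch.tensor([1.0]), torch.tensor([5.0]), torch.tensor([3.0])]
    batched_tensors = torch.stack(batch)          # shape (3, 1)
    predictions = DoubleModel()(batched_tensors)  # one vectorized forward()
    # zip unbatches: each input tensor pairs with its prediction row.
    results = list(zip(batch, predictions))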
Example 2
    def test_pipeline_gcs_model(self):
        with TestPipeline() as pipeline:
            examples = torch.from_numpy(
                np.array([1, 5, 3, 10], dtype="float32").reshape(-1, 1))
            expected_predictions = [
                PredictionResult(ex, pred) for ex, pred in zip(
                    examples,
                    torch.Tensor([example * 2.0 + 0.5
                                  for example in examples]).reshape(-1, 1))
            ]

            gs_pth = 'gs://apache-beam-ml/models/' \
                'pytorch_lin_reg_model_2x+0.5_state_dict.pth'
            model_handler = PytorchModelHandlerTensor(
                state_dict_path=gs_pth,
                model_class=PytorchLinearRegression,
                model_params={
                    'input_dim': 1,
                    'output_dim': 1
                })

            pcoll = pipeline | 'start' >> beam.Create(examples)
            predictions = pcoll | RunInference(model_handler)
            assert_that(
                predictions,
                equal_to(expected_predictions,
                         equals_fn=_compare_prediction_result))
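_compare_prediction_result is defined elsewhere in the test module. A hedged sketch of what such an equality helper could look like for Tensor results (an assumption, not Beam's actual implementation):

    import torch

    def _compare_prediction_result(a, b):
        # PredictionResult pairs the input example with its inference;
        # compare both fields element-wise.
        return (torch.equal(a.example, b.example)
                and torch.equal(a.inference, b.inference))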
Example 3
    def run_inference(self, batch: Sequence[Union[torch.Tensor,
                                                  Dict[str, torch.Tensor]]],
                      model: torch.nn.Module,
                      **kwargs) -> Iterable[PredictionResult]:
        """
    Runs inferences on a batch of Tensors and returns an Iterable of
    Tensor Predictions.

    This method stacks the list of Tensors in a vectorized format to optimize
    the inference call.
    """
        prediction_params = kwargs.get('prediction_params', {})

        # If elements in `batch` are provided as dictionaries mapping keys to
        # Tensors, iterate through the batch and group the Tensors by key.
        if isinstance(batch[0], dict):
            key_to_tensor_list = defaultdict(list)
            for example in batch:
                for key, tensor in example.items():
                    key_to_tensor_list[key].append(tensor)
            key_to_batched_tensors = {}
            for key in key_to_tensor_list:
                batched_tensors = torch.stack(key_to_tensor_list[key])
                batched_tensors = self._convert_to_device(batched_tensors)
                key_to_batched_tensors[key] = batched_tensors
            predictions = model(**key_to_batched_tensors, **prediction_params)
        else:
            # If elements in `batch` are provided as Tensors, then do a regular stack
            batched_tensors = torch.stack(batch)
            batched_tensors = self._convert_to_device(batched_tensors)
            predictions = model(batched_tensors, **prediction_params)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
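The key-grouping step can be seen in isolation. A small sketch with illustrative values, showing two keyed examples collapsing into one batched tensor per key:

    from collections import defaultdict

    import torch

    batch = [{'k1': torch.tensor([1.0]), 'k2': torch.tensor([1.5])},
             {'k1': torch.tensor([5.0]), 'k2': torch.tensor([5.5])}]
    key_to_tensor_list = defaultdict(list)
    for example in batch:
        for key, tensor in example.items():
            key_to_tensor_list[key].append(tensor)
    # Each key now maps to a (batch_size, 1) tensor, ready for
    # model(**key_to_batched_tensors).
    key_to_batched_tensors = {
        key: torch.stack(tensors)
        for key, tensors in key_to_tensor_list.items()
    }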
Example 4
    def test_run_inference_single_tensor_feature(self):
        examples = [
            torch.from_numpy(np.array([1], dtype="float32")),
            torch.from_numpy(np.array([5], dtype="float32")),
            torch.from_numpy(np.array([-3], dtype="float32")),
            torch.from_numpy(np.array([10.0], dtype="float32")),
        ]
        expected_predictions = [
            PredictionResult(ex, pred) for ex, pred in zip(
                examples,
                torch.Tensor([example * 2.0 + 0.5
                              for example in examples]).reshape(-1, 1))
        ]

        model = PytorchLinearRegression(input_dim=1, output_dim=1)
        model.load_state_dict(
            OrderedDict([('linear.weight', torch.Tensor([[2.0]])),
                         ('linear.bias', torch.Tensor([0.5]))]))
        model.eval()

        inference_runner = TestPytorchModelHandlerForInferenceOnly(
            torch.device('cpu'))
        predictions = inference_runner.run_inference(examples, model)
        for actual, expected in zip(predictions, expected_predictions):
            self.assertEqual(actual, expected)
Example 5
 def run_inference(
     self, batch: Sequence[numpy.ndarray], model: BaseEstimator,
     **kwargs) -> Iterable[PredictionResult]:
   # vectorize data for better performance
   vectorized_batch = numpy.stack(batch, axis=0)
   predictions = model.predict(vectorized_batch)
   return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
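The same vectorization step works with any sklearn estimator. A minimal sketch, with illustrative training data, fitting y = 2x + 0.5 and predicting a stacked batch in one call:

    import numpy
    from sklearn.linear_model import LinearRegression

    model = LinearRegression().fit(numpy.array([[0.0], [1.0]]), [0.5, 2.5])
    batch = [numpy.array([1.0]), numpy.array([5.0])]
    vectorized_batch = numpy.stack(batch, axis=0)  # shape (2, 1)
    print(model.predict(vectorized_batch))         # approx. [2.5, 10.5]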
Example 6
    def test_pipeline_pickled(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_model(), file)
        with TestPipeline() as pipeline:
            examples = [numpy.array([0, 0]), numpy.array([1, 1])]

            pcoll = pipeline | 'start' >> beam.Create(examples)
            actual = pcoll | RunInference(
                SklearnModelHandlerNumpy(model_uri=temp_file_name))
            expected = [
                PredictionResult(numpy.array([0, 0]), 0),
                PredictionResult(numpy.array([1, 1]), 1)
            ]
            assert_that(
                actual, equal_to(expected,
                                 equals_fn=_compare_prediction_result))
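build_model is defined elsewhere in the test module. A plausible sketch consistent with the expected predictions above (inputs [0, 0] → 0 and [1, 1] → 1); the actual helper may use a different estimator:

    from sklearn import svm

    def build_model():
        # Tiny classifier that memorizes the two training points.
        x = [[0, 0], [1, 1]]
        y = [0, 1]
        model = svm.SVC()
        model.fit(x, y)
        return model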
Example 7
 def test_predict_output(self):
     fake_model = FakeModel()
     inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
     batched_examples = [
         numpy.array([1, 2, 3]),
         numpy.array([4, 5, 6]),
         numpy.array([7, 8, 9])
     ]
     expected_predictions = [
         PredictionResult(numpy.array([1, 2, 3]), 6),
         PredictionResult(numpy.array([4, 5, 6]), 15),
         PredictionResult(numpy.array([7, 8, 9]), 24)
     ]
     inferences = inference_runner.run_inference(batched_examples,
                                                 fake_model)
     for actual, expected in zip(inferences, expected_predictions):
         self.assertTrue(_compare_prediction_result(actual, expected))
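FakeModel is also defined elsewhere. A hedged sketch consistent with the expected predictions above (each inference equals the sum of the example's features: 1+2+3 = 6, and so on); the real test double may differ:

    import numpy

    class FakeModel:
        def predict(self, input_vector: numpy.ndarray):
            # Sum features per row: (N, 3) in, (N,) out.
            return numpy.sum(input_vector, axis=1)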
Example 8
    def test_pipeline_pandas(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_pandas_pipeline(), file)
        with TestPipeline() as pipeline:
            dataframe = pandas_dataframe()
            splits = [dataframe.loc[[i]] for i in dataframe.index]
            pcoll = pipeline | 'start' >> beam.Create(splits)
            actual = pcoll | RunInference(
                SklearnModelHandlerPandas(model_uri=temp_file_name))

            expected = [
                PredictionResult(splits[0], 5),
                PredictionResult(splits[1], 8),
                PredictionResult(splits[2], 1),
                PredictionResult(splits[3], 1),
                PredictionResult(splits[4], 2),
            ]
            assert_that(
                actual,
                equal_to(expected, equals_fn=_compare_dataframe_predictions))
Example 9
    def test_pipeline_pandas_with_keys(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_pandas_pipeline(), file)
        with TestPipeline() as pipeline:
            data_frame = pandas_dataframe()
            keys = [str(i) for i in range(5)]
            splits = [data_frame.loc[[i]] for i in data_frame.index]
            keyed_rows = [(key, value) for key, value in zip(keys, splits)]

            pcoll = pipeline | 'start' >> beam.Create(keyed_rows)
            actual = pcoll | RunInference(
                KeyedModelHandler(
                    SklearnModelHandlerPandas(model_uri=temp_file_name)))
            expected = [
                ('0', PredictionResult(splits[0], 5)),
                ('1', PredictionResult(splits[1], 8)),
                ('2', PredictionResult(splits[2], 1)),
                ('3', PredictionResult(splits[3], 1)),
                ('4', PredictionResult(splits[4], 2)),
            ]
            assert_that(
                actual,
                equal_to(expected, equals_fn=_compare_dataframe_predictions))
Example 10
  def run_inference(
      self, batch: Sequence[pandas.DataFrame], model: BaseEstimator,
      **kwargs) -> Iterable[PredictionResult]:
    # sklearn_inference currently only supports single-row dataframes.
    for dataframe in batch:
      if dataframe.shape[0] != 1:
        raise ValueError('Only dataframes with single rows are supported.')

    # vectorize data for better performance
    vectorized_batch = pandas.concat(batch, axis=0)
    predictions = model.predict(vectorized_batch)
    splits = [
        vectorized_batch.iloc[[i]] for i in range(vectorized_batch.shape[0])
    ]
    return [
        PredictionResult(example, inference)
        for example, inference in zip(splits, predictions)
    ]
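The concat-then-resplit pattern can be tried with toy single-row dataframes (values illustrative):

    import pandas

    batch = [pandas.DataFrame({'f': [1.0]}),
             pandas.DataFrame({'f': [5.0]})]
    vectorized_batch = pandas.concat(batch, axis=0)  # one 2-row frame
    # Re-split so each PredictionResult carries its original single row.
    splits = [
        vectorized_batch.iloc[[i]] for i in range(vectorized_batch.shape[0])
    ]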
Example 11
    def run_inference(
        self,
        batch: Sequence[Dict[str, torch.Tensor]],
        model: torch.nn.Module,
        inference_args: Optional[Dict[str, Any]] = None
    ) -> Iterable[PredictionResult]:
        """
    Runs inferences on a batch of Keyed Tensors and returns an Iterable of
    Tensor Predictions.

    For the same key across all examples, this will stack all Tensor values
    in a vectorized format to optimize the inference call.

    Args:
      batch: A sequence of keyed Tensors. These Tensors should be batchable,
        as this method will call `torch.stack()` and pass in batched Tensors
        with dimensions (batch_size, n_features, etc.) into the model's
        forward() function.
      model: A PyTorch model.
      inference_args: Non-batchable arguments required as inputs to the model's
        forward() function. Unlike Tensors in `batch`, these parameters will
        not be dynamically batched.

    Returns:
      An Iterable of type PredictionResult.
    """
        inference_args = {} if not inference_args else inference_args

        # Elements in `batch` are dictionaries mapping keys to Tensors;
        # iterate through the batch and group the Tensors by key.
        key_to_tensor_list = defaultdict(list)
        for example in batch:
            for key, tensor in example.items():
                key_to_tensor_list[key].append(tensor)
        key_to_batched_tensors = {}
        for key in key_to_tensor_list:
            batched_tensors = torch.stack(key_to_tensor_list[key])
            batched_tensors = _convert_to_device(batched_tensors, self._device)
            key_to_batched_tensors[key] = batched_tensors
        predictions = model(**key_to_batched_tensors, **inference_args)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
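Non-batchable inference_args pass through to forward() once per batch, never stacked. A minimal sketch with a hypothetical ScaleModel and scale argument:

    import torch

    class ScaleModel(torch.nn.Module):
        def forward(self, k1, scale):
            return k1 * scale

    key_to_batched_tensors = {
        'k1': torch.stack([torch.tensor([1.0]), torch.tensor([2.0])])
    }
    # `scale` applies to the whole batch; only the Tensors were stacked.
    predictions = ScaleModel()(**key_to_batched_tensors, scale=10.0)
    # tensor([[10.], [20.]])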
Example 12
    def run_inference(
        self,
        batch: Sequence[numpy.ndarray],
        model: BaseEstimator,
        inference_args: Optional[Dict[str, Any]] = None
    ) -> Iterable[PredictionResult]:
        """Runs inferences on a batch of numpy arrays.

    Args:
      batch: A sequence of examples as numpy arrays. They should
        be single examples.
      model: A sklearn model or pipeline. Must implement predict(X),
        where the parameter X is a numpy array.
      inference_args: Any additional arguments for an inference.

    Returns:
      An Iterable of type PredictionResult.
    """
        _validate_inference_args(inference_args)
        # vectorize data for better performance
        vectorized_batch = numpy.stack(batch, axis=0)
        predictions = model.predict(vectorized_batch)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
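_validate_inference_args is not shown here. A hypothetical sketch of its intent: the sklearn handlers accept no extra model arguments, so a non-empty dict is rejected (Beam's actual implementation and message may differ):

    def _validate_inference_args(inference_args):
        # sklearn's predict() takes no extra keyword arguments.
        if inference_args:
            raise ValueError(
                'inference_args were provided, but should be None because '
                'sklearn models do not accept extra arguments in predict().')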
Example 13
    def run_inference(
        self,
        batch: Sequence[pandas.DataFrame],
        model: BaseEstimator,
        inference_args: Optional[Dict[str, Any]] = None
    ) -> Iterable[PredictionResult]:
        """
    Runs inferences on a batch of pandas dataframes.

    Args:
      batch: A sequence of examples as pandas dataframes. Each
        dataframe should contain a single row.
      model: A sklearn model or pipeline. Must implement predict(X),
        where the parameter X is a pandas dataframe.
      inference_args: Any additional arguments for an inference.

    Returns:
      An Iterable of type PredictionResult.
    """
        _validate_inference_args(inference_args)
        # sklearn_inference currently only supports single-row dataframes.
        for dataframe in batch:
            if dataframe.shape[0] != 1:
                raise ValueError(
                    'Only dataframes with single rows are supported.')

        # vectorize data for better performance
        vectorized_batch = pandas.concat(batch, axis=0)
        predictions = model.predict(vectorized_batch)
        splits = [
            vectorized_batch.iloc[[i]]
            for i in range(vectorized_batch.shape[0])
        ]
        return [
            PredictionResult(example, inference)
            for example, inference in zip(splits, predictions)
        ]
Example 14
import unittest

import numpy as np

try:
    import torch
except ImportError:
    raise unittest.SkipTest('PyTorch dependencies are not installed')

try:
    from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
except ImportError:
    GCSFileSystem = None  # type: ignore

TWO_FEATURES_EXAMPLES = [
    torch.from_numpy(np.array([1, 5], dtype="float32")),
    torch.from_numpy(np.array([3, 10], dtype="float32")),
    torch.from_numpy(np.array([-14, 0], dtype="float32")),
    torch.from_numpy(np.array([0.5, 0.5], dtype="float32")),
]

TWO_FEATURES_PREDICTIONS = [
    PredictionResult(ex, pred) for ex, pred in zip(
        TWO_FEATURES_EXAMPLES,
        torch.Tensor(
            [f1 * 2.0 + f2 * 3 + 0.5
             for f1, f2 in TWO_FEATURES_EXAMPLES]).reshape(-1, 1))
]

KEYED_TORCH_EXAMPLES = [
    {
        'k1': torch.from_numpy(np.array([1], dtype="float32")),
        'k2': torch.from_numpy(np.array([1.5], dtype="float32"))
    },
    {
        'k1': torch.from_numpy(np.array([5], dtype="float32")),
        'k2': torch.from_numpy(np.array([5.5], dtype="float32"))
    },