Example #1
    def test_pipeline_local_model_extra_inference_args(self):
        with TestPipeline() as pipeline:
            inference_args = {
                'prediction_param_array': torch.from_numpy(
                    np.array([1, 2], dtype="float32")),
                'prediction_param_bool': True,
            }

            state_dict = OrderedDict([('linear.weight', torch.Tensor([[2.0]])),
                                      ('linear.bias', torch.Tensor([0.5]))])
            path = os.path.join(self.tmpdir, 'my_state_dict_path')
            torch.save(state_dict, path)

            model_handler = PytorchModelHandlerKeyedTensor(
                state_dict_path=path,
                model_class=
                PytorchLinearRegressionKeyedBatchAndExtraInferenceArgs,
                model_params={
                    'input_dim': 1,
                    'output_dim': 1
                })

            pcoll = pipeline | 'start' >> beam.Create(KEYED_TORCH_EXAMPLES)
            inference_args_side_input = (
                pipeline | 'create side' >> beam.Create(inference_args))
            predictions = pcoll | RunInference(
                model_handler=model_handler,
                inference_args=beam.pvalue.AsDict(inference_args_side_input))
            assert_that(
                predictions,
                equal_to(KEYED_TORCH_PREDICTIONS,
                         equals_fn=_compare_prediction_result))
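
The _compare_prediction_result helper that these assert_that calls rely on is never shown in this section; what follows is a hypothetical sketch, assuming PredictionResult pairs whose examples are either plain tensors or keyed-tensor dicts (as in KEYED_TORCH_EXAMPLES above).

# Hypothetical sketch of the _compare_prediction_result equality helper;
# the real helper is defined elsewhere in the test module.
import torch

def _compare_prediction_result(x, y):
    # Keyed-tensor examples arrive as dicts mapping names to tensors.
    if isinstance(x.example, dict):
        example_equal = all(
            torch.equal(value, y.example[key])
            for key, value in x.example.items())
    else:
        example_equal = torch.equal(x.example, y.example)
    return example_equal and torch.equal(x.inference, y.inference)
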
Example #2
    def test_pipeline_gcs_model(self):
        with TestPipeline() as pipeline:
            examples = torch.from_numpy(
                np.array([1, 5, 3, 10], dtype="float32").reshape(-1, 1))
            expected_predictions = [
                PredictionResult(ex, pred) for ex, pred in zip(
                    examples,
                    torch.Tensor([example * 2.0 + 0.5
                                  for example in examples]).reshape(-1, 1))
            ]

            gs_pth = 'gs://apache-beam-ml/models/' \
                'pytorch_lin_reg_model_2x+0.5_state_dict.pth'
            model_handler = PytorchModelHandlerTensor(
                state_dict_path=gs_pth,
                model_class=PytorchLinearRegression,
                model_params={
                    'input_dim': 1,
                    'output_dim': 1
                })

            pcoll = pipeline | 'start' >> beam.Create(examples)
            predictions = pcoll | RunInference(model_handler)
            assert_that(
                predictions,
                equal_to(expected_predictions,
                         equals_fn=_compare_prediction_result))
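
PytorchLinearRegression itself is assumed by this and several other examples; a minimal sketch, consistent with the 'linear.weight'/'linear.bias' state dicts saved in the local-model tests:

# Hypothetical sketch of the PytorchLinearRegression model class assumed by
# the handlers above: a single nn.Linear layer, so its state dict holds
# 'linear.weight' and 'linear.bias'.
import torch

class PytorchLinearRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)
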
Example #3
def run(argv=None, save_main_session=True):
    """Entry point. Defines and runs the pipeline."""
    known_args, pipeline_args = parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # In this example we pass keyed inputs to the RunInference transform,
    # so we wrap SklearnModelHandlerNumpy in a KeyedModelHandler.
    model_loader = KeyedModelHandler(
        SklearnModelHandlerNumpy(model_file_type=ModelFileType.PICKLE,
                                 model_uri=known_args.model_path))

    with beam.Pipeline(options=pipeline_options) as p:
        label_pixel_tuple = (p
                             | "ReadFromInput" >> beam.io.ReadFromText(
                                 known_args.input, skip_header_lines=1)
                             | "PreProcessInputs" >> beam.Map(process_input))

        predictions = (label_pixel_tuple
                       | "RunInference" >> RunInference(model_loader)
                       | "PostProcessOutputs" >> beam.ParDo(PostProcessor()))

        _ = predictions | "WriteOutput" >> beam.io.WriteToText(
            known_args.output,
            shard_name_template='',
            append_trailing_newlines=True)
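
The process_input and PostProcessor helpers are assumed above; the sketches below are plausible stand-ins, assuming the input file holds CSV rows of the form label,pixel,pixel,... and that keyed predictions are written back out as label,prediction lines.

# Hypothetical sketches of the helpers used in the pipeline above.
import apache_beam as beam
import numpy

def process_input(row):
    # Split a CSV row into (label, feature vector) so that RunInference
    # receives keyed numpy examples.
    fields = row.split(',')
    label, pixels = int(fields[0]), [int(pixel) for pixel in fields[1:]]
    return label, numpy.array(pixels)

class PostProcessor(beam.DoFn):
    def process(self, element):
        # element is a (label, PredictionResult) pair.
        label, prediction_result = element
        yield '{},{}'.format(label, prediction_result.inference)
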
Example #4
    def test_pipeline_local_model(self):
        with TestPipeline() as pipeline:
            examples = torch.from_numpy(
                np.array([1, 5, 3, 10, -14, 0, 0.5, 0.5],
                         dtype="float32")).reshape(-1, 2)
            expected_predictions = [
                PredictionResult(ex, pred) for ex, pred in zip(
                    examples,
                    torch.Tensor(
                        [f1 * 2.0 + f2 * 3 + 0.5
                         for f1, f2 in examples]).reshape(-1, 1))
            ]

            state_dict = OrderedDict([
                ('linear.weight', torch.Tensor([[2.0, 3]])),
                ('linear.bias', torch.Tensor([0.5]))
            ])
            path = os.path.join(self.tmpdir, 'my_state_dict_path')
            torch.save(state_dict, path)

            model_loader = PytorchModelLoader(
                state_dict_path=path,
                model_class=PytorchLinearRegression(input_dim=2, output_dim=1))

            pcoll = pipeline | 'start' >> beam.Create(examples)
            predictions = pcoll | RunInference(model_loader)
            assert_that(
                predictions,
                equal_to(expected_predictions,
                         equals_fn=_compare_prediction_result))
Example #5
    def test_bad_file_raises(self):
        with self.assertRaises(RuntimeError):
            with TestPipeline() as pipeline:
                examples = [numpy.array([0, 0])]
                pcoll = pipeline | 'start' >> beam.Create(examples)
                _ = pcoll | RunInference(
                    SklearnModelHandlerNumpy(model_uri='/var/bad_file_name'))
                pipeline.run()
Example #6
def run(argv=None,
        model_class=None,
        model_params=None,
        save_main_session=True):
    """
  Args:
    argv: Command line arguments defined for this example.
    model_class: Reference to the class definition of the model.
                If None, MobilenetV2 will be used as default .
    model_params: Parameters passed to the constructor of the model_class.
                  These will be used to instantiate the model object in the
                  RunInference API.
  """
    known_args, pipeline_args = parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    if not model_class:
        model_class = MobileNetV2
        model_params = {'num_classes': 1000}

    # In this example we pass keyed inputs to the RunInference transform,
    # so we wrap PytorchModelHandler in a KeyedModelHandler.
    model_handler = KeyedModelHandler(
        PytorchModelHandler(state_dict_path=known_args.model_state_dict_path,
                            model_class=model_class,
                            model_params=model_params))

    with beam.Pipeline(options=pipeline_options) as p:
        filename_value_pair = (
            p
            | 'ReadImageNames' >> beam.io.ReadFromText(known_args.input,
                                                       skip_header_lines=1)
            | 'ReadImageData' >> beam.Map(lambda image_name: read_image(
                image_file_name=image_name, path_to_dir=known_args.images_dir))
            | 'PreprocessImages' >> beam.MapTuple(
                lambda file_name, data: (file_name, preprocess_image(data))))
        predictions = (filename_value_pair
                       | 'PyTorchRunInference' >> RunInference(model_handler)
                       | 'ProcessOutput' >> beam.ParDo(PostProcessor()))

        if known_args.output:
            predictions | "WriteOutputToGCS" >> beam.io.WriteToText(  # pylint: disable=expression-not-assigned
                known_args.output,
                shard_name_template='',
                append_trailing_newlines=True)
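
The read_image and preprocess_image helpers are assumed above; hedged sketches follow, assuming RGB inputs and the standard ImageNet preprocessing that MobileNetV2 expects.

# Hypothetical sketches of the image helpers used in the pipeline above.
import io
import os

from apache_beam.io.filesystems import FileSystems
from PIL import Image
from torchvision import transforms

def read_image(image_file_name, path_to_dir=None):
    # Return (file_name, PIL image) so downstream steps stay keyed by name.
    if path_to_dir is not None:
        image_file_name = os.path.join(path_to_dir, image_file_name)
    with FileSystems().open(image_file_name, 'r') as file:
        data = Image.open(io.BytesIO(file.read())).convert('RGB')
        return image_file_name, data

def preprocess_image(data):
    # Standard ImageNet preprocessing: resize, convert to tensor, normalize.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    return transform(data)
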
Example #7
    def test_pipeline_pickled(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_model(), file)
        with TestPipeline() as pipeline:
            examples = [numpy.array([0, 0]), numpy.array([1, 1])]

            pcoll = pipeline | 'start' >> beam.Create(examples)
            actual = pcoll | RunInference(
                SklearnModelHandlerNumpy(model_uri=temp_file_name))
            expected = [
                PredictionResult(numpy.array([0, 0]), 0),
                PredictionResult(numpy.array([1, 1]), 1)
            ]
            assert_that(
                actual, equal_to(expected,
                                 equals_fn=_compare_prediction_result))
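
build_model is assumed above; a minimal sketch, assuming a tiny scikit-learn classifier fit so that [0, 0] maps to 0 and [1, 1] maps to 1, matching the expected PredictionResults.

# Hypothetical sketch of the build_model helper pickled above.
from sklearn import svm

def build_model():
    # Two training points are enough for the assertions in the test.
    x = [[0, 0], [1, 1]]
    y = [0, 1]
    model = svm.SVC()
    model.fit(x, y)
    return model
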
Example #8
def run(argv=None,
        model_class=None,
        model_params=None,
        save_main_session=True):
    """
  Args:
    argv: Command line arguments defined for this example.
    model_class: Reference to the class definition of the model.
                If None, maskrcnn_resnet50_fpn will be used as default .
    model_params: Parameters passed to the constructor of the model_class.
                  These will be used to instantiate the model object in the
                  RunInference API.
  """
    known_args, pipeline_args = parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    if not model_class:
        model_class = maskrcnn_resnet50_fpn
        model_params = {'num_classes': 91}

    model_handler = PytorchModelHandlerTensor(
        state_dict_path=known_args.model_state_dict_path,
        model_class=model_class,
        model_params=model_params)

    with beam.Pipeline(options=pipeline_options) as p:
        filename_value_pair = (
            p
            | 'ReadImageNames' >> beam.io.ReadFromText(known_args.input,
                                                       skip_header_lines=1)
            | 'ReadImageData' >> beam.Map(lambda image_name: read_image(
                image_file_name=image_name, path_to_dir=known_args.images_dir))
            | 'PreprocessImages' >> beam.MapTuple(
                lambda file_name, data: (file_name, preprocess_image(data))))
        predictions = (filename_value_pair
                       | 'PyTorchRunInference' >> RunInference(
                           KeyedModelHandler(model_handler))
                       | 'ProcessOutput' >> beam.ParDo(PostProcessor()))

        _ = predictions | "WriteOutput" >> beam.io.WriteToText(
            known_args.output,
            shard_name_template='',
            append_trailing_newlines=True)
Example #9
    def test_pipeline_pandas(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_pandas_pipeline(), file)
        with TestPipeline() as pipeline:
            dataframe = pandas_dataframe()
            splits = [dataframe.loc[[i]] for i in dataframe.index]
            pcoll = pipeline | 'start' >> beam.Create(splits)
            actual = pcoll | RunInference(
                SklearnModelHandlerPandas(model_uri=temp_file_name))

            expected = [
                PredictionResult(splits[0], 5),
                PredictionResult(splits[1], 8),
                PredictionResult(splits[2], 1),
                PredictionResult(splits[3], 1),
                PredictionResult(splits[4], 2),
            ]
            assert_that(
                actual,
                equal_to(expected, equals_fn=_compare_dataframe_predictions))
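
The pandas_dataframe and build_pandas_pipeline fixtures are assumed above. The sketch below only illustrates their likely shape; the real fixtures are what produce the exact predictions asserted, and every column name here is made up.

# Hypothetical sketches of the pandas fixtures used above.
import pandas
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

def pandas_dataframe():
    return pandas.DataFrame({
        'number_1': [1, 2, 3, 4, 5],
        'number_2': [10, 20, 30, 40, 50],
    })

def build_pandas_pipeline():
    # An unrestricted decision tree memorizes five distinct rows, so
    # predicting on splits[i] returns the i-th training label.
    pipeline = Pipeline([('regressor', DecisionTreeRegressor())])
    pipeline.fit(pandas_dataframe(), [5, 8, 1, 1, 2])
    return pipeline
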
Example #10
    def test_invalid_input_type(self):
        with self.assertRaisesRegex(TypeError, "expected Tensor as element"):
            with TestPipeline() as pipeline:
                examples = np.array([1, 5, 3, 10],
                                    dtype="float32").reshape(-1, 1)

                state_dict = OrderedDict([
                    ('linear.weight', torch.Tensor([[2.0]])),
                    ('linear.bias', torch.Tensor([0.5]))
                ])
                path = os.path.join(self.tmpdir, 'my_state_dict_path')
                torch.save(state_dict, path)

                model_loader = PytorchModelLoader(
                    state_dict_path=path,
                    model_class=PytorchLinearRegression(input_dim=1,
                                                        output_dim=1))

                pcoll = pipeline | 'start' >> beam.Create(examples)
                # pylint: disable=expression-not-assigned
                pcoll | RunInference(model_loader)
Example #11
    def test_pipeline_local_model_simple(self):
        with TestPipeline() as pipeline:
            state_dict = OrderedDict([
                ('linear.weight', torch.Tensor([[2.0, 3]])),
                ('linear.bias', torch.Tensor([0.5]))
            ])
            path = os.path.join(self.tmpdir, 'my_state_dict_path')
            torch.save(state_dict, path)

            model_handler = PytorchModelHandlerTensor(
                state_dict_path=path,
                model_class=PytorchLinearRegression,
                model_params={
                    'input_dim': 2,
                    'output_dim': 1
                })

            pcoll = pipeline | 'start' >> beam.Create(TWO_FEATURES_EXAMPLES)
            predictions = pcoll | RunInference(model_handler)
            assert_that(
                predictions,
                equal_to(TWO_FEATURES_PREDICTIONS,
                         equals_fn=_compare_prediction_result))
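
TWO_FEATURES_EXAMPLES and TWO_FEATURES_PREDICTIONS are assumed above; a plausible sketch follows, consistent with the weights [2.0, 3] and bias 0.5 saved in this test and with the inline formula used in Example #4 (the exact example values are an assumption).

# Hypothetical sketch of the two-feature fixtures asserted above, matching
# the linear model y = 2.0 * f1 + 3 * f2 + 0.5.
import numpy as np
import torch
from apache_beam.ml.inference.base import PredictionResult

TWO_FEATURES_EXAMPLES = [
    torch.from_numpy(np.array([1, 5], dtype="float32")),
    torch.from_numpy(np.array([3, 10], dtype="float32")),
    torch.from_numpy(np.array([-14, 0], dtype="float32")),
    torch.from_numpy(np.array([0.5, 0.5], dtype="float32")),
]

TWO_FEATURES_PREDICTIONS = [
    PredictionResult(ex, pred) for ex, pred in zip(
        TWO_FEATURES_EXAMPLES,
        torch.Tensor([f1 * 2.0 + f2 * 3 + 0.5
                      for f1, f2 in TWO_FEATURES_EXAMPLES]).reshape(-1, 1))
]
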
Example #12
    def test_pipeline_pandas_with_keys(self):
        temp_file_name = self.tmpdir + os.sep + 'pickled_file'
        with open(temp_file_name, 'wb') as file:
            pickle.dump(build_pandas_pipeline(), file)
        with TestPipeline() as pipeline:
            data_frame = pandas_dataframe()
            keys = [str(i) for i in range(5)]
            splits = [data_frame.loc[[i]] for i in data_frame.index]
            keyed_rows = [(key, value) for key, value in zip(keys, splits)]

            pcoll = pipeline | 'start' >> beam.Create(keyed_rows)
            actual = pcoll | RunInference(
                KeyedModelHandler(
                    SklearnModelHandlerPandas(model_uri=temp_file_name)))
            expected = [
                ('0', PredictionResult(splits[0], 5)),
                ('1', PredictionResult(splits[1], 8)),
                ('2', PredictionResult(splits[2], 1)),
                ('3', PredictionResult(splits[3], 1)),
                ('4', PredictionResult(splits[4], 2)),
            ]
            assert_that(
                actual,
                equal_to(expected, equals_fn=_compare_dataframe_predictions))
Example #13
def run(argv=None, model_class=None, model_params=None, save_main_session=True):
  """
  Args:
    argv: Command line arguments defined for this example.
    model_class: Reference to the class definition of the model.
                 If None, BertForMaskedLM will be used as the default.
    model_params: Parameters passed to the constructor of the model_class.
                  These will be used to instantiate the model object in the
                  RunInference API.
  """
  known_args, pipeline_args = parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  if not model_class:
    model_config = BertConfig(is_decoder=False, return_dict=True)
    model_class = BertForMaskedLM
    model_params = {'config': model_config}

  # TODO(https://github.com/apache/beam/issues/21863): Remove once optional
  # batching flag added
  class HuggingFaceStripBatchingWrapper(model_class):
    """Wrapper class to convert output from dict of lists to list of dicts

    The `forward()` function in Hugging Face models doesn't return a
    standard torch.Tensor output. Instead, it can return a dictionary of
    different outputs. To work with the current RunInference implementation,
    which returns a PredictionResult object for each example, we must
    override `forward()` and convert the standard Hugging Face output into
    the appropriate format of List[Dict[str, torch.Tensor]].

    Before:
    output = {
      'logit': torch.FloatTensor of shape
        (batch_size, sequence_length, config.vocab_size),
      'hidden_states': tuple(torch.FloatTensor) of shape
        (batch_size, sequence_length, hidden_size)
    }
    After:
    output = [
      {
        'logit': torch.FloatTensor of shape
          (sequence_length, config.vocab_size),
        'hidden_states': tuple(torch.FloatTensor) of
          shape (sequence_length, hidden_size)
      },
      {
        'logit': torch.FloatTensor of shape
          (sequence_length, config.vocab_size),
        'hidden_states': tuple(torch.FloatTensor) of shape
          (sequence_length, hidden_size)
      },
      ...
    ]
    where len(output) is batch_size
    """
    def forward(self, **kwargs):
      output = super().forward(**kwargs)
      return [dict(zip(output, v)) for v in zip(*output.values())]

  # TODO: Remove once nested tensors https://github.com/pytorch/nestedtensor
  # is officially released.
  class PytorchNoBatchModelHandler(PytorchModelHandlerKeyedTensor):
    """Wrapper to PytorchModelHandler to limit batch size to 1.

    The tokenized strings generated from BertTokenizer may have different
    lengths, which doesn't work with torch.stack() in the current
    RunInference implementation, since stack() requires tensors of the
    same size.

    Restricting max_batch_size to 1 means there is only 1 example per `batch`
    in the run_inference() call.
    """
    def batch_elements_kwargs(self):
      return {'max_batch_size': 1}

  model_handler = PytorchNoBatchModelHandler(
      state_dict_path=known_args.model_state_dict_path,
      model_class=HuggingFaceStripBatchingWrapper,
      model_params=model_params)

  with beam.Pipeline(options=pipeline_options) as p:
    if not known_args.input:
      text = (p | 'CreateSentences' >> beam.Create([
        'The capital of France is Paris .',
        'It is raining cats and dogs .',
        'He looked up and saw the sun and stars .',
        'Today is Monday and tomorrow is Tuesday .',
        'There are 5 coconuts on this palm tree .',
        'The richest person in the world is not here .',
        'Malls are amazing places to shop because you can find everything you need under one roof .', # pylint: disable=line-too-long
        'This audiobook is sure to liquefy your brain .',
        'The secret ingredient to his wonderful life was gratitude .',
        'The biggest animal in the world is the whale .',
      ]))
    else:
      text = (p | 'ReadSentences' >> beam.io.ReadFromText(known_args.input))
    text_and_tokenized_text_tuple = (
        text
        | 'AddMask' >> beam.Map(add_mask_to_last_word)
        | 'TokenizeSentence' >> beam.Map(tokenize_sentence))
    output = (
        text_and_tokenized_text_tuple
        |
        'PyTorchRunInference' >> RunInference(KeyedModelHandler(model_handler))
        | 'ProcessOutput' >> beam.ParDo(PostProcessor()))
    output | "WriteOutput" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned
      known_args.output,
      shard_name_template='',
      append_trailing_newlines=True)
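
The add_mask_to_last_word and tokenize_sentence helpers are assumed above; hedged sketches follow, assuming a pretrained BertTokenizer and sentences whose final period is space-separated, as in the default sentences above.

# Hypothetical sketches of the text helpers used in the pipeline above.
import torch
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def add_mask_to_last_word(text):
    # Keep the original sentence as the key and mask the last word
    # before the trailing period.
    words = text.split()
    return text, ' '.join(words[:-2] + ['[MASK]', words[-1]])

def tokenize_sentence(text_and_mask):
    text, masked_text = text_and_mask
    tokenized = bert_tokenizer.encode_plus(masked_text, return_tensors='pt')
    # Squeeze out the batch dimension: the no-batch handler above feeds
    # one example at a time.
    return text, {k: torch.squeeze(v) for k, v in tokenized.items()}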