def get_inputs(
    record_batch: pa.RecordBatch,
    input_specs: Dict[Text, tf.TypeSpec],
    adapter: Optional[tensor_adapter.TensorAdapter] = None
) -> Optional[Dict[Text, Any]]:
    """Builds the tensors to feed a callable from a record batch.

    If no adapter is given and every input name is a column of the record
    batch, an adapter is created from `input_specs`. No tensors are extracted
    when there is still no adapter, or when the callable looks like it takes
    serialized examples (exactly one input, of dtype string, for which
    `find_input_name_in_features` finds no match among the adapter's type
    specs).

    Args:
      record_batch: Record batch to prepare inputs from.
      input_specs: Input specs keyed by input name.
      adapter: Optional tensor adapter.

    Returns:
      Input tensors keyed by input name, or None when no tensors were
      extracted.
    """
    column_names = set(record_batch.schema.names)
    if not adapter and set(input_specs.keys()) <= column_names:
        # Create adapter based on input_specs
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                arrow_schema=record_batch.schema,
                tensor_representations=input_specs_to_tensor_representations(
                    input_specs)))
    if not adapter:
        return None

    # Avoid getting the tensors if we appear to be feeding serialized
    # examples to the callable.
    if len(input_specs) == 1:
        (only_name, only_spec), = input_specs.items()
        if (only_spec.dtype == tf.string
                and find_input_name_in_features(
                    set(adapter.TypeSpecs().keys()), only_name) is None):
            return None

    # TODO(b/172376802): Update to pass input specs to ToBatchTensors.
    batch_tensors = adapter.ToBatchTensors(record_batch)
    return filter_by_input_names(batch_tensors, list(input_specs.keys()))
Exemple #2
0
  def benchmarkRunMetagraphDoFnAtTFLevel(self):
    """Benchmarks the TF-level feed/fetch path of TFT's TF1 implementation.

    Measures the wall time spent in the parts of RunMetaGraphDoFn that feed
    batches to, and fetch results from, the TFT SavedModel.

    The setup and the per-batch loop deliberately duplicate TFT code: the
    low-level internals being measured are not exposed by TFT for direct use.
    """
    common_vars = _get_common_variables(self._dataset)
    session_config = tft_beam_impl._FIXED_PARALLELISM_TF_CONFIG  # pylint: disable=protected-access

    # This block copied from _GraphStateCompatV1.__init__
    with tf.compat.v1.Graph().as_default() as tf_graph:
      sess = tf.compat.v1.Session(graph=tf_graph, config=session_config)
      with sess.as_default():
        model_inputs, model_outputs = (
            saved_transform_io.partially_apply_saved_transform_internal(
                self._dataset.tft_saved_model_path(force_tf_compat_v1=True),
                {}))
        for make_initializer in (tf.compat.v1.global_variables_initializer,
                                 tf.compat.v1.tables_initializer):
          sess.run(make_initializer())
        tf_graph.finalize()
      # We ignore the schema, and assume there are no excluded outputs.
      output_keys = sorted(set(model_outputs.keys()))
      fetches = [model_outputs[key] for key in output_keys]
      dependent_inputs = graph_tools.get_dependent_inputs(
          tf_graph, model_inputs, fetches)
      input_keys = sorted(dependent_inputs.keys())
      run_transform = sess.make_callable(
          fetches, feed_list=[model_inputs[key] for key in input_keys])

    batch_size, batched_records = _get_batched_records(self._dataset,
                                                       self._max_num_examples())

    adapter = tensor_adapter.TensorAdapter(
        common_vars.tfxio.TensorAdapterConfig())

    # This block copied from _RunMetaGraphDoFn._handle_batch
    started = time.time()
    for batch in batched_records:
      tensors_by_name = adapter.ToBatchTensors(
          batch, produce_eager_tensors=False)
      fetched = run_transform(*[tensors_by_name[name] for name in input_keys])
      _ = dict(zip(output_keys, fetched))
    elapsed = time.time() - started

    extras = {
        "batch_size":
            batch_size,
        "num_examples":
            self._dataset.num_examples(limit=self._max_num_examples()),
    }
    self.report_benchmark(iters=1, wall_time=elapsed, extras=extras)
    def testOriginalTypeSpecs(self):
        """Checks OriginalTypeSpecs() with and without explicit specs.

        Without `original_type_specs` the original specs equal TypeSpecs().
        With them, OriginalTypeSpecs() reports the supplied specs, and a
        supplied spec that mismatches the one for "column1" is rejected with a
        ValueError.
        """
        arrow_schema = pa.schema([pa.field("column1", pa.list_(pa.int32()))])
        column1_representation = text_format.Parse(
            """
                dense_tensor {
                  column_name: "column1"
                  shape {
                    dim {
                      size: 1
                    }
                  }
                }""", schema_pb2.TensorRepresentation())
        tensor_representations = {"column1": column1_representation}

        # No original specs given: they default to the adapter's own specs.
        default_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema, tensor_representations)
        adapter = tensor_adapter.TensorAdapter(default_config)
        self.assertLen(adapter.TypeSpecs(), 1)
        self.assertEqual(adapter.TypeSpecs(), adapter.OriginalTypeSpecs())

        # Original specs with an extra column are accepted.
        superset_specs = {
            "column1": tf.TensorSpec(dtype=tf.int32, shape=[None, 1]),
            "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
        }
        superset_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema,
            tensor_representations,
            original_type_specs=superset_specs)
        adapter = tensor_adapter.TensorAdapter(superset_config)
        self.assertLen(adapter.TypeSpecs(), 1)
        self.assertLen(adapter.OriginalTypeSpecs(), 2)

        with self.assertRaisesRegex(ValueError,
                                    "original_type_specs must be a superset"):
            tensor_adapter.TensorAdapter(
                tensor_adapter.TensorAdapterConfig(
                    arrow_schema,
                    tensor_representations,
                    original_type_specs={
                        # mismatch spec of column1
                        "column1": tf.TensorSpec(dtype=tf.int64,
                                                 shape=[None, 1]),
                        "column2": tf.TensorSpec(dtype=tf.int32,
                                                 shape=[None, 1])
                    }))
 def testRaiseOnInvalidSparseTensorRepresentation(
         self, tensor_representation_textpb, arrow_schema):
     """Checks that an adapter rejects the given sparse representation.

     Args:
       tensor_representation_textpb: Text-format TensorRepresentation proto.
       arrow_schema: Mapping from column name to arrow type, used to build the
         arrow schema.
     """
     parsed_representation = schema_pb2.TensorRepresentation()
     text_format.Parse(tensor_representation_textpb, parsed_representation)
     with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
         schema = pa.schema([
             pa.field(name, arrow_type)
             for name, arrow_type in arrow_schema.items()
         ])
         config = tensor_adapter.TensorAdapterConfig(
             schema, {"tensor": parsed_representation})
         tensor_adapter.TensorAdapter(config)
Exemple #5
0
    def benchmarkTF2RunMetagraphDoFnAtTFLevel(self):
        """Benchmarks the TF-level feed/fetch path of TFT's TF2 implementation.

        Measures the wall time spent in the parts of RunMetaGraphDoFn that
        feed batches to, and fetch results from, the TFT SavedModel.

        The setup and the per-batch loop deliberately duplicate TFT code: the
        low-level internals being measured are not exposed by TFT for direct
        use.
        """
        adapter_config = _get_common_variables(
            self._dataset,
            force_tf_compat_v1=False).tfxio.TensorAdapterConfig()

        # This block copied from _GraphStateV2.__init__
        loader = saved_transform_io_v2.SavedModelLoader(
            self._dataset.tft_saved_model_path(force_tf_compat_v1=False))
        apply_transform = loader.apply_transform_model
        # We ignore the schema, and assume there are no excluded outputs.
        output_keys = set(loader.structured_outputs.keys())
        loader.finalize(adapter_config.tensor_representations.keys(),
                        output_keys)

        batch_size, batched_records = _get_batched_records(
            self._dataset,
            force_tf_compat_v1=False,
            max_num_examples=self._max_num_examples())

        adapter = tensor_adapter.TensorAdapter(adapter_config)

        # This block copied from _RunMetaGraphDoFn._handle_batch
        started = time.time()
        for batch in batched_records:
            _ = apply_transform(
                adapter.ToBatchTensors(batch, produce_eager_tensors=True))
        elapsed = time.time() - started

        extras = {
            "batch_size":
            batch_size,
            "num_examples":
            self._dataset.num_examples(limit=self._max_num_examples()),
        }
        self.report_benchmark(iters=1, wall_time=elapsed, extras=extras)
 def testRaiseOnInvalidDefaultValue(self, value_type, default_value_pbtxt,
                                    exception_regexp):
     """Checks that an invalid dense default value is rejected.

     Args:
       value_type: Arrow value type of the list column named "column".
       default_value_pbtxt: Text-format TensorRepresentation.DefaultValue.
       exception_regexp: Regexp the raised ValueError must match.
     """
     default_value = text_format.Parse(
         default_value_pbtxt,
         schema_pb2.TensorRepresentation.DefaultValue())
     representation = schema_pb2.TensorRepresentation()
     text_format.Parse(
         """
               dense_tensor {
                 column_name: "column"
                 shape {}
               }""", representation)
     representation.dense_tensor.default_value.CopyFrom(default_value)
     with self.assertRaisesRegex(ValueError, exception_regexp):
         schema = pa.schema([pa.field("column", pa.list_(value_type))])
         tensor_adapter.TensorAdapter(
             tensor_adapter.TensorAdapterConfig(
                 schema, {"tensor": representation}))
    def test2DSparseTensor(self):
        """Converts a value column plus two index columns to a sparse tensor.

        The expected result has dense_shape [5, 10, 20]: five rows in the
        batch, followed by the two dimensions declared in the representation.
        """
        sparse_representation = text_format.Parse(
            """
        sparse_tensor {
          value_column_name: "values"
          index_column_names: ["d0", "d1"]
          dense_shape {
            dim {
              size: 10
            }
            dim {
              size: 20
            }
          }
        }
        """, schema_pb2.TensorRepresentation())
        values_column = pa.array([[1], None, [2], [3, 4, 5], []],
                                 type=pa.list_(pa.int64()))
        # Also test that the index column can be of an integral type other
        # than int64.
        d0_column = pa.array([[9], None, [9], [7, 8, 9], []],
                             type=pa.list_(pa.uint32()))
        d1_column = pa.array([[0], None, [0], [0, 1, 2], []],
                             type=pa.list_(pa.int64()))
        record_batch = pa.RecordBatch.from_arrays(
            [values_column, d0_column, d1_column], ["values", "d0", "d1"])

        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                record_batch.schema, {"output": sparse_representation}))
        tensors = adapter.ToBatchTensors(record_batch)
        self.assertLen(tensors, 1)
        self.assertIn("output", tensors)
        sparse_output = tensors["output"]
        self.assertIsInstance(
            sparse_output, (tf.SparseTensor, tf.compat.v1.SparseTensorValue))

        expected_sparse = tf.compat.v1.SparseTensorValue(
            dense_shape=[5, 10, 20],
            indices=[[0, 9, 0], [2, 9, 0], [3, 7, 0], [3, 8, 1], [3, 9, 2]],
            values=tf.convert_to_tensor([1, 2, 3, 4, 5], dtype=tf.int64))
        self.assertSparseAllEqual(expected_sparse, sparse_output)

        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
 def setup(self):
     """Creates the tensor adapter and keeps only models named in ModelSpecs.

     Raises:
       ValueError: If the model looked up for a spec is not among the loaded
         models.
     """
     super(ModelSignaturesDoFn, self).setup()
     adapter_config = self._tensor_adapter_config
     if adapter_config is not None:
         self._tensor_adapter = tensor_adapter.TensorAdapter(adapter_config)

     # Verify and filter models to only those used in ModelSpecs.
     model_specs = self._eval_config.model_specs
     # To maintain consistency between settings where single models are used,
     # always use '' as the model name regardless of whether a name is passed.
     use_spec_names = len(model_specs) > 1
     models_in_use = {}
     for spec in model_specs:
         model_name = spec.name if use_spec_names else ''
         if model_name not in self._loaded_models:
             raise ValueError(
                 'loaded model for "{}" not found: eval_config={}'.format(
                     spec.name, self._eval_config))
         models_in_use[model_name] = self._loaded_models[model_name]
     self._loaded_models = models_in_use
 def testRaiseOnNoMatchingHandler(self):
     """Checks that a dense tensor over a nested-list column is rejected."""
     with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
         # nested lists are not supported now.
         nested_list_type = pa.list_(pa.list_(pa.int64()))
         schema = pa.schema(
             [pa.field("unsupported_column", nested_list_type)])
         representation = text_format.Parse(
             """
               dense_tensor {
                 column_name: "unsupported_column"
                 shape: {}
               }
               """, schema_pb2.TensorRepresentation())
         tensor_adapter.TensorAdapter(
             tensor_adapter.TensorAdapterConfig(
                 schema, {"tensor": representation}))
 def test_relaxed_varlen_sparse_tensor(self):
     """Shows the converter/adapter round trip can tighten a dense_shape.

     Demonstrates that TensorAdapter(TensorsToRecordBatchConverter()) is not
     an identity if the second dense dimension of SparseTensor is not tight:
     the input has dense_shape [4, 2] but only uses column 0, and the round
     trip yields the same values and indices with dense_shape [4, 1].
     """
     # Compare the major version numerically. Comparing version strings
     # lexicographically would order e.g. "10.0" before "2".
     is_tf2 = int(tf.__version__.split(".")[0]) >= 2

     type_specs = {"sp": tf.SparseTensorSpec([None, None], tf.int32)}
     sp = tf.compat.v1.SparseTensorValue(values=np.array([1, 2], np.int32),
                                         indices=[[0, 0], [2, 0]],
                                         dense_shape=[4, 2])
     if is_tf2:
         sp = tf.SparseTensor.from_value(sp)
     converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)
     rb = converter.convert({"sp": sp})
     adapter = tensor_adapter.TensorAdapter(
         tensor_adapter.TensorAdapterConfig(
             arrow_schema=converter.arrow_schema(),
             tensor_representations=converter.tensor_representations()))
     adapter_output = adapter.ToBatchTensors(
         rb, produce_eager_tensors=is_tf2)
     self.assertAllEqual(sp.values, adapter_output["sp"].values)
     self.assertAllEqual(sp.indices, adapter_output["sp"].indices)
     self.assertAllEqual(adapter_output["sp"].dense_shape, [4, 1])
 def testRaggedTensorStructTypeNonLeaf(self):
     """Checks that a ragged feature path ending at a struct column is rejected."""
     ragged_representation = text_format.Parse(
         """
     ragged_tensor {
       feature_path {
         step: "ragged_feature"
       }
     }
     """, schema_pb2.TensorRepresentation())
     struct_column = pa.StructArray.from_arrays([
         pa.array([[1, 2, 3]], pa.list_(pa.int64())),
         pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
     ], ["inner_feature", "x2"])
     record_batch = pa.RecordBatch.from_arrays([struct_column],
                                               ["ragged_feature"])
     with self.assertRaisesRegex(ValueError,
                                 ".*Unable to handle tensor output.*"):
         config = tensor_adapter.TensorAdapterConfig(
             record_batch.schema, {"output": ragged_representation})
         tensor_adapter.TensorAdapter(config)
   def testRaiseOnRequestingEagerTensorsInGraphMode(self):
       tensor_representation = text_format.Parse(
           """
 sparse_tensor {
   index_column_names: ["key"]
   value_column_name: "value"
   dense_shape {
     dim {
       size: 100
     }
   }
 }
 """, schema_pb2.TensorRepresentation())
       record_batch = pa.RecordBatch.from_arrays(
           [pa.array([[1]]), pa.array([[2]])], ["key", "value"])
       adapter = tensor_adapter.TensorAdapter(
           tensor_adapter.TensorAdapterConfig(
               record_batch.schema, {"output": tensor_representation}))
       with self.assertRaisesRegex(RuntimeError,
                                   "eager mode was not enabled"):
           adapter.ToBatchTensors(record_batch, produce_eager_tensors=True)
    def testRaggedTensor(self, tensor_representation_textpb, record_batch,
                         expected_type_spec, expected_ragged_tensor):
        """Converts a record batch to a ragged tensor and checks the result.

        Args:
          tensor_representation_textpb: Text-format TensorRepresentation proto.
          record_batch: Record batch to convert.
          expected_type_spec: Type spec the output must be compatible with
            (only checked when executing eagerly).
          expected_ragged_tensor: Expected value of the "output" tensor.
        """
        representation = schema_pb2.TensorRepresentation()
        text_format.Parse(tensor_representation_textpb, representation)
        config = tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": representation})
        adapter = tensor_adapter.TensorAdapter(config)

        tensors = adapter.ToBatchTensors(record_batch)
        self.assertLen(tensors, 1)
        self.assertIn("output", tensors)
        ragged_output = tensors["output"]
        ragged_types = (tf.RaggedTensor,
                        tf.compat.v1.ragged.RaggedTensorValue)
        self.assertIsInstance(ragged_output, ragged_types)
        if tf.executing_eagerly():
            self.assertTrue(
                expected_type_spec.is_compatible_with(ragged_output),
                "{} is not compatible with spec {}".format(
                    ragged_output, expected_type_spec))

        self.assertRaggedAllEqual(ragged_output, expected_ragged_tensor)
        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
 def testRaggedTensorSlicedRecordBatch(self):
     """Checks that converting a sliced record batch to ragged raises."""
     ragged_representation = text_format.Parse(
         """
     ragged_tensor {
       feature_path {
         step: "ragged_feature"
       }
     }
     """, schema_pb2.TensorRepresentation())
     ragged_column = pa.array([[1], None, [2], [3, 4, 5], []],
                              type=pa.list_(pa.int64()))
     full_batch = pa.RecordBatch.from_arrays([ragged_column],
                                             ["ragged_feature"])
     sliced_batch = full_batch.slice(1, 3)
     config = tensor_adapter.TensorAdapterConfig(
         sliced_batch.schema, {"output": ragged_representation})
     adapter = tensor_adapter.TensorAdapter(config)
     expected_message = (
         ".*We currently do not handle converting slices to RaggedTensors.")
     with self.assertRaisesRegex(ValueError, expected_message):
         adapter.ToBatchTensors(sliced_batch)
        def convert_and_check(tensors, test_values_conversion):
            """Converts `tensors` to a record batch and checks every artifact.

            Checks the converter's arrow schema, its tensor representations
            and the converted columns against the expectations captured from
            the enclosing scope, then round-trips the record batch through a
            TensorAdapter.

            Args:
              tensors: Tensors to convert, keyed by name.
              test_values_conversion: The adapter is asked for eager tensors
                only when this is false.
            """
            converter = tensor_to_arrow.TensorsToRecordBatchConverter(
                type_specs, options)

            actual_types_by_name = {
                field.name: field.type
                for field in converter.arrow_schema()
            }
            self.assertEqual(actual_types_by_name, expected_schema,
                             "actual: {}".format(converter.arrow_schema()))

            # Accept expectations given either as protos or as text protos.
            canonical_expected_tensor_representations = {
                name: (representation if isinstance(
                    representation, schema_pb2.TensorRepresentation) else
                       text_format.Parse(representation,
                                         schema_pb2.TensorRepresentation()))
                for name, representation in
                expected_tensor_representations.items()
            }
            self.assertEqual(canonical_expected_tensor_representations,
                             converter.tensor_representations())

            rb = converter.convert(tensors)
            self.assertLen(expected_record_batch, rb.num_columns)
            for field, column in zip(rb.schema, rb):
                expected_column = expected_record_batch[field.name]
                self.assertTrue(
                    column.equals(expected_column),
                    "{}: actual: {}, expected: {}".format(
                        field.name, column, expected_column))

            # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
            round_trip_config = tensor_adapter.TensorAdapterConfig(
                arrow_schema=converter.arrow_schema(),
                tensor_representations=converter.tensor_representations())
            adapter = tensor_adapter.TensorAdapter(round_trip_config)
            adapter_output = adapter.ToBatchTensors(
                rb, produce_eager_tensors=not test_values_conversion)
            self.assertEqual(adapter_output.keys(), tensors.keys())
            for name in adapter_output.keys():
                if "value" in name:
                    continue
                self._assert_tensor_alike_equal(adapter_output[name],
                                                tensors[name])
    def testOneTensorFromOneColumn(self, tensor_representation_textpb,
                                   arrow_array, expected_type_spec,
                                   expected_output):
        """Converts a single-column record batch to one tensor and checks it.

        Args:
          tensor_representation_textpb: Text-format TensorRepresentation proto.
          arrow_array: The only column of the record batch.
          expected_type_spec: Expected type spec of the "output" tensor.
          expected_output: Expected value of the "output" tensor.
        """
        representation = schema_pb2.TensorRepresentation()
        text_format.Parse(tensor_representation_textpb, representation)

        # The column takes its name from whichever representation kind is set.
        column_name = None
        for kind in ("dense_tensor", "varlen_sparse_tensor"):
            if representation.HasField(kind):
                column_name = getattr(representation, kind).column_name

        record_batch = pa.RecordBatch.from_arrays([arrow_array], [column_name])
        config = tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": representation})
        adapter = tensor_adapter.TensorAdapter(config)
        self.assertEqual(expected_type_spec, adapter.TypeSpecs()["output"])

        tensors = adapter.ToBatchTensors(record_batch)
        self.assertLen(tensors, 1)
        self.assertIn("output", tensors)
        output = tensors["output"]
        if tf.executing_eagerly():
            self.assertTrue(
                expected_type_spec.is_compatible_with(output),
                "{} is not compatible with spec {}".format(
                    output, expected_type_spec))

        sparse_types = (tf.SparseTensor, tf.compat.v1.SparseTensorValue)
        if not isinstance(expected_output, sparse_types):
            self.assertAllEqual(expected_output, output)
        else:
            self.assertIsInstance(output, sparse_types)
            self.assertSparseAllEqual(expected_output, output)

        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
Exemple #17
0
    def test_convert(self, type_specs, expected_schema,
                     expected_tensor_representations, tensor_input,
                     expected_record_batch):
        """Checks the converter's schema, representations and record batch.

        Args:
          type_specs: Type specs used to construct the converter.
          expected_schema: Expected arrow types keyed by column name.
          expected_tensor_representations: Expected representations keyed by
            name, as TensorRepresentation protos or text protos.
          tensor_input: Tensors to convert, keyed by name.
          expected_record_batch: Expected arrays keyed by column name.
        """
        converter = tensor_to_arrow.TensorsToRecordBatchConverter(type_specs)

        expected_arrow_schema = pa.schema([
            pa.field(name, arrow_type)
            for name, arrow_type in sorted(expected_schema.items())
        ])
        self.assertTrue(
            converter.arrow_schema().equals(expected_arrow_schema),
            "actual: {}".format(converter.arrow_schema()))

        # Accept expectations given either as protos or as text protos.
        canonical_expected_tensor_representations = {
            name: (representation if isinstance(
                representation, schema_pb2.TensorRepresentation) else
                   text_format.Parse(representation,
                                     schema_pb2.TensorRepresentation()))
            for name, representation in
            expected_tensor_representations.items()
        }
        self.assertEqual(canonical_expected_tensor_representations,
                         converter.tensor_representations())

        rb = converter.convert(tensor_input)
        expected_rb = pa.record_batch(
            [arr for _, arr in sorted(expected_record_batch.items())],
            schema=expected_arrow_schema)
        self.assertTrue(rb.equals(expected_rb))

        # Test that TensorAdapter(TensorsToRecordBatchConverter()) is identity.
        round_trip_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=converter.arrow_schema(),
            tensor_representations=converter.tensor_representations())
        adapter = tensor_adapter.TensorAdapter(round_trip_config)
        adapter_output = adapter.ToBatchTensors(rb, produce_eager_tensors=True)
        self.assertEqual(adapter_output.keys(), tensor_input.keys())
        for name in adapter_output.keys():
            self._assert_tensor_alike_equal(adapter_output[name],
                                            tensor_input[name])
    def testMultipleColumns(self):
        """Converts a four-column record batch into five tensors.

        Covers varlen sparse and dense representations over int64, float32
        and bytes columns, including a dense tensor that reuses the
        "bytes_ragged" column with a default value of "kk".
        """

        def parse_representation(textpb):
            return text_format.Parse(textpb,
                                     schema_pb2.TensorRepresentation())

        columns = [
            ("int64_ragged",
             pa.array([[1], [], [2, 3], None],
                      type=pa.large_list(pa.int64()))),
            ("float_dense",
             pa.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]],
                      type=pa.list_(pa.float32()))),
            ("bytes_ragged",
             pa.array([None, [b"a", b"b"], [b"c", b"d"], None],
                      type=pa.list_(pa.large_binary()))),
            ("bytes_dense",
             pa.array([[b"w"], [b"x"], [b"y"], [b"z"]],
                      type=pa.list_(pa.string()))),
        ]
        record_batch = pa.RecordBatch.from_arrays(
            [array for _, array in columns], [name for name, _ in columns])

        tensor_representations = {
            "int64_varlen_sparse":
            parse_representation("""
        varlen_sparse_tensor {
          column_name: "int64_ragged"
        }
        """),
            "float_dense":
            parse_representation("""
        dense_tensor {
          column_name: "float_dense"
          shape {
            dim {
              size: 2
            }
            dim {
              size: 1
            }
          }
        }"""),
            "bytes_varlen_sparse":
            parse_representation("""
        varlen_sparse_tensor {
          column_name: "bytes_ragged"
        }
        """),
            "bytes_dense":
            parse_representation("""
        dense_tensor {
          column_name: "bytes_dense"
          shape {
          }
        }
        """),
            "bytes_default_filled_dense":
            parse_representation("""
        dense_tensor {
          column_name: "bytes_ragged"
          shape {
            dim {
              size: 2
            }
          }
          default_value {
            bytes_value: "kk"
          }
        }
        """),
        }

        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                               tensor_representations))
        type_specs = adapter.TypeSpecs()
        expected_type_specs = {
            "int64_varlen_sparse":
            tf.SparseTensorSpec(shape=[None, None], dtype=tf.int64),
            "bytes_varlen_sparse":
            tf.SparseTensorSpec(shape=[None, None], dtype=tf.string),
            "float_dense":
            tf.TensorSpec(shape=[None, 2, 1], dtype=tf.float32),
            "bytes_dense":
            tf.TensorSpec(shape=[None], dtype=tf.string),
            "bytes_default_filled_dense":
            tf.TensorSpec(shape=[None, 2], dtype=tf.string),
        }
        self.assertEqual(type_specs, expected_type_specs)

        tensors = adapter.ToBatchTensors(record_batch)
        self.assertLen(tensors, len(type_specs))

        expected_int64_sparse = tf.SparseTensor(
            values=tf.constant([1, 2, 3], dtype=tf.int64),
            dense_shape=tf.constant([4, 2], dtype=tf.int64),
            indices=tf.constant([[0, 0], [2, 0], [2, 1]], dtype=tf.int64))
        self.assertSparseAllEqual(expected_int64_sparse,
                                  tensors["int64_varlen_sparse"])

        expected_bytes_sparse = tf.SparseTensor(
            values=tf.constant([b"a", b"b", b"c", b"d"]),
            dense_shape=tf.constant([4, 2], dtype=tf.int64),
            indices=tf.constant([[1, 0], [1, 1], [2, 0], [2, 1]],
                                dtype=tf.int64))
        self.assertSparseAllEqual(expected_bytes_sparse,
                                  tensors["bytes_varlen_sparse"])

        expected_float_dense = tf.constant(
            [[[1.0], [2.0]], [[2.0], [3.0]], [[3.0], [4.0]], [[4.0], [5.0]]],
            dtype=tf.float32)
        self.assertAllEqual(expected_float_dense, tensors["float_dense"])

        self.assertAllEqual(tf.constant([b"w", b"x", b"y", b"z"]),
                            tensors["bytes_dense"])

        # Rows 0 and 3 of "bytes_ragged" are null and expected as the default.
        expected_default_filled = tf.constant([[b"kk", b"kk"], [b"a", b"b"],
                                               [b"c", b"d"], [b"kk", b"kk"]])
        self.assertAllEqual(expected_default_filled,
                            tensors["bytes_default_filled_dense"])

        if tf.executing_eagerly():
            for name, spec in type_specs.items():
                self.assertTrue(
                    spec.is_compatible_with(tensors[name]),
                    "{} is not compatible with spec {}".format(
                        tensors[name], spec))

        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
Exemple #19
0
def record_batch_to_tensor_values(
    record_batch: pa.RecordBatch,
    tensor_representations: Optional[Mapping[
        str, schema_pb2.TensorRepresentation]] = None
) -> types.TensorValueMaybeMultiLevelDict:
    """Converts the columns of a record batch into a dict of tensor values.

    Columns that have an entry in `tensor_representations` use that
    representation. For every other column a representation is inferred:

    * if the column's row lengths are not all identical, it is treated as a
      varlen sparse tensor;
    * otherwise, if every row is valid, it is treated as a dense tensor whose
      shape is derived from the column's nested lengths;
    * otherwise (some rows are missing) the column bypasses the tensor
      adapter and is returned as a numpy array.

    Args:
      record_batch: Record batch to extract features from.
      tensor_representations: Optional tensor representations keyed by column
        name. Columns without an entry get an inferred representation.

    Returns:
      Dict of feature values keyed by name.

    Raises:
      ValueError: If converting the record batch through the tensor adapter
        fails, including when a sparse value is produced for a representation
        kind other than sparse or varlen sparse.
    """
    explicit_representations = (
        {} if tensor_representations is None else tensor_representations)

    def _shape(value: Any) -> List[int]:
        """Returns the nested lengths of value, following first elements."""
        if not hasattr(value, '__len__'):
            return []
        if not value:
            return [len(value)]
        return [len(value)] + _shape(value[0])

    def _varlen_sparse_representation(name):
        """Builds a varlen sparse representation for the named column."""
        representation = schema_pb2.TensorRepresentation()
        representation.varlen_sparse_tensor.column_name = name
        return representation

    def _dense_representation(name, column):
        """Builds a dense representation shaped after the given column."""
        representation = schema_pb2.TensorRepresentation()
        representation.dense_tensor.column_name = name
        dims = _shape(column)
        # Convert dims of the form (..., n, 1) to (..., n).
        if len(dims) > 1 and dims[-1] == 1:
            dims = dims[:-1]
        # dims[0] is the batch dimension and is not part of the fixed shape.
        for size in dims[1:]:
            representation.dense_tensor.shape.dim.append(
                schema_pb2.FixedShape.Dim(size=size))
        return representation

    features = {}
    resolved_representations = {}
    for index, field in enumerate(record_batch.schema):
        name = field.name
        if name in explicit_representations:
            resolved_representations[name] = explicit_representations[name]
            continue
        distinct_lengths = record_batch.column(index).value_lengths().unique()
        if len(distinct_lengths) != 1:
            # Assume VarLenSparseTensor
            resolved_representations[name] = (
                _varlen_sparse_representation(name))
        elif np.all(record_batch[index].is_valid()):
            resolved_representations[name] = _dense_representation(
                name, record_batch[index])
        else:
            # Features that are missing some values can't be parsed using a
            # default tensor representation. Convert to numpy arrays
            # containing None values.
            features[name] = record_batch[index].to_numpy(
                zero_copy_only=False)

    if not resolved_representations:
        return features

    adapter = tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            arrow_schema=record_batch.schema,
            tensor_representations=resolved_representations))
    try:
        batch_tensors = adapter.ToBatchTensors(record_batch,
                                               produce_eager_tensors=False)
        for key, value in batch_tensors.items():
            if isinstance(value, tf.compat.v1.ragged.RaggedTensorValue):
                features[key] = to_ragged_tensor_value(value)
                continue
            if not isinstance(value, tf.compat.v1.SparseTensorValue):
                features[key] = value
                continue
            # Sparse values are converted according to the representation
            # kind that produced them.
            kind = resolved_representations[key].WhichOneof('kind')
            if kind == 'sparse_tensor':
                features[key] = to_sparse_tensor_value(value)
            elif kind == 'varlen_sparse_tensor':
                features[key] = to_varlen_sparse_tensor_value(value)
            else:
                raise ValueError(
                    f'Unexpected tensor representation kind ({kind}) '
                    f'for tensor of type: {type(value)}')
    except Exception as error:
        raise ValueError(error, resolved_representations,
                         record_batch) from error
    return features
    def _batch_reducible_process(
            self, batched_extract: types.Extracts) -> List[types.Extracts]:
        """Runs the configured model signatures over one batch of extracts.

        For each loaded model and each extracts key in
        `self._signature_names`, the signatures listed for that model (or
        `self._default_signature_names` when none are listed) are called. A
        signature is fed tensors built from the Arrow record batch when its
        input specs allow it, and the serialized examples otherwise. Outputs
        are split per row and merged into
        `result[extracts_key][row][model_name]`, then unwrapped according to
        `self._prefer_dict_outputs` and the number of model specs.

        Args:
          batched_extract: Batched extracts. Must contain the record batch
            under `constants.ARROW_RECORD_BATCH_KEY` and the serialized
            examples under `constants.INPUT_KEY`.

        Returns:
          A one-element list holding a shallow copy of `batched_extract` with
          the per-row outputs stored under each extracts key.

        Raises:
          ValueError: If a signature that was explicitly configured for a
            model cannot be found on that model.
        """

        def maybe_expand_dims(arr):
            # Values without a (non-empty) shape are promoted to 1-D arrays.
            if not hasattr(arr, 'shape') or not arr.shape:
                return np.expand_dims(arr, axis=0)
            else:
                return arr

        def to_dense(t):
            return tf.sparse.to_dense(t) if isinstance(t,
                                                       tf.SparseTensor) else t

        result = copy.copy(batched_extract)
        record_batch = batched_extract[constants.ARROW_RECORD_BATCH_KEY]
        serialized_examples = batched_extract[constants.INPUT_KEY]
        # Make sure every extracts key has one slot per row to write into.
        for extracts_key in self._signature_names:
            if extracts_key not in result or not result[extracts_key]:
                result[extracts_key] = [None] * record_batch.num_rows
        for model_name, model in self._loaded_models.items():
            for extracts_key, signature_names in self._signature_names.items():
                for signature_name in (signature_names[model_name]
                                       or self._default_signature_names):
                    # Signatures explicitly configured for this model must
                    # exist; default signatures are best effort.
                    required = bool(signature_names[model_name])
                    input_specs = get_input_specs(model, signature_name,
                                                  required) or {}
                    inputs = None
                    # If input_specs exist then try to filter the inputs by the input
                    # names (unlike estimators, keras does not accept unknown inputs).
                    if input_specs:
                        adapter = self._tensor_adapter
                        if (not adapter and set(input_specs.keys()) <= set(
                                record_batch.schema.names)):
                            # Create adapter based on input_specs
                            tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                                arrow_schema=record_batch.schema,
                                tensor_representations=
                                input_specs_to_tensor_representations(
                                    input_specs))
                            adapter = tensor_adapter.TensorAdapter(
                                tensor_adapter_config)
                        # Avoid getting the tensors if we appear to be feeding serialized
                        # examples to the callable.
                        if adapter and not (
                                len(input_specs) == 1
                                and next(iter(input_specs.values())).dtype
                                == tf.string and find_input_name_in_features(
                                    set(adapter.TypeSpecs().keys()),
                                    next(iter(input_specs.keys()))) is None):
                            # TODO(b/172376802): Update to pass input specs to ToBatchTensors.
                            inputs = filter_by_input_names(
                                adapter.ToBatchTensors(record_batch),
                                list(input_specs.keys()))
                    if not inputs:
                        # Assume serialized examples
                        assert serialized_examples is not None, 'Raw examples not found.'
                        inputs = serialized_examples
                        # If a signature name was not provided, default to using the serving
                        # signature since parsing normally will be done outside model.
                        if not signature_name:
                            signature_name = get_default_signature_name(model)
                    signature = get_callable(model, signature_name, required)
                    if signature is None:
                        if not required:
                            continue
                        raise ValueError(
                            'Unable to find %s function needed to update %s' %
                            (signature_name, extracts_key))
                    if isinstance(inputs, dict):
                        if hasattr(signature, 'structured_input_signature'):
                            outputs = signature(**inputs)
                        else:
                            outputs = signature(inputs)
                    else:
                        outputs = signature(
                            tf.constant(inputs, dtype=tf.string))
                    # Densify once per signature call instead of once per row;
                    # the conversion does not depend on the row index.
                    if isinstance(outputs, dict):
                        dense_outputs = {
                            k: to_dense(v)
                            for k, v in outputs.items()
                        }
                        dense_array = None
                    else:
                        dense_outputs = None
                        dense_array = np.asarray(to_dense(outputs))
                    for i in range(record_batch.num_rows):
                        if dense_outputs is not None:
                            output = {
                                k: maybe_expand_dims(v[i].numpy())
                                for k, v in dense_outputs.items()
                            }
                        else:
                            output = {
                                signature_name:
                                maybe_expand_dims(dense_array[i])
                            }
                        if result[extracts_key][i] is None:
                            result[extracts_key][i] = collections.defaultdict(
                                dict)
                        result[extracts_key][i][model_name].update(output)  # pytype: disable=unsupported-operands
        # Unwrap the outputs for every extracts key explicitly, rather than
        # relying on whichever key the loops above happened to visit last
        # (which is undefined when there are no signature names at all).
        for extracts_key in self._signature_names:
            rows = result[extracts_key]
            for i, entry in enumerate(rows):
                # PyType doesn't recognize isinstance(..., dict).
                # pytype: disable=attribute-error,unsupported-operands
                if isinstance(entry, dict):
                    for model_name, output in entry.items():
                        # A single output is stored directly unless dict
                        # outputs are preferred.
                        if not self._prefer_dict_outputs and len(output) == 1:
                            entry[model_name] = list(output.values())[0]
                    # If only one model, the output is stored without using a dict
                    if len(self._eval_config.model_specs) == 1:
                        rows[i] = list(entry.values())[0]
                # pytype: enable=attribute-error,unsupported-operands
        return [result]
Exemple #21
0
    def TensorAdapter(self) -> tensor_adapter.TensorAdapter:
        """Builds a TensorAdapter that converts pa.RecordBatch to TF inputs.

        The adapter is created from the config returned by
        `self.TensorAdapterConfig()`.

        May raise an error if the TFMD schema was not provided at construction
        time.
        """
        adapter_config = self.TensorAdapterConfig()
        return tensor_adapter.TensorAdapter(adapter_config)
 def setup(self):
     """Runs the parent setup, then builds the tensor adapter if configured.

     `self._tensor_adapter` is only assigned when
     `self._tensor_adapter_config` is not None.
     """
     super(_BatchedPredictionDoFn, self).setup()
     adapter_config = self._tensor_adapter_config
     if adapter_config is None:
         return
     self._tensor_adapter = tensor_adapter.TensorAdapter(adapter_config)
 def testRaiseOnUnsupportedTensorRepresentation(self):
     """An empty TensorRepresentation must be rejected with a ValueError."""
     with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
         # Everything is built inside the context so that an error raised at
         # any construction step is checked against the expected message.
         arrow_schema = pa.schema([pa.field("a", pa.list_(pa.int64()))])
         representations = {"tensor": schema_pb2.TensorRepresentation()}
         adapter_config = tensor_adapter.TensorAdapterConfig(
             arrow_schema, representations)
         tensor_adapter.TensorAdapter(adapter_config)