def testImplicitTensorRepresentations(self):
    tfxio = self._MakeTFXIO(_SCHEMA)
    self.assertEqual(
        {
            "int_feature": text_format.Parse(
                """varlen_sparse_tensor { column_name: "int_feature" }""",
                schema_pb2.TensorRepresentation()),
            "float_feature": text_format.Parse(
                """varlen_sparse_tensor { column_name: "float_feature" }""",
                schema_pb2.TensorRepresentation()),
            "string_feature": text_format.Parse(
                """varlen_sparse_tensor { column_name: "string_feature" }""",
                schema_pb2.TensorRepresentation()),
        }, tfxio.TensorRepresentations())

    def _AssertFn(record_batch_list):
      self.assertLen(record_batch_list, 1)
      record_batch = record_batch_list[0]
      self._ValidateRecordBatch(tfxio, record_batch)
      self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
      tensor_adapter = tfxio.TensorAdapter()
      dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
      self.assertLen(dict_of_tensors, 3)
      self.assertIn("int_feature", dict_of_tensors)
      self.assertIn("float_feature", dict_of_tensors)
      self.assertIn("string_feature", dict_of_tensors)

    p = beam.Pipeline()
    record_batch_pcoll = p | tfxio.BeamSource(batch_size=1000)
    beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
    pipeline_result = p.run()
    pipeline_result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, pipeline_result, _TELEMETRY_DESCRIPTORS,
        "tf_example", "tfrecords_gzip")

    def testProjection(self):
        schema = schema_pb2.Schema()
        schema.CopyFrom(_SCHEMA)
        tensor_representations = {
            "dense_string":
            text_format.Parse(
                """dense_tensor {
             column_name: "string_feature"
             shape { dim { size: 2 } }
             default_value { bytes_value: "zzz" }
           }""", schema_pb2.TensorRepresentation()),
            "varlen_string":
            text_format.Parse(
                """varlen_sparse_tensor {
             column_name: "string_feature"
           }""", schema_pb2.TensorRepresentation()),
            "varlen_float":
            text_format.Parse(
                """varlen_sparse_tensor {
             column_name: "float_feature"
           }""", schema_pb2.TensorRepresentation()),
        }
        schema.tensor_representation_group[""].CopyFrom(
            schema_pb2.TensorRepresentationGroup(
                tensor_representation=tensor_representations))

        tfxio = self._MakeTFXIO(schema)
        self.assertEqual(tensor_representations, tfxio.TensorRepresentations())

        projected_tfxio = tfxio.Project(
            ["dense_string", "varlen_string", "varlen_float"])
        self.assertEqual(tensor_representations,
                         projected_tfxio.TensorRepresentations())
        self.assertTrue(projected_tfxio.ArrowSchema().equals(
            pa.schema([
                pa.field("float_feature", pa.list_(pa.float32())),
                pa.field("string_feature", pa.list_(pa.binary())),
            ])))

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(record_batch)
            expected_schema = projected_tfxio.ArrowSchema()
            self.assertTrue(
                record_batch.schema.equals(expected_schema),
                "actual: {}; expected: {}".format(record_batch.schema,
                                                  expected_schema))
            tensor_adapter = projected_tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 3)
            self.assertIn("dense_string", dict_of_tensors)
            self.assertIn("varlen_string", dict_of_tensors)
            self.assertIn("varlen_float", dict_of_tensors)

        with beam.Pipeline() as p:
            # Setting the batch_size to make sure only one batch is generated.
            record_batch_pcoll = p | projected_tfxio.BeamSource(
                batch_size=len(_EXAMPLES))
            beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
    def testProjection(self):
        """Test projecting of a TFXIO."""
        schema = schema_pb2.Schema()
        schema.CopyFrom(_UNORDERED_SCHEMA)
        tensor_representations = {
            "string_tensor":
            schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name="string_feature")),
            "float_tensor":
            schema_pb2.TensorRepresentation(
                sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                    dense_shape=schema_pb2.FixedShape(
                        dim=[schema_pb2.FixedShape.Dim(size=10)]),
                    index_column_names=["int_feature"],
                    value_column_name="float_feature")),
        }
        tensor_representation_util.SetTensorRepresentationsInSchema(
            schema, tensor_representations)

        tfxio = ParquetTFXIO(file_pattern=self._example_file,
                             column_names=_COLUMN_NAMES,
                             schema=schema,
                             telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

        projected_tfxio = tfxio.Project(["float_tensor"])

        # The projected_tfxio has the projected schema
        self.assertTrue(projected_tfxio.ArrowSchema().equals(
            _EXPECTED_PROJECTED_ARROW_SCHEMA))

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(record_batch,
                                      _EXPECTED_PROJECTED_ARROW_SCHEMA)
            expected_schema = projected_tfxio.ArrowSchema()
            self.assertListEqual(
                record_batch.schema.names, expected_schema.names,
                "actual: {}; expected: {}".format(record_batch.schema.names,
                                                  expected_schema.names))
            self.assertListEqual(
                record_batch.schema.types, expected_schema.types,
                "actual: {}; expected: {}".format(record_batch.schema.types,
                                                  expected_schema.types))
            tensor_adapter = projected_tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 1)
            self.assertIn("float_tensor", dict_of_tensors)

        pipeline = beam.Pipeline()
        record_batch_pcoll = (pipeline |
                              projected_tfxio.BeamSource(batch_size=_NUM_ROWS))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = pipeline.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "parquet",
                                            "parquet")
    def testTrackRecordTensorRepresentations(self):
        num_dense_tensors = 3
        num_varlen_sparse_tensors = 2
        num_sparse_tensors = 1
        num_ragged_tensors = 4
        tensor_representations = {}
        for i in range(num_dense_tensors):
            tensor_representations[f"dense{i}"] = (
                schema_pb2.TensorRepresentation(
                    dense_tensor=schema_pb2.TensorRepresentation.DenseTensor())
            )
        for i in range(num_varlen_sparse_tensors):
            tensor_representations[f"varlen{i}"] = (
                schema_pb2.TensorRepresentation(
                    varlen_sparse_tensor=schema_pb2.TensorRepresentation.
                    VarLenSparseTensor()))
        for i in range(num_sparse_tensors):
            tensor_representations[f"sparse{i}"] = (
                schema_pb2.TensorRepresentation(
                    sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                    )))
        for i in range(num_ragged_tensors):
            tensor_representations[f"ragged{i}"] = (
                schema_pb2.TensorRepresentation(
                    ragged_tensor=schema_pb2.TensorRepresentation.RaggedTensor(
                    )))

        expected_counters = {
            "dense_tensor": num_dense_tensors,
            "varlen_sparse_tensor": num_varlen_sparse_tensors,
            "sparse_tensor": num_sparse_tensors,
            "ragged_tensor": num_ragged_tensors,
        }

        with beam.Pipeline(
                **test_helpers.make_test_beam_pipeline_kwargs()) as p:
            _ = (p | beam.Create([tensor_representations])
                 | collection.TrackTensorRepresentations(
                     counter_namespace="TestNamespace"))

        # The context manager has already run the pipeline and waited for it
        # to finish, so reuse its result instead of running it a second time.
        pipeline_result = p.result
        result_metrics = pipeline_result.metrics()
        for kind, expected_count in expected_counters.items():
            actual_counter = result_metrics.query(
                beam.metrics.metric.MetricsFilter().with_name(
                    kind))["counters"]
            self.assertLen(
                actual_counter, 1,
                msg=f"Actual and expected lengths of {kind} counter are "
                "different.")
            self.assertEqual(
                actual_counter[0].committed, expected_count,
                msg=f"Actual and expected values for {kind} counter are "
                "different.")
    def test_simple(self, attach_raw_records):
        raw_record_column_name = "_raw_records" if attach_raw_records else None
        tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            self._input_path,
            self._decoder_path,
            _TELEMETRY_DESCRIPTORS,
            raw_record_column_name=raw_record_column_name)
        expected_fields = [
            pa.field("st1", pa.list_(pa.binary())),
            pa.field("st2", pa.list_(pa.binary())),
        ]
        if attach_raw_records:
            raw_record_column_type = (pa.large_list(pa.large_binary())
                                      if tfxio._can_produce_large_types else
                                      pa.list_(pa.binary()))
            expected_fields.append(
                pa.field(raw_record_column_name, raw_record_column_type))
        self.assertTrue(tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
                        tfxio.ArrowSchema())
        self.assertEqual(
            tfxio.TensorRepresentations(), {
                "st1":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st1" }""",
                    schema_pb2.TensorRepresentation()),
                "st2":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st2" }""",
                    schema_pb2.TensorRepresentation())
            })

        tensor_adapter = tfxio.TensorAdapter()
        self.assertEqual(tensor_adapter.TypeSpecs(),
                         _DecoderForTesting().output_type_specs())

        def _assert_fn(list_of_rb):
            self.assertLen(list_of_rb, 1)
            rb = list_of_rb[0]
            self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
            tensors = tensor_adapter.ToBatchTensors(rb)
            self.assertLen(tensors, 2)
            for tensor_name in ("st1", "st2"):
                self.assertIn(tensor_name, tensors)
                st = tensors[tensor_name]
                self.assertAllEqual(st.values, _RECORDS)
                self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
                self.assertAllEqual(st.dense_shape, [2, 1])

        p = beam.Pipeline()
        rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
        beam_testing_util.assert_that(rb_pcoll, _assert_fn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "tensor",
                                            "tfrecords_gzip")
def _InferTensorRepresentationFromSchema(
        schema: schema_pb2.Schema
) -> Dict[str, schema_pb2.TensorRepresentation]:
    """Translate a Feature proto into a TensorRepresentation proto.

  We apply the following rules:
    1. If the feature has a fixed shape (set through Feature.shape field),
       then the feature must always be present (
       Feature.presence.min_fraction == 1.0), and a DenseTensor representation
       will be produced for it.
    2. Otherwise, a VarLenSparseTensor representation will be produced for it.

  Args:
    schema: a schema_pb2.Schema.

  Returns:
    A Dict mapping tensor names to their TensorRepresentations.

  Raises:
    ValueError: if the feature has a fixed shape but is not always present.
  """
    result = {}
    columns_remaining = {f.name: f for f in schema.feature}

    sparse_tensor_representations, columns_remaining = (
        _InferSparseTensorRepresentationsFromSchema(schema, columns_remaining))
    result.update(sparse_tensor_representations)

    for feature in columns_remaining.values():
        if not _ShouldIncludeFeature(feature):
            continue
        if feature.HasField("shape"):
            if feature.presence.min_fraction != 1:
                raise ValueError(
                    "Feature {} had shape {} set but min_fraction {} != 1.  Use"
                    " value_count not shape field when min_fraction != 1.".
                    format(feature.name, feature.shape,
                           feature.presence.min_fraction))
            logging.info("Feature %s has a shape %s. Setting to DenseTensor.",
                         feature.name, feature.shape)
            result[feature.name] = schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name=feature.name, shape=feature.shape))
        else:
            logging.info(
                "Feature %s has no shape. Setting to VarLenSparseTensor.",
                feature.name)
            result[feature.name] = schema_pb2.TensorRepresentation(
                varlen_sparse_tensor=schema_pb2.TensorRepresentation.
                VarLenSparseTensor(column_name=feature.name))

    return result
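
# A minimal usage sketch of the two inference rules above, via the public
# wrapper used elsewhere in these examples (assumed import:
# `from tfx_bsl.tfxio import tensor_representation_util`). A fixed-shape,
# always-present feature yields a DenseTensor representation; a shapeless
# feature yields a VarLenSparseTensor.
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.tfxio import tensor_representation_util

_sketch_schema = text_format.Parse(
    """
    feature {
      name: "dense_feature"
      type: INT
      shape { dim { size: 2 } }
      presence { min_fraction: 1.0 }
    }
    feature {
      name: "varlen_feature"
      type: BYTES
    }
    """, schema_pb2.Schema())
_inferred = (
    tensor_representation_util.InferTensorRepresentationsFromSchema(
        _sketch_schema))
# Expected, per rules 1 and 2 above:
#   "dense_feature"  -> dense_tensor { column_name: "dense_feature"
#                                      shape { dim { size: 2 } } }
#   "varlen_feature" -> varlen_sparse_tensor { column_name: "varlen_feature" }
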
 def tensor_representation(self) -> schema_pb2.TensorRepresentation:
   result = schema_pb2.TensorRepresentation()
   for d in self._unbatched_shape:
     result.sparse_tensor.dense_shape.dim.add().size = d
   result.sparse_tensor.value_column_name = self._value_column_name
   result.sparse_tensor.index_column_names.extend(self._index_column_names)
   return result
 def testGetSourceValueColumnFromTensorRepresentation(
         self, pbtxt, expected):
     self.assertEqual(
         path.ColumnPath(expected),
         tensor_representation_util.
         GetSourceValueColumnFromTensorRepresentation(
             text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())))
    def testCreateTfExampleParserConfig(self, tensor_representation,
                                        feature_type, tf_example,
                                        expected_feature,
                                        expected_parsed_results):
        tensor_representation = text_format.Parse(
            tensor_representation, schema_pb2.TensorRepresentation())
        feature = tensor_representation_util.CreateTfExampleParserConfig(
            tensor_representation, feature_type)

        # Checks that the parser configs are correct.
        for actual_arg, expected_arg in zip(feature, expected_feature):
            self.assertAllEqual(actual_arg, expected_arg)

        # Checks that the parser configs can be used with tf.io.parse_example()
        actual_tensors = tf.io.parse_single_example(tf_example,
                                                    {'feat': feature})
        actual = actual_tensors['feat']
        if isinstance(actual, tf.SparseTensor) or isinstance(
                actual, tf.compat.v1.SparseTensorValue):
            self.assertAllEqual(actual.values, expected_parsed_results.values)
            self.assertAllEqual(actual.indices,
                                expected_parsed_results.indices)
            self.assertAllEqual(actual.dense_shape,
                                expected_parsed_results.dense_shape)
        else:
            self.assertAllEqual(actual, expected_parsed_results)
 def __setstate__(self, t):
   tensor_representations = {}
   for k, v in t[1].items():
     r = schema_pb2.TensorRepresentation()
     r.ParseFromString(v)
     tensor_representations[k] = r
   self.__init__(t[0], tensor_representations, t[2])
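
 # A matching __getstate__ counterpart, sketched under assumptions: the
 # attribute names below are hypothetical (the real class's fields may
 # differ), but the tuple layout mirrors the __setstate__ above. The point is
 # that TensorRepresentation protos are pickled as serialized bytes
 # (SerializeToString) and re-parsed on load (ParseFromString).
 def __getstate__(self):
   return (
       self._state_element_0,  # hypothetical: whatever t[0] carries
       {k: v.SerializeToString()
        for k, v in self._tensor_representations.items()},
       self._state_element_2,  # hypothetical: whatever t[2] carries
   )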
 def testPickleTensorAdapterConfig(self):
     config = tensor_adapter.TensorAdapterConfig(
         arrow_schema=pa.schema([pa.field("column1",
                                          pa.list_(pa.int32()))]),
         tensor_representations={
             "column1":
             text_format.Parse(
                 """
             dense_tensor {
               column_name: "column1"
               shape {
                 dim {
                   size: 1
                 }
               }
             }""", schema_pb2.TensorRepresentation())
         },
         original_type_specs={
             "column1": tf.TensorSpec(dtype=tf.int32, shape=[None, 1]),
             "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
         })
     unpickled_config = pickle.loads(pickle.dumps(config))
     self.assertEqual(config.arrow_schema, unpickled_config.arrow_schema)
     self.assertEqual(config.tensor_representations,
                      unpickled_config.tensor_representations)
     self.assertEqual(config.original_type_specs,
                      unpickled_config.original_type_specs)
 def testInferTensorRepresentationsFromSchema(
     self,
     ascii_proto,
     expected,
     generate_legacy_feature_spec=False,
     schema_is_mixed=False):
   if not _IS_LEGACY_SCHEMA and generate_legacy_feature_spec:
      raise self.skipTest('This test exercises legacy inference logic, but the '
                         'schema is not legacy schema.')
   schema = text_format.Parse(ascii_proto, schema_pb2.Schema())
   if _IS_LEGACY_SCHEMA:
     schema.generate_legacy_feature_spec = generate_legacy_feature_spec
   expected_protos = {
       k: text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())
       for k, pbtxt in expected.items()
   }
   if not schema_is_mixed:
     self.assertEqual(
         expected_protos,
         tensor_representation_util.InferTensorRepresentationsFromSchema(
             schema))
   self.assertEqual(
       expected_protos,
       tensor_representation_util.InferTensorRepresentationsFromMixedSchema(
           schema))
 def TensorRepresentations(self) -> tensor_adapter.TensorRepresentations:
     return {
         self.raw_record_column_name:
         schema_pb2.TensorRepresentation(
             dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                 column_name=self.raw_record_column_name,
                 shape=schema_pb2.FixedShape(),  # scalar
             ))
     }
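
 # The representation above is equivalent to this textproto sketch; an empty
 # FixedShape denotes a scalar, so each raw record maps to one scalar cell per
 # row of the raw-record column:
 #
 #   dense_tensor {
 #     column_name: "<raw_record_column_name>"
 #     shape {}  # scalar
 #   }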
    def testImplicitTensorRepresentations(self, use_beam_record_csv_tfxio):
        """Tests inferring of tensor representation."""
        tfxio = self._MakeTFXIO(
            _COLUMN_NAMES,
            schema=_SCHEMA,
            make_beam_record_tfxio=use_beam_record_csv_tfxio)
        self.assertEqual(
            {
                "int_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "int_feature"}""",
                    schema_pb2.TensorRepresentation()),
                "float_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "float_feature"}""",
                    schema_pb2.TensorRepresentation()),
                "string_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "string_feature" }""",
                    schema_pb2.TensorRepresentation()),
            }, tfxio.TensorRepresentations())

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(tfxio, record_batch)
            self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
            tensor_adapter = tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 3)
            self.assertIn("int_feature", dict_of_tensors)
            self.assertIn("float_feature", dict_of_tensors)
            self.assertIn("string_feature", dict_of_tensors)

        p = beam.Pipeline()
        record_batch_pcoll = (self._MakePipelineInputs(
            p, use_beam_record_csv_tfxio)
                              | tfxio.BeamSource(batch_size=len(_ROWS)))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "csv",
                                            _EXPECTED_PHYSICAL_FORMAT)
    def testImplicitTensorRepresentations(self):
        """Tests inferring of tensor representation."""
        tfxio = ParquetTFXIO(file_pattern=self._example_file,
                             column_names=_COLUMN_NAMES,
                             schema=_UNORDERED_SCHEMA,
                             telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
        self.assertEqual(
            {
                "int_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "int_feature"}""",
                    schema_pb2.TensorRepresentation()),
                "float_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "float_feature"}""",
                    schema_pb2.TensorRepresentation()),
                "string_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "string_feature" }""",
                    schema_pb2.TensorRepresentation()),
            }, tfxio.TensorRepresentations())

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(record_batch, _EXPECTED_ARROW_SCHEMA)
            self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
            tensor_adapter = tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 3)
            self.assertIn("int_feature", dict_of_tensors)
            self.assertIn("float_feature", dict_of_tensors)
            self.assertIn("string_feature", dict_of_tensors)

        pipeline = beam.Pipeline()
        record_batch_pcoll = (pipeline
                              | tfxio.BeamSource(batch_size=_NUM_ROWS))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = pipeline.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "parquet",
                                            "parquet")
    def tensor_representation(self) -> schema_pb2.TensorRepresentation:
        result = schema_pb2.TensorRepresentation()
        result.ragged_tensor.feature_path.step.append(self._tensor_name)
        row_partition_dtype = (
            schema_pb2.TensorRepresentation.RowPartitionDType.INT32
            if self._row_partition_dtype == tf.int32 else
            schema_pb2.TensorRepresentation.RowPartitionDType.INT64)
        result.ragged_tensor.row_partition_dtype = row_partition_dtype

        return result
 def testRaiseOnInvalidSparseTensorRepresentation(
         self, tensor_representation_textpb, arrow_schema):
     tensor_representation = text_format.Parse(
         tensor_representation_textpb, schema_pb2.TensorRepresentation())
     with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
         tensor_adapter.TensorAdapter(
             tensor_adapter.TensorAdapterConfig(
                 pa.schema(
                     [pa.field(k, v) for k, v in arrow_schema.items()]),
                 {"tensor": tensor_representation}))
    def testImplicitTensorRepresentations(self):
        tfxio = self._MakeTFXIO(_SCHEMA)
        self.assertTrue(tfxio.ArrowSchema().equals(
            pa.schema([
                pa.field("int_feature", pa.list_(pa.int64())),
                pa.field("float_feature", pa.list_(pa.float32())),
                pa.field("string_feature", pa.list_(pa.binary())),
            ])))
        self.assertEqual(
            {
                "int_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "int_feature" }""",
                    schema_pb2.TensorRepresentation()),
                "float_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "float_feature" }""",
                    schema_pb2.TensorRepresentation()),
                "string_feature":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "string_feature" }""",
                    schema_pb2.TensorRepresentation()),
            }, tfxio.TensorRepresentations())

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(record_batch)
            self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
            tensor_adapter = tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 3)
            self.assertIn("int_feature", dict_of_tensors)
            self.assertIn("float_feature", dict_of_tensors)
            self.assertIn("string_feature", dict_of_tensors)

        with beam.Pipeline() as p:
            # Setting the batch_size to make sure only one batch is generated.
            record_batch_pcoll = p | tfxio.BeamSource(
                batch_size=len(_EXAMPLES))
            beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
 def testCreateTfExampleParserConfigRagged(self):
   feature_type = schema_pb2.INT
   tensor_representation = text_format.Parse(
       """
               ragged_tensor {
                 feature_path {
                   step: "ragged_feature"
                 }
               }""", schema_pb2.TensorRepresentation())
   with self.assertRaisesRegex(NotImplementedError,
                               'TensorRepresentation: .* is not supported.'):
     tensor_representation_util.CreateTfExampleParserConfig(
         tensor_representation, feature_type)
 def TensorRepresentations(self):
     return {  # pylint: disable=g-complex-comprehension
         c: text_format.Parse(
             """
           dense_tensor {
             column_name: "%s"
             shape {
               dim {
                 size: 1
               }
             }
           }""" % c, schema_pb2.TensorRepresentation())
         for c in self._columns
     }
 def testInputSpecsToTensorRepresentations(self):
     tensor_representations = model_util.input_specs_to_tensor_representations(
         {
             'input_1':
             tf.TensorSpec(shape=(None, 2), dtype=tf.int64),
             'input_2':
             tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32),
             'input_3':
             tf.RaggedTensorSpec(shape=(None, None), dtype=tf.float32),
         })
     dense_tensor_representation = text_format.Parse(
         """
     dense_tensor {
       column_name: "input_1"
       shape { dim { size: 2 } }
     }
     """, schema_pb2.TensorRepresentation())
     sparse_tensor_representation = text_format.Parse(
         """
     varlen_sparse_tensor {
       column_name: "input_2"
     }
     """, schema_pb2.TensorRepresentation())
     ragged_tensor_representation = text_format.Parse(
         """
     ragged_tensor {
       feature_path {
         step: "input_3"
       }
     }
     """, schema_pb2.TensorRepresentation())
     self.assertEqual(
         {
             'input_1': dense_tensor_representation,
             'input_2': sparse_tensor_representation,
             'input_3': ragged_tensor_representation
         }, tensor_representations)
def _ragged_tensor_representation_from_feature_spec(
    spec: common_types.RaggedFeature, name: str,
    domains: Dict[str, common_types.DomainType]
) -> Tuple[schema_pb2.Feature, List[schema_pb2.Feature],
           schema_pb2.TensorRepresentation]:
    """Returns representation of a RaggedTensor from a feature spec.

  Args:
    spec: A tf.io.RaggedFeature feature spec.
    name: Feature name.
    domains: A dict whose keys are feature names and values are one of
      schema_pb2.IntDomain, schema_pb2.StringDomain or schema_pb2.FloatDomain.

  Returns:
    A tuple (value_feature, partitions_features, ragged_tensor_rep),
      where value_feature represents RaggedTensor values, partitions_features
      represent row lengths partitions and ragged_tensor_rep - ragged
      TensorRepresentation.

  Raises:
    ValueError: If the feature spec contains partition types different from
      UniformRowLength and RowLengths.
  """
    value_feature = schema_pb2.Feature(name=spec.value_key or name)
    _set_type(name, value_feature, spec.dtype)
    _set_domain(name, value_feature, domains.get(name))

    ragged_tensor = schema_pb2.TensorRepresentation.RaggedTensor(
        feature_path=path_pb2.Path(step=[spec.value_key or name]))

    partitions_features = []
    for partition in spec.partitions:
        if isinstance(partition, tf.io.RaggedFeature.UniformRowLength):  # pytype: disable=attribute-error
            ragged_tensor.partition.append(
                schema_pb2.TensorRepresentation.RaggedTensor.Partition(
                    uniform_row_length=partition.length))
        elif isinstance(partition, tf.io.RaggedFeature.RowLengths):  # pytype: disable=attribute-error
            ragged_tensor.partition.append(
                schema_pb2.TensorRepresentation.RaggedTensor.Partition(
                    row_length=partition.key))
            partitions_features.append(
                schema_pb2.Feature(name=partition.key, type=schema_pb2.INT))
        else:
            raise ValueError(
                'RaggedFeature can only be created with UniformRowLength and '
                'RowLengths partitions.')

    return value_feature, partitions_features, schema_pb2.TensorRepresentation(
        ragged_tensor=ragged_tensor)
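
# A minimal usage sketch for the helper above. Calling the module-private
# helper directly is for illustration only, and the empty `domains` dict is
# assumed to be acceptable to _set_domain. tf.io.RaggedFeature and its
# RowLengths/UniformRowLength partitions are standard TensorFlow APIs.
import tensorflow as tf

_ragged_spec = tf.io.RaggedFeature(
    dtype=tf.int64,
    value_key="values",
    partitions=[
        tf.io.RaggedFeature.RowLengths("row_lengths"),
        tf.io.RaggedFeature.UniformRowLength(3),
    ])
_value_feature, _partitions_features, _ragged_rep = (
    _ragged_tensor_representation_from_feature_spec(
        _ragged_spec, "my_ragged", domains={}))
# _value_feature is named "values" (spec.value_key); _partitions_features
# holds one INT feature named "row_lengths"; _ragged_rep.ragged_tensor has a
# row_length partition followed by a uniform_row_length partition.
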
 def testCreateTfExampleParserConfigRagged(self):
   feature_type = schema_pb2.INT
   tensor_representation = text_format.Parse(
       """
               ragged_tensor {
                 feature_path {
                   step: "foo"
                   step: "ragged_feature"
                 }
               }""", schema_pb2.TensorRepresentation())
   with self.assertRaisesRegex(
       ValueError, ('Parsing spec from a RaggedTensor with multiple steps in '
                    'feature_path is not implemented.')):
     tensor_representation_util.CreateTfExampleParserConfig(
         tensor_representation, feature_type)
 def tensor_representation(self) -> schema_pb2.TensorRepresentation:
     result = schema_pb2.TensorRepresentation()
     result.ragged_tensor.feature_path.step.append(self._tensor_name)
     row_partition_dtype = (
         schema_pb2.TensorRepresentation.RowPartitionDType.INT32
         if self._row_partition_dtype == tf.int32 else
         schema_pb2.TensorRepresentation.RowPartitionDType.INT64)
     result.ragged_tensor.row_partition_dtype = row_partition_dtype
     for dim in self._unbatched_shape:
         # Create uniform_row_length partitions only.
         if dim is not None:
             result.ragged_tensor.partition.append(
                 schema_pb2.TensorRepresentation.RaggedTensor.Partition(
                     uniform_row_length=dim))
     return result
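
 # A worked sketch of the property above: for a hypothetical instance with
 # _tensor_name == "ragged", _row_partition_dtype == tf.int64, and
 # _unbatched_shape == (None, 3), the result is equivalent to:
 #
 #   ragged_tensor {
 #     feature_path { step: "ragged" }
 #     row_partition_dtype: INT64
 #     partition { uniform_row_length: 3 }
 #   }
 #
 # The None dim contributes no partition; only known dims become
 # uniform_row_length partitions.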
 def testRecordBatchToTensorValuesWithTensorRepresentation(self):
     record_batch = pa.record_batch(
         [pa.array([[1, 2], [2, 3], [3, 4]]),
          pa.array([[0], [1], [1]])], ['feature_1', 'feature_2'])
     tensor_representation = schema_pb2.TensorRepresentation()
     tensor_representation.dense_tensor.column_name = 'feature_1'
     tensor_representation.dense_tensor.shape.dim.append(
         schema_pb2.FixedShape.Dim(size=2))
     actual = util.record_batch_to_tensor_values(
         record_batch, {'feature_1': tensor_representation})
     expected = {
         'feature_1': np.array([[1, 2], [2, 3], [3, 4]]),
         'feature_2': np.array([0, 1, 1])
     }
     self.assertAllClose(actual, expected)
 def testRaiseOnInvalidDefaultValue(self, value_type, default_value_pbtxt,
                                    exception_regexp):
     tensor_representation = text_format.Parse(
         """
               dense_tensor {
                 column_name: "column"
                 shape {}
               }""", schema_pb2.TensorRepresentation())
     tensor_representation.dense_tensor.default_value.CopyFrom(
         text_format.Parse(default_value_pbtxt,
                           schema_pb2.TensorRepresentation.DefaultValue()))
     with self.assertRaisesRegex(ValueError, exception_regexp):
         tensor_adapter.TensorAdapter(
             tensor_adapter.TensorAdapterConfig(
                 pa.schema([pa.field("column", pa.list_(value_type))]),
                 {"tensor": tensor_representation}))
    def test2DSparseTensor(self):
        tensor_representation = text_format.Parse(
            """
        sparse_tensor {
          value_column_name: "values"
          index_column_names: ["d0", "d1"]
          dense_shape {
            dim {
              size: 10
            }
            dim {
              size: 20
            }
          }
        }
        """, schema_pb2.TensorRepresentation())
        record_batch = pa.RecordBatch.from_arrays(
            [
                pa.array([[1], None, [2], [3, 4, 5], []],
                         type=pa.list_(pa.int64())),
                # Also test that the index column can be of an integral type other
                # than int64.
                pa.array([[9], None, [9], [7, 8, 9], []],
                         type=pa.list_(pa.uint32())),
                pa.array([[0], None, [0], [0, 1, 2], []],
                         type=pa.list_(pa.int64()))
            ],
            ["values", "d0", "d1"])
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                record_batch.schema, {"output": tensor_representation}))
        converted = adapter.ToBatchTensors(record_batch)
        self.assertLen(converted, 1)
        self.assertIn("output", converted)
        actual_output = converted["output"]
        self.assertIsInstance(
            actual_output, (tf.SparseTensor, tf.compat.v1.SparseTensorValue))
        self.assertSparseAllEqual(
            tf.compat.v1.SparseTensorValue(dense_shape=[5, 10, 20],
                                           indices=[[0, 9, 0], [2, 9, 0],
                                                    [3, 7, 0], [3, 8, 1],
                                                    [3, 9, 2]],
                                           values=tf.convert_to_tensor(
                                               [1, 2, 3, 4, 5],
                                               dtype=tf.int64)), actual_output)

        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
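
        # A worked trace of how the expected indices above follow from the
        # test data: for batch row 3, values == [3, 4, 5], d0 == [7, 8, 9],
        # d1 == [0, 1, 2], and the i-th value lands at sparse index
        # [row, d0[i], d1[i]]:
        #   3 -> [3, 7, 0],  4 -> [3, 8, 1],  5 -> [3, 9, 2].
        # Rows 0 and 2 each contribute a single value at [row, 9, 0]; row 1
        # (None) and row 4 (empty) contribute nothing. The batch has 5 rows
        # and the declared dense_shape dims are 10 and 20, hence [5, 10, 20].
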
    def testOriginalTypeSpecs(self):
        arrow_schema = pa.schema([pa.field("column1", pa.list_(pa.int32()))])
        tensor_representations = {
            "column1":
            text_format.Parse(
                """
                dense_tensor {
                  column_name: "column1"
                  shape {
                    dim {
                      size: 1
                    }
                  }
                }""", schema_pb2.TensorRepresentation())
        }
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(arrow_schema,
                                               tensor_representations))
        self.assertLen(adapter.TypeSpecs(), 1)
        self.assertEqual(adapter.TypeSpecs(), adapter.OriginalTypeSpecs())

        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                arrow_schema,
                tensor_representations,
                original_type_specs={
                    "column1": tf.TensorSpec(dtype=tf.int32, shape=[None, 1]),
                    "column2": tf.TensorSpec(dtype=tf.int32, shape=[None, 1])
                }))
        self.assertLen(adapter.TypeSpecs(), 1)
        self.assertLen(adapter.OriginalTypeSpecs(), 2)

        with self.assertRaisesRegex(ValueError,
                                    "original_type_specs must be a superset"):
            adapter = tensor_adapter.TensorAdapter(
                tensor_adapter.TensorAdapterConfig(
                    arrow_schema,
                    tensor_representations,
                    original_type_specs={
                        # mismatch spec of column1
                        "column1": tf.TensorSpec(dtype=tf.int64,
                                                 shape=[None, 1]),
                        "column2": tf.TensorSpec(dtype=tf.int32,
                                                 shape=[None, 1])
                    }))
 def testInferTensorRepresentationsFromSchema(
         self, ascii_proto, expected, generate_legacy_feature_spec=False):
     # Skip a test if it's testing legacy logic but the schema is not the
     # legacy schema.
     if not _IS_LEGACY_SCHEMA and generate_legacy_feature_spec:
         print('Skipping test case: ', self.id(), file=sys.stderr)
         return
     schema = text_format.Parse(ascii_proto, schema_pb2.Schema())
     if _IS_LEGACY_SCHEMA:
         schema.generate_legacy_feature_spec = generate_legacy_feature_spec
     expected_protos = {
         k: text_format.Parse(pbtxt, schema_pb2.TensorRepresentation())
         for k, pbtxt in expected.items()
     }
     self.assertEqual(
         expected_protos,
         tensor_representation_util.InferTensorRepresentationsFromSchema(
             schema))
 def testCreateTfExampleParserConfigInvalidDefaultValue(self):
   tensor_representation = text_format.Parse(
       """
               dense_tensor {
                 column_name: "dense_column"
                 shape {
                   dim {
                     size: 1
                   }
                 }
                 default_value {
                   int_value: -1
                 }
               }""", schema_pb2.TensorRepresentation())
   feature_type = schema_pb2.FLOAT
   with self.assertRaisesRegex(
       ValueError, 'FeatureType:.* is incompatible with default_value:.*'):
     tensor_representation_util.CreateTfExampleParserConfig(
         tensor_representation, feature_type)