Esempio n. 1
0
    def testSubsetOfColumnNamesWithSubsetSchema(self):
        """Tests reading when column names and schema both list only `int_feature`.

        The schema is parsed from a text proto that declares a single INT
        feature, and the only requested column is that same feature. The
        emitted record batch is validated against a one-field Arrow schema.
        """
        schema = schema_pb2.Schema()
        # text_format.Parse merges the parsed text into `schema` in place.
        text_format.Parse(
            """
      feature {
      name: "int_feature"
      type: INT
      value_count {
        min: 0
        max: 2
      }
      }
      """, schema)

        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=["int_feature"],
            schema=schema,
        )

        def _VerifyBatches(batches):
            # Exactly one record batch is expected from the source.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(
                batches[0],
                pa.schema([
                    pa.field("int_feature", pa.large_list(pa.int64())),
                ]))

        with beam.Pipeline() as pipeline:
            batches_pcoll = pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS)
            beam_testing_util.assert_that(batches_pcoll, _VerifyBatches)
Esempio n. 2
0
    def testProjection(self):
        """Tests projecting a TFXIO down to a single tensor.

        Two tensor representations ("string_tensor" and "float_tensor") are
        attached to a copy of `_UNORDERED_SCHEMA`; the TFXIO is then projected
        onto "float_tensor" only. The test checks the projected Arrow schema,
        the emitted record batch, the tensors produced by the projected
        TensorAdapter, and finally the telemetry metrics of the pipeline run.
        """
        dense_string = schema_pb2.TensorRepresentation(
            dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                column_name="string_feature"))
        sparse_float = schema_pb2.TensorRepresentation(
            sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                dense_shape=schema_pb2.FixedShape(
                    dim=[schema_pb2.FixedShape.Dim(size=10)]),
                index_column_names=["int_feature"],
                value_column_name="float_feature"))

        # Work on a copy so the module-level schema is left unmodified.
        schema = schema_pb2.Schema()
        schema.CopyFrom(_UNORDERED_SCHEMA)
        tensor_representation_util.SetTensorRepresentationsInSchema(
            schema,
            {
                "string_tensor": dense_string,
                "float_tensor": sparse_float,
            })

        projected_tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=schema,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
        ).Project(["float_tensor"])

        # The projected TFXIO must expose the projected Arrow schema.
        projected_arrow_schema = projected_tfxio.ArrowSchema()
        self.assertTrue(
            projected_arrow_schema.equals(_EXPECTED_PROJECTED_ARROW_SCHEMA))

        def _VerifyBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(batch, _EXPECTED_PROJECTED_ARROW_SCHEMA)

            # Compare names and types against the schema the TFXIO reports.
            reported_schema = projected_tfxio.ArrowSchema()
            actual_names = batch.schema.names
            wanted_names = reported_schema.names
            self.assertListEqual(
                actual_names, wanted_names,
                "actual: {}; expected: {}".format(actual_names, wanted_names))
            actual_types = batch.schema.types
            wanted_types = reported_schema.types
            self.assertListEqual(
                actual_types, wanted_types,
                "actual: {}; expected: {}".format(actual_types, wanted_types))

            # Only the projected tensor should come out of the adapter.
            tensors = projected_tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 1)
            self.assertIn("float_tensor", tensors)

        # The pipeline is run explicitly (not via `with`) because the result
        # object is needed for the metrics validation below.
        pipeline = beam.Pipeline()
        beam_testing_util.assert_that(
            pipeline | projected_tfxio.BeamSource(batch_size=_NUM_ROWS),
            _VerifyBatches)
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Esempio n. 3
0
    def testOptionalColumnNamesAndSchema(self):
        """Tests reading when neither a schema nor column names are given.

        Only the file pattern is passed to ParquetTFXIO; the emitted record
        batch is validated against `_EXPECTED_ARROW_SCHEMA`.
        """
        tfxio = ParquetTFXIO(file_pattern=self._example_file)

        def _VerifyBatches(batches):
            # Exactly one record batch is expected from the source.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

        with beam.Pipeline() as pipeline:
            batches_pcoll = pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS)
            beam_testing_util.assert_that(batches_pcoll, _VerifyBatches)
Esempio n. 4
0
    def testUnorderedSchema(self):
        """Tests reading with `_UNORDERED_SCHEMA` and the full column list.

        The emitted record batch is validated against
        `_EXPECTED_ARROW_SCHEMA`.
        """
        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=_UNORDERED_SCHEMA,
        )

        def _VerifyBatches(batches):
            # Exactly one record batch is expected from the source.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

        with beam.Pipeline() as pipeline:
            batches_pcoll = pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS)
            beam_testing_util.assert_that(batches_pcoll, _VerifyBatches)
Esempio n. 5
0
    def testSubsetOfColumnNamesWithCompleteSchema(self):
        """Tests reading when column names are a subset of the schema features.

        Only `int_feature` is requested while `_SCHEMA` is passed as the
        schema; the emitted record batch is validated against a one-field
        Arrow schema.
        """
        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=["int_feature"],
            schema=_SCHEMA,
        )

        def _VerifyBatches(batches):
            # Exactly one record batch is expected from the source.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(
                batches[0],
                pa.schema([
                    pa.field("int_feature", pa.large_list(pa.int64())),
                ]))

        with beam.Pipeline() as pipeline:
            batches_pcoll = pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS)
            beam_testing_util.assert_that(batches_pcoll, _VerifyBatches)
Esempio n. 6
0
    def testImplicitTensorRepresentations(self):
        """Tests the tensor representations reported for `_UNORDERED_SCHEMA`.

        Each of the three features is expected to map to a
        `varlen_sparse_tensor` over its own column. The test also checks the
        emitted record batch, the tensors produced by the TensorAdapter, and
        the telemetry metrics of the pipeline run.
        """

        def _ParseRepresentation(text_proto):
            # Builds a TensorRepresentation from its text-proto form.
            return text_format.Parse(text_proto,
                                     schema_pb2.TensorRepresentation())

        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=_UNORDERED_SCHEMA,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
        )

        expected_representations = {
            "int_feature":
            _ParseRepresentation(
                """varlen_sparse_tensor { column_name: "int_feature"}"""),
            "float_feature":
            _ParseRepresentation(
                """varlen_sparse_tensor { column_name: "float_feature"}"""),
            "string_feature":
            _ParseRepresentation(
                """varlen_sparse_tensor { column_name: "string_feature" }"""),
        }
        self.assertEqual(expected_representations,
                         tfxio.TensorRepresentations())

        def _VerifyBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(batch, _EXPECTED_ARROW_SCHEMA)
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))

            # One tensor per feature is expected from the adapter.
            tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 3)
            self.assertIn("int_feature", tensors)
            self.assertIn("float_feature", tensors)
            self.assertIn("string_feature", tensors)

        # The pipeline is run explicitly (not via `with`) because the result
        # object is needed for the metrics validation below.
        pipeline = beam.Pipeline()
        beam_testing_util.assert_that(
            pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS), _VerifyBatches)
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Esempio n. 7
0
    def testOptionalSchema(self):
        """Tests reading when column names are given but no schema is.

        Checks the Arrow schema reported by the TFXIO, the emitted record
        batch, and the telemetry metrics of the pipeline run.
        """
        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
        )

        reported_schema = tfxio.ArrowSchema()
        self.assertEqual(reported_schema, _EXPECTED_ARROW_SCHEMA)

        def _VerifyBatches(batches):
            # Exactly one record batch is expected from the source.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

        # The pipeline is run explicitly (not via `with`) because the result
        # object is needed for the metrics validation below.
        pipeline = beam.Pipeline()
        beam_testing_util.assert_that(
            pipeline | tfxio.BeamSource(batch_size=_NUM_ROWS), _VerifyBatches)
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Esempio n. 8
0
    def testExplicitTensorRepresentations(self):
        """Tests when the tensor representation is explicitly set in the schema.

        A single dense representation named "my_feature" over the
        `string_feature` column is stored in the default ("")
        tensor-representation group of a copy of `_SCHEMA`; the TFXIO is
        expected to report exactly that mapping.
        """
        my_feature_representation = text_format.Parse(
            """
          dense_tensor {
           column_name: "string_feature"
           shape { dim { size: 1 } }
           default_value { bytes_value: "abc" }
         }""", schema_pb2.TensorRepresentation())
        tensor_representations = {"my_feature": my_feature_representation}

        # Work on a copy so the module-level schema is left unmodified.
        schema = schema_pb2.Schema()
        schema.CopyFrom(_SCHEMA)
        default_group = schema_pb2.TensorRepresentationGroup(
            tensor_representation=tensor_representations)
        schema.tensor_representation_group[""].CopyFrom(default_group)

        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=schema,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
        )
        self.assertEqual(tensor_representations, tfxio.TensorRepresentations())