Ejemplo n.º 1
0
    def _ProjectTfmdSchema(self,
                           tensor_names: List[Text]) -> schema_pb2.Schema:
        """Projects self._schema by the given tensor names.

        Args:
          tensor_names: names of the tensors to keep. Each one must be a key
            of `self.TensorRepresentations()`.

        Returns:
          A new Schema containing only the features of `self._schema` whose
          `path.ColumnPath(name)` is among the source columns reported for
          the requested tensors, after the requested TensorRepresentations
          have been passed to `SetTensorRepresentationsInSchema`.

        Raises:
          ValueError: if any requested name has no TensorRepresentation.
        """
        tensor_representations = self.TensorRepresentations()
        tensor_names = set(tensor_names)
        # `tensor_representations` is used as a mapping below, and
        # `set - mapping` raises TypeError (the `-` operator only accepts
        # sets). `difference()` accepts any iterable, so the intended
        # ValueError is what callers actually see.
        missing_names = tensor_names.difference(tensor_representations)
        if missing_names:
            raise ValueError(
                "Unable to project {} because they were not in the original "
                "TensorRepresentations.".format(missing_names))
        # Collect every source column referenced by the requested tensors.
        paths = set()
        for tensor_name in tensor_names:
            paths.update(
                tensor_rep_util.GetSourceColumnsFromTensorRepresentation(
                    tensor_representations[tensor_name]))
        result = schema_pb2.Schema()
        # Note: We only copy projected features into the new schema because the
        # coder, and ArrowSchema() only care about Schema.feature. If they start
        # depending on other Schema fields then those fields must also be projected.
        for f in self._schema.feature:
            if path.ColumnPath(f.name) in paths:
                result.feature.add().CopyFrom(f)

        # Only the representations of the requested tensors are carried over.
        tensor_rep_util.SetTensorRepresentationsInSchema(
            result, {
                k: v
                for k, v in tensor_representations.items() if k in tensor_names
            })

        return result
Ejemplo n.º 2
0
    def testProjection(self):
        """Projects a ParquetTFXIO onto one tensor and checks the result."""
        # Work on a copy of _UNORDERED_SCHEMA rather than on the shared object.
        base_schema = schema_pb2.Schema()
        base_schema.CopyFrom(_UNORDERED_SCHEMA)

        string_representation = schema_pb2.TensorRepresentation(
            dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                column_name="string_feature"))
        float_dense_shape = schema_pb2.FixedShape(
            dim=[schema_pb2.FixedShape.Dim(size=10)])
        float_representation = schema_pb2.TensorRepresentation(
            sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                dense_shape=float_dense_shape,
                index_column_names=["int_feature"],
                value_column_name="float_feature"))
        tensor_representation_util.SetTensorRepresentationsInSchema(
            base_schema, {
                "string_tensor": string_representation,
                "float_tensor": float_representation,
            })

        source_tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=base_schema,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

        # Keep only the sparse float tensor.
        projected = source_tfxio.Project(["float_tensor"])

        # The projected TFXIO must report the projected Arrow schema.
        arrow_schema_matches = projected.ArrowSchema().equals(
            _EXPECTED_PROJECTED_ARROW_SCHEMA)
        self.assertTrue(arrow_schema_matches)

        def _AssertFn(record_batch_list):
            # Exactly one batch is expected from the source.
            self.assertLen(record_batch_list, 1)
            batch = record_batch_list[0]
            self._ValidateRecordBatch(batch, _EXPECTED_PROJECTED_ARROW_SCHEMA)
            projected_arrow_schema = projected.ArrowSchema()
            # Column names first, then column types, must match the schema
            # reported by the projected TFXIO.
            for attribute in ("names", "types"):
                actual = getattr(batch.schema, attribute)
                expected = getattr(projected_arrow_schema, attribute)
                self.assertListEqual(
                    actual, expected,
                    "actual: {}; expected: {}".format(actual, expected))
            # Only the projected tensor may come out of the adapter.
            tensors = projected.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 1)
            self.assertIn("float_tensor", tensors)

        pipeline = beam.Pipeline()
        batches = pipeline | projected.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(batches, _AssertFn)
        run_result = pipeline.run()
        run_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, run_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Ejemplo n.º 3
0
    def _ProjectTfmdSchema(self,
                           tensor_names: List[Text]) -> schema_pb2.Schema:
        """Projects self._schema by the given tensor names.

        Top-level features are kept when their column path is used by one of
        the requested tensors. The feature named `_SEQUENCE_COLUMN_NAME` is
        handled specially: it must be a STRUCT, and only those of its child
        features whose path is used are kept. The struct itself is dropped
        when none of its children is used.

        Args:
          tensor_names: names of the tensors to keep. Each one must be a key
            of `self.TensorRepresentations()`.

        Returns:
          A new Schema with the projected features, after the requested
          TensorRepresentations have been passed to
          `SetTensorRepresentationsInSchema`.

        Raises:
          ValueError: if any requested name has no TensorRepresentation, or
            if the feature named `_SEQUENCE_COLUMN_NAME` is not a STRUCT.
        """
        tensor_representations = self.TensorRepresentations()
        tensor_names = set(tensor_names)
        # `tensor_representations` is used as a mapping below, and
        # `set - mapping` raises TypeError (the `-` operator only accepts
        # sets). `difference()` accepts any iterable, so the intended
        # ValueError is what callers actually see.
        missing_names = tensor_names.difference(tensor_representations)
        if missing_names:
            raise ValueError(
                "Unable to project {} because they were not in the original "
                "TensorRepresentations.".format(missing_names))
        # Collect every source column referenced by the requested tensors.
        used_paths = set()
        for tensor_name in tensor_names:
            used_paths.update(
                tensor_representation_util.
                GetSourceColumnsFromTensorRepresentation(
                    tensor_representations[tensor_name]))
        result = schema_pb2.Schema()
        # Note: We only copy projected features into the new schema because the
        # coder, and ArrowSchema() only care about Schema.feature. If they start
        # depending on other Schema fields then those fields must also be projected.
        for f in self._schema.feature:
            p = path.ColumnPath(f.name)
            if f.name == _SEQUENCE_COLUMN_NAME:
                if f.type != schema_pb2.STRUCT:
                    raise ValueError(
                        "Feature {} was expected to be of type STRUCT, but got {}"
                        .format(f.name, f))
                # Copy the struct without its children, then re-add only the
                # children that are actually used.
                result_sequence_struct = schema_pb2.Feature()
                result_sequence_struct.CopyFrom(f)
                result_sequence_struct.ClearField("struct_domain")
                any_sequence_feature_projected = False
                for sf in f.struct_domain.feature:
                    sequence_feature_path = p.child(sf.name)
                    if sequence_feature_path in used_paths:
                        any_sequence_feature_projected = True
                        result_sequence_struct.struct_domain.feature.add(
                        ).CopyFrom(sf)
                # An empty struct is not added to the projected schema.
                if any_sequence_feature_projected:
                    result.feature.add().CopyFrom(result_sequence_struct)
            elif p in used_paths:
                result.feature.add().CopyFrom(f)

        # Only the representations of the requested tensors are carried over.
        tensor_representation_util.SetTensorRepresentationsInSchema(
            result, {
                k: v
                for k, v in tensor_representations.items() if k in tensor_names
            })

        return result
Ejemplo n.º 4
0
  def _ProjectTfmdSchemaTensorRepresentation(
      self, tensor_names: List[Text]) -> schema_pb2.Schema:
    """Builds a schema whose tensor representations cover only tensor_names.

    Args:
      tensor_names: names of the tensors to keep. Names that are not keys of
        `self.TensorRepresentations()` are ignored.

    Returns:
      A copy of `self._schema` on which `SetTensorRepresentationsInSchema` was
      called with only the chosen tensor representations.
    """
    all_representations = self.TensorRepresentations()
    wanted_names = set(tensor_names)

    # Every column stays in the schema: the csv decoder has to decode all
    # columns regardless of which tensors are requested.
    projected_schema = schema_pb2.Schema()
    projected_schema.CopyFrom(self._schema)

    # Restrict the tensor representations to the chosen names, so that only
    # the projected columns are produced as output tensors.
    kept_representations = {}
    for name, representation in all_representations.items():
      if name in wanted_names:
        kept_representations[name] = representation
    tensor_representation_util.SetTensorRepresentationsInSchema(
        projected_schema, kept_representations)

    return projected_schema