Beispiel #1
0
    def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
        """Asserts that `record_batch` matches the expected test data.

        Args:
          record_batch: The object under test; must be a `pa.RecordBatch` with
            exactly three rows.
          raw_record_column_name: Optional name of the raw record column. When
            given, that column is skipped in the per-column comparison and is
            instead checked to be the last column, of type
            large_list<large_binary>, holding `_SERIALIZED_EXAMPLES`.
        """
        self.assertIsInstance(record_batch, pa.RecordBatch)
        self.assertEqual(record_batch.num_rows, 3)

        for column_index, field in enumerate(record_batch.schema):
            if field.name == raw_record_column_name:
                # The raw record column is validated after this loop.
                continue

            column = record_batch.column(column_index)
            if field.name != _SEQUENCE_COLUMN_NAME:
                # Regular column: compare directly with the expected array.
                expected_column = _EXPECTED_COLUMN_VALUES[path.ColumnPath(
                    [field.name])]
                self.assertTrue(
                    column.equals(expected_column),
                    "Column {} did not match ({} vs {}).".format(
                        field.name, column, expected_column))
                continue

            # The sequence column is a struct; compare each child array with
            # the expectation stored under [<sequence column>, <child name>].
            self.assertTrue(pa.types.is_struct(field.type))
            child_fields = list(field.type)
            for child_array, child_field in zip(column.flatten(),
                                                child_fields):
                child_path = path.ColumnPath(
                    [_SEQUENCE_COLUMN_NAME, child_field.name])
                expected_child = _EXPECTED_COLUMN_VALUES[child_path]
                self.assertTrue(
                    child_array.equals(expected_child),
                    "Sequence column {} did not match ({} vs {})".format(
                        child_field.name, child_array, expected_child))

        if raw_record_column_name is None:
            return

        self.assertEqual(record_batch.schema.names[-1],
                         raw_record_column_name)
        raw_record_column = record_batch.columns[-1]
        expected_raw_type = pa.large_list(pa.large_binary())
        self.assertTrue(raw_record_column.type.equals(expected_raw_type))
        self.assertEqual(raw_record_column.flatten().to_pylist(),
                         _SERIALIZED_EXAMPLES)
Beispiel #2
0
def _GetSparseTensorRepresentationUsedColumns(
    sparse_tensor_rep: schema_pb2.TensorRepresentation.SparseTensor
) -> List[path.ColumnPath]:
    """Returns the column paths referenced by a SparseTensor representation.

    The index columns come first, in the order they are listed, followed by
    the value column when `value_column_name` is set.
    """
    used_columns = []
    for index_column_name in sparse_tensor_rep.index_column_names:
        used_columns.append(path.ColumnPath(index_column_name))
    if not sparse_tensor_rep.HasField("value_column_name"):
        return used_columns
    used_columns.append(path.ColumnPath(sparse_tensor_rep.value_column_name))
    return used_columns
    def _ProjectTfmdSchema(self,
                           tensor_names: List[Text]) -> schema_pb2.Schema:
        """Projects self._schema by the given tensor names.

        Args:
          tensor_names: Names of the tensors to keep. Each must be a key of
            `self.TensorRepresentations()`.

        Returns:
          A new Schema holding only the features of `self._schema` that are
          source columns of the requested tensors, together with the
          TensorRepresentations of those tensors.

        Raises:
          ValueError: If any of `tensor_names` has no TensorRepresentation.
        """
        tensor_representations = self.TensorRepresentations()
        tensor_names = set(tensor_names)
        if not tensor_names.issubset(tensor_representations):
            # Subtract the key view rather than the mapping itself:
            # `set - dict` raises TypeError, which would mask this ValueError.
            raise ValueError(
                "Unable to project {} because they were not in the original "
                "TensorRepresentations.".format(
                    tensor_names - tensor_representations.keys()))
        # Collect every source column needed by the requested tensors.
        paths = set()
        for tensor_name in tensor_names:
            paths.update(
                tensor_rep_util.GetSourceColumnsFromTensorRepresentation(
                    tensor_representations[tensor_name]))
        result = schema_pb2.Schema()
        # Note: We only copy projected features into the new schema because the
        # coder, and ArrowSchema() only care about Schema.feature. If they start
        # depending on other Schema fields then those fields must also be projected.
        for f in self._schema.feature:
            if path.ColumnPath(f.name) in paths:
                result.feature.add().CopyFrom(f)

        tensor_rep_util.SetTensorRepresentationsInSchema(
            result, {
                k: v
                for k, v in tensor_representations.items() if k in tensor_names
            })

        return result
 def testGetSourceValueColumnFromTensorRepresentation(
         self, pbtxt, expected):
     """Checks the value column extracted from a text-format representation.

     Args:
       pbtxt: Text-format TensorRepresentation proto to parse.
       expected: Argument for `path.ColumnPath` describing the expected
         source value column.
     """
     expected_path = path.ColumnPath(expected)
     tensor_representation = text_format.Parse(
         pbtxt, schema_pb2.TensorRepresentation())
     get_value_column = (
         tensor_representation_util.
         GetSourceValueColumnFromTensorRepresentation)
     self.assertEqual(expected_path,
                      get_value_column(tensor_representation))
Beispiel #5
0
 def BaseCanHandle(
         arrow_schema: pa.Schema,
         tensor_representation: schema_pb2.TensorRepresentation) -> bool:
     """Returns whether the dense tensor's source column can be handled.

     The column named by `tensor_representation.dense_tensor.column_name` must
     have a nest depth of exactly 1 and a supported Arrow value type.
     """
     column_path = path.ColumnPath(
         tensor_representation.dense_tensor.column_name)
     nest_depth, leaf_type = _GetNestDepthAndValueType(
         arrow_schema, column_path)
     if nest_depth != 1:
         # Can only handle 1-nested lists.
         return False
     return _IsSupportedArrowValueType(leaf_type)
Beispiel #6
0
 def CanHandle(arrow_schema: pa.Schema,
               tensor_representation: schema_pb2.TensorRepresentation) -> bool:
   """Returns whether the varlen sparse tensor's source column can be handled.

   The column named by `varlen_sparse_tensor.column_name` must have a nest
   depth of exactly 1 and a supported Arrow value type.
   """
   source_column = tensor_representation.varlen_sparse_tensor.column_name
   nest_depth, leaf_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath([source_column]))
   if nest_depth != 1:
     # Currently can only handle 1-nested lists, but can easily support
     # arbitrarily nested ListArrays.
     return False
   return _IsSupportedArrowValueType(leaf_type)
Beispiel #7
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Initializes the handler for a varlen sparse tensor representation.

   Records the index of the source column in `arrow_schema`, the TF dtype
   derived from the column's value type, and the convert-to-binary function
   selected for that value type.
   """
   super().__init__(arrow_schema, tensor_representation)
   source_column = tensor_representation.varlen_sparse_tensor.column_name
   self._column_index = arrow_schema.get_field_index(source_column)
   # Only the value type is needed here; the nest depth is discarded.
   _, leaf_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(source_column))
   self._dtype = _ArrowTypeToTfDtype(leaf_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(leaf_type)
Beispiel #8
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Initializes the handler and computes the default fill value.

   The fill is built from the unbatched shape (`self._shape[1:]`), the value
   type of the dense tensor's source column, and the representation's
   `default_value`.
   """
   super().__init__(arrow_schema, tensor_representation)
   dense_tensor = tensor_representation.dense_tensor
   source_path = path.ColumnPath(dense_tensor.column_name)
   _, leaf_type = _GetNestDepthAndValueType(arrow_schema, source_path)
   # NOTE(review): self._shape is presumably set by the base initializer
   # above; it is not assigned in this method.
   unbatched_shape = self._shape[1:]
   self._default_fill = _GetDefaultFill(
       unbatched_shape, leaf_type, dense_tensor.default_value)
Beispiel #9
0
  def CanHandle(arrow_schema: pa.Schema,
                tensor_representation: schema_pb2.TensorRepresentation) -> bool:
    """Returns whether `tensor_representation` can be handled.

    A sparse tensor representation is handled when:
      * it has one index column per dense shape dimension,
      * every dense shape dimension has a positive size,
      * every index column is a 1-nested list of an integer type, and
      * the value column is a 1-nested list of a supported value type.
    """
    sparse_rep = tensor_representation.sparse_tensor
    dims = sparse_rep.dense_shape.dim
    index_column_names = sparse_rep.index_column_names

    if len(dims) != len(index_column_names):
      return False
    if any(dim.size <= 0 for dim in dims):
      return False

    # All the index columns must be of integral types.
    for index_column_name in index_column_names:
      index_depth, index_type = _GetNestDepthAndValueType(
          arrow_schema, path.ColumnPath(index_column_name))
      if not (index_depth == 1 and pa.types.is_integer(index_type)):
        return False

    value_depth, value_type = _GetNestDepthAndValueType(
        arrow_schema, path.ColumnPath(sparse_rep.value_column_name))
    if value_depth != 1:
      return False
    return _IsSupportedArrowValueType(value_type)
def ProjectTensorRepresentationsInSchema(
        schema: schema_pb2.Schema,
        tensor_names: Iterable[str]) -> schema_pb2.Schema:
    """Returns a projection of schema by the given tensor names.

    TensorRepresentations are read from `schema`, or inferred from it when it
    specifies none. The returned schema keeps only the TensorRepresentations
    named in `tensor_names` and the features that are their source columns.

    Args:
      schema: A TFMD Schema to be projected.
      tensor_names: Names of tensors that schema must be projected on.

    Returns:
      A schema that contains the subset of TensorRepresentations and features
      of `schema` needed to produce the given tensors.

    Raises:
      ValueError: if any of `tensor_names` has no TensorRepresentation in
        `schema`, or if a source column of a requested TensorRepresentation
        is not present in `schema` features.
    """
    tensor_representations = GetTensorRepresentationsFromSchema(schema)
    if tensor_representations is None:
        tensor_representations = InferTensorRepresentationsFromSchema(schema)

    tensor_names = set(tensor_names)
    if not tensor_names.issubset(tensor_representations):
        unknown_names = tensor_names - tensor_representations.keys()
        raise ValueError(
            "Unable to project {} because they were not in the original "
            "or inferred TensorRepresentations.".format(unknown_names))

    # Source columns still to be located among the schema's features.
    pending_paths = set()
    for name in tensor_names:
        representation = tensor_representations[name]
        pending_paths.update(
            GetSourceColumnsFromTensorRepresentation(representation))

    projected = schema_pb2.Schema()
    for feature in schema.feature:
        feature_path = path.ColumnPath(feature.name)
        if feature_path not in pending_paths:
            continue
        pending_paths.remove(feature_path)
        projected.feature.add().CopyFrom(feature)

    # Whatever is left over was requested but never found in the schema.
    if pending_paths:
        raise ValueError(
            "TensorRepresentations source columns {} are not present "
            "in the schema.".format(pending_paths))

    kept_representations = {
        name: representation
        for name, representation in tensor_representations.items()
        if name in tensor_names
    }
    SetTensorRepresentationsInSchema(projected, kept_representations)

    return projected
def _GetExpectedColumnValues(tfxio):
    """Returns the expected Arrow arrays keyed by column path.

    Large list/binary Arrow types are used when
    `tfxio._can_produce_large_types` is truthy; the regular list/binary types
    are used otherwise.
    """
    use_large_types = tfxio._can_produce_large_types
    list_factory = pa.large_list if use_large_types else pa.list_
    bytes_type = pa.large_binary() if use_large_types else pa.binary()

    int_values = pa.array([[1], [2], [3]], type=list_factory(pa.int64()))
    float_values = pa.array(
        [[1, 2, 3, 4], [2, 3, 4, 5], None],
        type=list_factory(pa.float32()))
    sequence_int_values = pa.array(
        [[[1, 2], [3]], None, [[4]]],
        list_factory(list_factory(pa.int64())))
    sequence_string_values = pa.array(
        [None, [[b"foo", b"bar"], []], [[b"baz"]]],
        list_factory(list_factory(bytes_type)))

    return {
        path.ColumnPath(["int_feature"]):
        int_values,
        path.ColumnPath(["float_feature"]):
        float_values,
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
        sequence_int_values,
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
        sequence_string_values,
    }
Beispiel #12
0
    def _ProjectTfmdSchema(self,
                           tensor_names: List[Text]) -> schema_pb2.Schema:
        """Projects self._schema by the given tensor names.

        Top-level features are kept when they are a source column of a
        requested tensor. The sequence struct feature (the one named
        `_SEQUENCE_COLUMN_NAME`) is kept only if at least one of its child
        features is a source column, and then contains only those children.

        Args:
          tensor_names: Names of the tensors to keep. Each must be a key of
            `self.TensorRepresentations()`.

        Returns:
          A new Schema with the projected features and the
          TensorRepresentations of the requested tensors.

        Raises:
          ValueError: If any of `tensor_names` has no TensorRepresentation, or
            if the sequence feature is not of type STRUCT.
        """
        tensor_representations = self.TensorRepresentations()
        tensor_names = set(tensor_names)
        if not tensor_names.issubset(tensor_representations):
            # Subtract the key view rather than the mapping itself:
            # `set - dict` raises TypeError, which would mask this ValueError.
            raise ValueError(
                "Unable to project {} because they were not in the original "
                "TensorRepresentations.".format(
                    tensor_names - tensor_representations.keys()))
        # Collect every source column needed by the requested tensors.
        used_paths = set()
        for tensor_name in tensor_names:
            used_paths.update(
                tensor_representation_util.
                GetSourceColumnsFromTensorRepresentation(
                    tensor_representations[tensor_name]))
        result = schema_pb2.Schema()
        # Note: We only copy projected features into the new schema because the
        # coder, and ArrowSchema() only care about Schema.feature. If they start
        # depending on other Schema fields then those fields must also be projected.
        for f in self._schema.feature:
            p = path.ColumnPath(f.name)
            if f.name == _SEQUENCE_COLUMN_NAME:
                if f.type != schema_pb2.STRUCT:
                    raise ValueError(
                        "Feature {} was expected to be of type STRUCT, but got {}"
                        .format(f.name, f))
                # Copy the struct feature without its children, then add back
                # only the children that are actually used.
                result_sequence_struct = schema_pb2.Feature()
                result_sequence_struct.CopyFrom(f)
                result_sequence_struct.ClearField("struct_domain")
                any_sequence_feature_projected = False
                for sf in f.struct_domain.feature:
                    sequence_feature_path = p.child(sf.name)
                    if sequence_feature_path in used_paths:
                        any_sequence_feature_projected = True
                        result_sequence_struct.struct_domain.feature.add(
                        ).CopyFrom(sf)
                # Drop the struct entirely when none of its children is used.
                if any_sequence_feature_projected:
                    result.feature.add().CopyFrom(result_sequence_struct)
            elif p in used_paths:
                result.feature.add().CopyFrom(f)

        tensor_representation_util.SetTensorRepresentationsInSchema(
            result, {
                k: v
                for k, v in tensor_representations.items() if k in tensor_names
            })

        return result
Beispiel #13
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Initializes the handler for a sparse tensor representation.

   Records the positions of the index and value columns in `arrow_schema`,
   the dense shape, the TF dtype derived from the value column's Arrow type,
   and the convert-to-binary function selected for that type.
   """
   super().__init__(arrow_schema, tensor_representation)
   sparse_rep = tensor_representation.sparse_tensor
   value_column_name = sparse_rep.value_column_name

   self._index_column_indices = tuple(
       arrow_schema.get_field_index(index_column_name)
       for index_column_name in sparse_rep.index_column_names)
   self._value_column_index = arrow_schema.get_field_index(value_column_name)
   self._shape = [dim.size for dim in sparse_rep.dense_shape.dim]

   # Only the value type is needed here; the nest depth is discarded.
   _, leaf_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(value_column_name))
   self._dtype = _ArrowTypeToTfDtype(leaf_type)
   # One more entry than the number of dense shape dimensions.
   self._coo_size = len(self._shape) + 1
   self._convert_to_binary_fn = _GetConvertToBinaryFn(leaf_type)
Beispiel #14
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Initializes the handler for a dense tensor representation.

   Records the source column's index in `arrow_schema`, the TF dtype derived
   from its value type, the convert-to-binary function selected for that
   type, the shape with a leading `None` entry prepended, and the number of
   values in one unbatched tensor.
   """
   super().__init__(arrow_schema, tensor_representation)
   dense_rep = tensor_representation.dense_tensor
   source_column = dense_rep.column_name
   self._column_index = arrow_schema.get_field_index(source_column)

   # Only the value type is needed here; the nest depth is discarded.
   _, leaf_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(source_column))
   self._dtype = _ArrowTypeToTfDtype(leaf_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(leaf_type)

   per_example_shape = [dim.size for dim in dense_rep.shape.dim]
   self._shape = [None] + per_example_shape
   # `initial=1` makes the product 1 when the shape has no dimensions.
   self._unbatched_flat_len = int(np.prod(per_example_shape, initial=1))
Beispiel #15
0
    "float_value": schema_pb2.FLOAT,
}


def _GetSparseTensorRepresentationUsedColumns(
    sparse_tensor_rep: schema_pb2.TensorRepresentation.SparseTensor
) -> List[path.ColumnPath]:
    """Lists the column paths a SparseTensor representation reads from.

    Index columns are listed first, in their declared order; the value column
    is appended last, but only when `value_column_name` is set.
    """
    column_paths = list(
        map(path.ColumnPath, sparse_tensor_rep.index_column_names))
    has_value_column = sparse_tensor_rep.HasField("value_column_name")
    if has_value_column:
        value_path = path.ColumnPath(sparse_tensor_rep.value_column_name)
        column_paths.append(value_path)
    return column_paths


# Maps a TensorRepresentation kind (the name of the sub-message the getter
# reads) to a function returning the list of column paths that representation
# uses. The `None` entry returns no columns.
# NOTE(review): the key is presumably obtained via WhichOneof on the
# representation -- confirm against the caller.
_TENSOR_REPRESENTATION_KIND_TO_COLUMNS_GETTER = {
    "dense_tensor": lambda rep: [
        path.ColumnPath(rep.dense_tensor.column_name)
    ],
    "varlen_sparse_tensor": lambda rep: [
        path.ColumnPath(rep.varlen_sparse_tensor.column_name)
    ],
    "sparse_tensor": lambda rep: (
        _GetSparseTensorRepresentationUsedColumns(rep.sparse_tensor)
    ),
    "ragged_tensor": lambda rep: [
        path.ColumnPath.from_proto(rep.ragged_tensor.feature_path)
    ],
    None: lambda _: [],
}


def SetTensorRepresentationsInSchema(
    schema: schema_pb2.Schema,
    tensor_representations: Mapping[Text, schema_pb2.TensorRepresentation],
    tensor_representation_group_name: Text = _DEFAULT_TENSOR_REPRESENTATION_GROUP
Beispiel #16
0
      key: "string_feature"
      value {
        feature { bytes_list { value: ["baz"] } }
      }
    }
  }
""",
]

# Each text-format proto in _EXAMPLES, parsed as a tf.train.SequenceExample
# and serialized to bytes, in the same order.
_SERIALIZED_EXAMPLES = [
    text_format.Parse(
        example_pbtxt, tf.train.SequenceExample()).SerializeToString()
    for example_pbtxt in _EXAMPLES
]

# Expected Arrow arrays keyed by column path. Top-level features use a
# single-step path; sequence features are keyed by
# [_SEQUENCE_COLUMN_NAME, <feature name>].
_EXPECTED_COLUMN_VALUES = {
    path.ColumnPath(["int_feature"]): pa.array(
        [[1], [2], [3]],
        type=pa.large_list(pa.int64()),
    ),
    path.ColumnPath(["float_feature"]): pa.array(
        [[1, 2, 3, 4], [2, 3, 4, 5], None],
        type=pa.large_list(pa.float32()),
    ),
    path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]): pa.array(
        [[[1, 2], [3]], None, [[4]]],
        type=pa.large_list(pa.large_list(pa.int64())),
    ),
    path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]): pa.array(
        [None, [[b"foo", b"bar"], []], [[b"baz"]]],
        type=pa.large_list(pa.large_list(pa.large_binary())),
    ),
}


def _WriteInputs(filename):
    with tf.io.TFRecordWriter(filename, "GZIP") as w: