def _make_2d_varlen_sparse_tensor_test_cases():
  result = []
  for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
    if tf_type == tf.string:
      values = tf.constant([b"1", b"2", b"3"], dtype=tf.string)
      expected_array = pa.array([[b"1"], [], [b"2", b"3"], []],
                                type=pa.large_list(arrow_type))
    else:
      values = tf.constant([1, 2, 3], dtype=tf_type)
      expected_array = pa.array([[1], [], [2, 3], []],
                                type=pa.large_list(arrow_type))
    result.append(
        dict(
            testcase_name="2d_varlen_sparse_tensor_%s" % tf_type.name,
            type_specs={"sp": tf.SparseTensorSpec([None, None], tf_type)},
            expected_schema={"sp": pa.large_list(arrow_type)},
            expected_tensor_representations={
                "sp": """varlen_sparse_tensor { column_name: "sp" }""",
            },
            tensor_input={
                "sp":
                    tf.SparseTensor(
                        values=values,
                        indices=[[0, 0], [2, 0], [2, 1]],
                        dense_shape=[4, 2]),
            },
            expected_record_batch={"sp": expected_array}))
  return result
def arrow_fields(self) -> List[pa.Field]:
  return ([
      pa.field(self._value_column_name,
               pa.large_list(self._values_arrow_type))
  ] + [
      pa.field(n, pa.large_list(pa.int64()))
      for n in self._index_column_names
  ])
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()
    with pytest.raises(TypeError):
        pa.large_list(None)
def _make_2d_dense_tensor_test_cases():
  result = []
  for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
    if tf_type == tf.string:
      tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]], dtype=tf.string)
      expected_array = pa.array([[b"1", b"2"], [b"3", b"4"]],
                                type=pa.large_list(arrow_type))
    else:
      tensor = tf.constant([[1, 2], [3, 4]], dtype=tf_type)
      expected_array = pa.array([[1, 2], [3, 4]],
                                type=pa.large_list(arrow_type))
    result.append(
        dict(
            testcase_name="2d_dense_tensor_%s" % tf_type.name,
            type_specs={"dt": tf.TensorSpec([None, 2], tf_type)},
            expected_schema={"dt": pa.large_list(arrow_type)},
            expected_tensor_representations={
                "dt": """dense_tensor {
                           column_name: "dt"
                           shape { dim { size: 2 } }
                         }""",
            },
            tensor_input={"dt": tensor},
            expected_record_batch={"dt": expected_array}))
  return result
def test_nested_large_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.large_list(pa.int16())
    assert arr.to_pylist() == data
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()
    assert ty.value_field == pa.field("item", pa.utf8(), nullable=True)
    with pytest.raises(TypeError):
        pa.large_list(None)
def convert_to_arrow(
    schema: schema_pb2.Schema,
    converter: tensor_to_arrow.TensorsToRecordBatchConverter,
    fetches: Dict[str, common_types.TensorValueType]
) -> Tuple[List[pa.Array], pa.Schema]:
  """Converts fetches to a list of pyarrow arrays and schema.

  Maps the values fetched by `tf.Session.run` or returned by a tf.function to
  pyarrow format.

  Args:
    schema: A `Schema` proto.
    converter: A `tf.Tensor` to `pa.RecordBatch` converter that contains
      `tf.TypeSpec`s of `FixedLen` and `VarLen` features. Note that the
      converter doesn't support general `SparseFeature`s; those are handled
      here.
    fetches: A dict representing a batch of data, either as returned by
      `Session.run` or eager tensors.

  Returns:
    A tuple of a list of pyarrow arrays and a schema representing fetches.

  Raises:
    ValueError: If batch sizes are inconsistent.
  """
  tensors = {}
  sparse_arrays = []
  sparse_fields = []
  feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
  for name, tensor_or_value in fetches.items():
    feature_spec = feature_specs[name]
    if isinstance(feature_spec, tf.io.SparseFeature):
      sparse_components = _handle_sparse_batch(tensor_or_value, feature_spec,
                                               name)
      values_type = pa.large_list(_tf_dtype_to_arrow_type(feature_spec.dtype))
      indices_type = pa.large_list(pa.int64())
      sparse_arrays.append(
          pa.array(sparse_components.pop(feature_spec.value_key),
                   type=values_type))
      sparse_fields.append(pa.field(feature_spec.value_key, values_type))
      for indices_key, instance_indices in sparse_components.items():
        flat_indices = [np.ravel(indices) for indices in instance_indices]
        sparse_arrays.append(pa.array(flat_indices, type=indices_type))
        sparse_fields.append(pa.field(indices_key, indices_type))
    else:
      tensors[name] = tensor_or_value
  record_batch = converter.convert(tensors)
  arrow_schema = record_batch.schema
  for field in sparse_fields:
    arrow_schema = arrow_schema.append(field)
  return record_batch.columns + sparse_arrays, arrow_schema
def test_large_list_array_flatten():
    typ2 = pa.large_list(pa.large_list(pa.int16()))
    arr2 = pa.array(
        [None, [[1, None, 2], None, [3, 4]], [], [[], [5, 6], None], [[7, 8]]],
        type=typ2)
    typ1 = pa.large_list(pa.int16())
    assert typ1 == typ2.value_type
    arr1 = pa.array([[1, None, 2], None, [3, 4], [], [5, 6], None, [7, 8]],
                    type=typ1)
    assert arr2.flatten().equals(arr1)
def test_query_large_list_arrays(self, sql, expected_output):
  # Large list of int64 & int32.
  record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1, 2, 3], [4], None, [5], [], [6], [None], [7]],
               type=pa.large_list(pa.int64())),
      pa.array([[10, 20, 30], [40], None, None, [], [], [None], [None]],
               type=pa.large_list(pa.int32())),
  ], ['f1', 'f2'])
  query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
  slices = query.Execute(record_batch)
  self.assertEqual(slices, expected_output)
def _GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    int_type = pa.large_list(pa.int64())
    float_type = pa.large_list(pa.float32())
    bytes_type = pa.large_list(pa.large_binary())
  else:
    int_type = pa.list_(pa.int64())
    float_type = pa.list_(pa.float32())
    bytes_type = pa.list_(pa.binary())
  return {
      "int_feature": pa.array([[1], [2]], type=int_type),
      "float_feature": pa.array([[2.0], [3.0]], type=float_type),
      "string_feature": pa.array([[b"abc"], [b"xyz"]], type=bytes_type),
  }
def _GetFeatureTypeToArrowTypeMapping(
    large_types: bool) -> Dict[int, pa.DataType]:
  if large_types:
    return {
        ColumnType.UNKNOWN: pa.null(),
        ColumnType.INT: pa.large_list(pa.int64()),
        ColumnType.FLOAT: pa.large_list(pa.float32()),
        ColumnType.STRING: pa.large_list(pa.large_binary())
    }
  return {
      ColumnType.UNKNOWN: pa.null(),
      ColumnType.INT: pa.list_(pa.int64()),
      ColumnType.FLOAT: pa.list_(pa.float32()),
      ColumnType.STRING: pa.list_(pa.binary())
  }
def _ValidateRecordBatch(self,
                         tfxio,
                         record_batch,
                         raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[field.name]))
  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
def _make_record_batch(num_cols, num_rows):
  columns = [
      pa.array([[b"kk"]] * num_rows, type=pa.large_list(pa.large_binary()))
      for _ in range(num_cols)
  ]
  column_names = ["col%d" % c for c in range(num_cols)]
  return pa.record_batch(columns, column_names)
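# Quick check (not part of the original test module, shown here only as an
# illustration): a 2-column, 3-row batch of single-element
# large_list<large_binary> values.
rb = _make_record_batch(num_cols=2, num_rows=3)
assert rb.num_columns == 2 and rb.num_rows == 3
assert rb.schema.field("col0").type == pa.large_list(pa.large_binary())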
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.large_string(),
        pa.large_binary(),
        pa.list_(pa.int32()),
        pa.large_list(pa.uint16()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string()),
    )
def test_csv_to_recordbatch_schema_features_subset_of_column_names(self):
  input_lines = ['1,2.0,hello', '5,12.34,world']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  schema = text_format.Parse(
      """feature { name: "int_feature" type: INT }""", schema_pb2.Schema())
  self.assertEqual(
      csv_decoder.GetArrowSchema(column_names, schema),
      pa.schema([pa.field('int_feature', pa.large_list(pa.int64()))]))

  def _check_record_batches(record_batches):
    self.assertLen(record_batches, 1)
    self.assertTrue(record_batches[0].equals(
        pa.RecordBatch.from_arrays(
            [pa.array([[1], [5]], pa.large_list(pa.int64()))],
            ['int_feature'])))

  with beam.Pipeline() as p:
    record_batches = (
        p
        | 'CreatingPColl' >> beam.Create(input_lines, reshuffle=False)
        | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
            column_names=column_names,
            delimiter=',',
            desired_batch_size=1000,
            schema=schema))
    beam_test_util.assert_that(
        record_batches, _check_record_batches, label='check_record_batches')
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
    strict: bool = True,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)
        if dtype == Date32:
            pyseries = pyseries.cast(str(pl.Date32), True)
        elif dtype == Date64:
            pyseries = pyseries.cast(str(pl.Date64), True)
        return pyseries
    else:
        value = _get_first_non_none(values)
        dtype_ = type(value) if value is not None else float

        if dtype_ == date or dtype_ == datetime:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
                )
            return arrow_to_pyseries(name, pa.array(values))
        elif dtype_ == list or dtype_ == tuple or dtype_ == pl.Series:
            nested_value = _get_first_non_none(value)
            nested_dtype = type(nested_value) if value is not None else float
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    f"'pyarrow' is required for converting a Sequence of {nested_dtype} to a PySeries."
                )
            try:
                nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
            except ValueError as e:
                raise ValueError(
                    f"Cannot construct Series from sequence of {nested_dtype}."
                ) from e
            try:
                arrow_values = pa.array(values, pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            # failure expected for mixed sequences like `[[12], "foo", 9]`
            except pa.lib.ArrowInvalid:
                return PySeries.new_object(name, values, strict)
        else:
            constructor = py_type_to_constructor(dtype_)
            return constructor(name, values, strict)
def _AssertFn(record_batches):
  self.assertLen(record_batches, 1)
  record_batch = record_batches[0]
  self.assertEqual(record_batch.num_rows, 1)
  self.assertEqual(record_batch.num_columns, 1)
  self.assertTrue(record_batch[0].equals(
      pa.array([[123]], type=pa.large_list(pa.int64()))))
def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 2)
  expected_schema = _EXPECTED_ARROW_SCHEMA
  if raw_record_column_name is not None:
    expected_schema = pa.schema(
        list(expected_schema) +
        [pa.field(raw_record_column_name, pa.large_list(pa.large_binary()))])
  self.assertTrue(
      record_batch.schema.equals(expected_schema),
      "Expected: {} ; got {}".format(expected_schema, record_batch.schema))
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(_EXPECTED_COLUMN_VALUES[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            _EXPECTED_COLUMN_VALUES[field.name]))
  if raw_record_column_name is not None:
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _RAW_RECORDS)
def _LargeListCanBeConvertedToPandas() -> bool:
  """Returns True if a large_list can be converted to a pd.Series."""
  try:
    pa.array([], type=pa.large_list(pa.int32())).to_pandas()
  except:  # pylint:disable=bare-except
    return False
  return True
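# Minimal usage sketch (not part of the original module), assuming only
# pyarrow and pandas are installed: the probe above checks whether the
# installed pyarrow can turn a large_list array into a pd.Series.
if _LargeListCanBeConvertedToPandas():
  series = pa.array([[1, 2], None, [3]],
                    type=pa.large_list(pa.int32())).to_pandas()
  # On pyarrow versions that support the conversion, the null row is kept
  # as None in the resulting object-dtype Series.
  assert list(series[0]) == [1, 2]
  assert series[1] is None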
def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    if field.name == _SEQUENCE_COLUMN_NAME:
      self.assertTrue(pa.types.is_struct(field.type))
      for seq_column, seq_field in zip(
          record_batch.column(i).flatten(), list(field.type)):
        expected_array = _EXPECTED_COLUMN_VALUES[path.ColumnPath(
            [_SEQUENCE_COLUMN_NAME, seq_field.name])]
        self.assertTrue(
            seq_column.equals(expected_array),
            "Sequence column {} did not match ({} vs {})".format(
                seq_field.name, seq_column, expected_array))
      continue
    self.assertTrue(
        record_batch.column(i).equals(
            _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]))
  if raw_record_column_name is not None:
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(record_batch.columns[-1].type.equals(
        pa.large_list(pa.large_binary())))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
def testRecordBatchAndTensorAdapter(self):
  column_name = "raw_record"
  telemetry_descriptors = ["some", "component"]
  tfxio = raw_tf_record.RawTfRecordTFXIO(
      self._raw_record_file,
      column_name,
      telemetry_descriptors=telemetry_descriptors)
  expected_type = (
      pa.large_list(pa.large_binary())
      if _ProducesLargeTypes(tfxio) else pa.list_(pa.binary()))

  got_schema = tfxio.ArrowSchema()
  self.assertTrue(
      got_schema.equals(pa.schema([pa.field(column_name, expected_type)])),
      "got: {}".format(got_schema))

  def _AssertFn(record_batches):
    self.assertLen(record_batches, 1)
    record_batch = record_batches[0]
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    self.assertTrue(record_batch.columns[0].equals(
        pa.array([[r] for r in _RAW_RECORDS], type=expected_type)))
    tensor_adapter = tfxio.TensorAdapter()
    tensors = tensor_adapter.ToBatchTensors(record_batch)
    self.assertLen(tensors, 1)
    self.assertIn(column_name, tensors)

  p = beam.Pipeline()
  record_batch_pcoll = p | tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
  beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      telemetry_descriptors, "bytes",
                                      "tfrecords_gzip")
def testIsBinaryLike(self):
  for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
    self.assertTrue(arrow_util.is_binary_like(t))
  for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
    self.assertFalse(arrow_util.is_binary_like(t))
def coerce_arrow(array: pa.Array) -> pa.Array:
    # Also coerces timezone-aware datetimes to a naive representation;
    # units are accounted for by pyarrow.
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())
    # The simplest solution is to cast to (large)-string arrays;
    # this is a copy and expensive.
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pa.large_list() requires a value type; reuse the list's value type.
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
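# Usage sketch (not from the polars source), assuming a recent pyarrow where
# these casts are implemented: a dictionary-encoded string array is rewritten
# as plain large_utf8, and a multi-chunk list column becomes a large_list.
dict_arr = pa.array(["a", "b", "a"]).dictionary_encode()
assert coerce_arrow(dict_arr).type == pa.large_utf8()

chunked = pa.chunked_array([pa.array([[1, 2]]), pa.array([[3]])])
assert coerce_arrow(chunked).type == pa.large_list(pa.int64())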
def _AssertFn(record_batch_list):
  self.assertLen(record_batch_list, 1)
  record_batch = record_batch_list[0]
  expected_arrow_schema = pa.schema([
      pa.field("int_feature", pa.large_list(pa.int64())),
  ])
  self._ValidateRecordBatch(record_batch, expected_arrow_schema)
def _make_2d_generic_sparse_tensor_test_cases():
  result = []
  for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
    if tf_type == tf.string:
      values = tf.constant([b"1", b"2", b"3", b"4"], dtype=tf.string)
      expected_value_array = pa.array([[b"1", b"2", b"3"], [], [b"4"], []],
                                      type=pa.large_list(arrow_type))
    else:
      values = tf.constant([1, 2, 3, 4], dtype=tf_type)
      expected_value_array = pa.array([[1, 2, 3], [], [4], []],
                                      type=pa.large_list(arrow_type))
    result.append(
        dict(
            testcase_name="2d_generic_sparse_tensor_%s" % tf_type.name,
            type_specs={"sp1": tf.SparseTensorSpec([None, 5], tf_type)},
            expected_schema={
                "sp1$values": pa.large_list(arrow_type),
                "sp1$index0": pa.large_list(pa.int64()),
            },
            expected_tensor_representations={
                "sp1": """sparse_tensor {
                            dense_shape { dim { size: 5 } }
                            value_column_name: "sp1$values"
                            index_column_names: "sp1$index0"
                          }""",
            },
            tensor_input={
                "sp1":
                    tf.SparseTensor(
                        values=values,
                        indices=[[0, 0], [0, 2], [0, 4], [2, 1]],
                        dense_shape=[4, 5]),
            },
            expected_record_batch={
                "sp1$values": expected_value_array,
                "sp1$index0": pa.array([[0, 2, 4], [], [1], []],
                                       type=pa.large_list(pa.int64())),
            },
            options=tensor_to_arrow.TensorsToRecordBatchConverter.Options(
                generic_sparse_tensor_names=frozenset({"sp1"}))))
  return result
def _GetExpectedArrowSchema(tfxio, raw_record_column_name=None):
  if tfxio._can_produce_large_types:
    int_type = pa.large_list(pa.int64())
    float_type = pa.large_list(pa.float32())
    bytes_type = pa.large_list(pa.large_binary())
  else:
    int_type = pa.list_(pa.int64())
    float_type = pa.list_(pa.float32())
    bytes_type = pa.list_(pa.binary())
  fields = [
      pa.field("int_feature", int_type),
      pa.field("float_feature", float_type),
      pa.field("string_feature", bytes_type)
  ]
  if raw_record_column_name is not None:
    fields.append(pa.field(raw_record_column_name, bytes_type))
  return pa.schema(fields)
def arrow_fields(self) -> List[pa.Field]:
  # TODO(b/159717195): clean up protected-access
  # pylint: disable=protected-access
  arrow_type = _tf_dtype_to_arrow_type(self._type_spec._dtype)
  for _ in range(self._type_spec._ragged_rank):
    arrow_type = pa.large_list(arrow_type)
  return [pa.field(self._tensor_name, arrow_type)]
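# Illustration (hypothetical, not part of the original class): for an int64
# ragged spec with ragged_rank 2, the loop above nests one large_list per
# ragged dimension, yielding large_list<large_list<int64>>.
arrow_type = pa.int64()
for _ in range(2):
  arrow_type = pa.large_list(arrow_type)
assert arrow_type == pa.large_list(pa.large_list(pa.int64()))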
def GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    int_type = pa.large_list(pa.int64())
    float_type = pa.large_list(pa.float32())
    bytes_type = pa.large_list(pa.large_binary())
  else:
    int_type = pa.list_(pa.int64())
    float_type = pa.list_(pa.float32())
    bytes_type = pa.list_(pa.binary())
  return {
      "int_feature": pa.array([[1], [2], [3]], type=int_type),
      "float_feature": pa.array([[1, 2, 3, 4], [2, 3, 4, 5], [4, 5, 6, 7]],
                                type=float_type),
      "string_feature": pa.array([None, ["foo", "bar"], None],
                                 type=bytes_type),
  }
def test_simple(self, attach_raw_records):
  raw_record_column_name = "_raw_records" if attach_raw_records else None
  tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
      self._input_path,
      self._decoder_path,
      _TELEMETRY_DESCRIPTORS,
      raw_record_column_name=raw_record_column_name)
  expected_fields = [
      pa.field("st1", pa.list_(pa.binary())),
      pa.field("st2", pa.list_(pa.binary())),
  ]
  if attach_raw_records:
    raw_record_column_type = (
        pa.large_list(pa.large_binary())
        if tfxio._can_produce_large_types else pa.list_(pa.binary()))
    expected_fields.append(
        pa.field(raw_record_column_name, raw_record_column_type))
  self.assertTrue(
      tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
      tfxio.ArrowSchema())
  self.assertEqual(
      tfxio.TensorRepresentations(), {
          "st1":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st1" }""",
                  schema_pb2.TensorRepresentation()),
          "st2":
              text_format.Parse(
                  """varlen_sparse_tensor { column_name: "st2" }""",
                  schema_pb2.TensorRepresentation())
      })

  tensor_adapter = tfxio.TensorAdapter()
  self.assertEqual(tensor_adapter.TypeSpecs(),
                   _DecoderForTesting().output_type_specs())

  def _assert_fn(list_of_rb):
    self.assertLen(list_of_rb, 1)
    rb = list_of_rb[0]
    self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
    tensors = tensor_adapter.ToBatchTensors(rb)
    self.assertLen(tensors, 2)
    for tensor_name in ("st1", "st2"):
      self.assertIn(tensor_name, tensors)
      st = tensors[tensor_name]
      self.assertAllEqual(st.values, _RECORDS)
      self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
      self.assertAllEqual(st.dense_shape, [2, 1])

  p = beam.Pipeline()
  rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
  beam_testing_util.assert_that(rb_pcoll, _assert_fn)
  pipeline_result = p.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      _TELEMETRY_DESCRIPTORS, "tensor",
                                      "tfrecords_gzip")