Example #1
def _make_2d_varlen_sparse_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            values = tf.constant([b"1", b"2", b"3"], dtype=tf.string)
            expected_array = pa.array([[b"1"], [], [b"2", b"3"], []],
                                      type=pa.large_list(arrow_type))
        else:
            values = tf.constant([1, 2, 3], dtype=tf_type)
            expected_array = pa.array([[1], [], [2, 3], []],
                                      type=pa.large_list(arrow_type))
        result.append(
            dict(testcase_name="2d_varlen_sparse_tensor_%s" % tf_type.name,
                 type_specs={"sp": tf.SparseTensorSpec([None, None], tf_type)},
                 expected_schema={"sp": pa.large_list(arrow_type)},
                 expected_tensor_representations={
                     "sp": """varlen_sparse_tensor { column_name: "sp" }""",
                 },
                 tensor_input={
                     "sp":
                     tf.SparseTensor(values=values,
                                     indices=[[0, 0], [2, 0], [2, 1]],
                                     dense_shape=[4, 2]),
                 },
                 expected_record_batch={"sp": expected_array}))
    return result
Example #2
def arrow_fields(self) -> List[pa.Field]:
    return ([
        pa.field(self._value_column_name,
                 pa.large_list(self._values_arrow_type))
    ] + [
        pa.field(n, pa.large_list(pa.int64()))
        for n in self._index_column_names
    ])
Example #3
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()

    with pytest.raises(TypeError):
        pa.large_list(None)
Example #4
def _make_2d_dense_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]], dtype=tf.string)
            expected_array = pa.array([[b"1", b"2"], [b"3", b"4"]],
                                      type=pa.large_list(arrow_type))
        else:
            tensor = tf.constant([[1, 2], [3, 4]], dtype=tf_type)
            expected_array = pa.array([[1, 2], [3, 4]],
                                      type=pa.large_list(arrow_type))
        result.append(
            dict(testcase_name="2d_dense_tensor_%s" % tf_type.name,
                 type_specs={"dt": tf.TensorSpec([None, 2], tf_type)},
                 expected_schema={"dt": pa.large_list(arrow_type)},
                 expected_tensor_representations={
                     "dt":
                     """dense_tensor {
                         column_name: "dt"
                         shape { dim { size: 2} }
                       }""",
                 },
                 tensor_input={"dt": tensor},
                 expected_record_batch={"dt": expected_array}))
    return result
Example #5
def test_nested_large_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.large_list(pa.int16())
    assert arr.to_pylist() == data
Example #6
def test_large_list_type():
    ty = pa.large_list(pa.utf8())
    assert isinstance(ty, pa.LargeListType)
    assert ty.value_type == pa.utf8()
    assert ty.value_field == pa.field("item", pa.utf8(), nullable=True)

    with pytest.raises(TypeError):
        pa.large_list(None)
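The only difference between pa.large_list and pa.list_ is the offset width (64-bit vs. 32-bit). A minimal sketch, not taken from the examples above, that makes this visible through the offsets buffer:

import pyarrow as pa

small = pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))
large = pa.array([[1, 2], [3]], type=pa.large_list(pa.int64()))

# list_ stores int32 offsets, large_list stores int64 offsets;
# the logical values are identical.
assert small.offsets.type == pa.int32()
assert large.offsets.type == pa.int64()
assert small.to_pylist() == large.to_pylist()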
Example #7
def convert_to_arrow(
    schema: schema_pb2.Schema,
    converter: tensor_to_arrow.TensorsToRecordBatchConverter,
    fetches: Dict[str, common_types.TensorValueType]
) -> Tuple[List[pa.Array], pa.Schema]:
    """Converts fetches to a list of pyarrow arrays and schema.

    Maps the values fetched by `tf.Session.run` or returned by a tf.function to
    pyarrow format.

    Args:
      schema: A `Schema` proto.
      converter: A `tf.Tensor` to `pa.RecordBatch` converter that contains
        `tf.TypeSpec`s of `FixedLen` and `VarLen` features. Note that the
        converter doesn't support general `SparseFeature`s; they are handled
        here instead.
      fetches: A dict representing a batch of data, either as returned by
        `Session.run` or eager tensors.

    Returns:
      A tuple of a list of pyarrow arrays and a schema representing fetches.

    Raises:
      ValueError: If batch sizes are inconsistent.
    """

    tensors = {}
    sparse_arrays = []
    sparse_fields = []
    feature_specs = schema_utils.schema_as_feature_spec(schema).feature_spec
    for name, tensor_or_value in fetches.items():
        feature_spec = feature_specs[name]
        if isinstance(feature_spec, tf.io.SparseFeature):
            sparse_components = _handle_sparse_batch(tensor_or_value,
                                                     feature_spec, name)
            values_type = pa.large_list(
                _tf_dtype_to_arrow_type(feature_spec.dtype))
            indices_type = pa.large_list(pa.int64())
            sparse_arrays.append(
                pa.array(sparse_components.pop(feature_spec.value_key),
                         type=values_type))
            sparse_fields.append(pa.field(feature_spec.value_key, values_type))
            for indices_key, instance_indices in sparse_components.items():
                flat_indices = [
                    np.ravel(indices) for indices in instance_indices
                ]
                sparse_arrays.append(pa.array(flat_indices, type=indices_type))
                sparse_fields.append(pa.field(indices_key, indices_type))
        else:
            tensors[name] = tensor_or_value
    record_batch = converter.convert(tensors)
    arrow_schema = record_batch.schema
    for field in sparse_fields:
        arrow_schema = arrow_schema.append(field)

    return record_batch.columns + sparse_arrays, arrow_schema
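For context, a minimal pyarrow-only sketch of the layout this produces for one SparseFeature (the column names below are illustrative, not from the source): the values go into a large_list of the feature's dtype, and each flattened index component into a large_list(int64()) column appended after the converter's columns.

import pyarrow as pa

# Illustrative per-instance values and flattened indices of one sparse feature.
values = pa.array([[1.0], [], [2.0, 3.0]], type=pa.large_list(pa.float32()))
index0 = pa.array([[0], [], [1, 4]], type=pa.large_list(pa.int64()))

# The corresponding fields would be appended to the converter's schema
# in the same order as the arrays.
batch = pa.record_batch([values, index0], names=["sf$values", "sf$index0"])
print(batch.schema)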
Example #8
def test_large_list_array_flatten():
    typ2 = pa.large_list(pa.large_list(pa.int16()))
    arr2 = pa.array(
        [None, [[1, None, 2], None, [3, 4]], [], [[], [5, 6], None], [[7, 8]]],
        type=typ2)

    typ1 = pa.large_list(pa.int16())
    assert typ1 == typ2.value_type
    arr1 = pa.array([[1, None, 2], None, [3, 4], [], [5, 6], None, [7, 8]],
                    type=typ1)

    assert arr2.flatten().equals(arr1)
Example #9
    def test_query_large_list_arrays(self, sql, expected_output):
        # Large list of int32 & int64.
        record_batch = pa.RecordBatch.from_arrays([
            pa.array([[1, 2, 3], [4], None, [5], [], [6], [None], [7]],
                     type=pa.large_list(pa.int64())),
            pa.array([[10, 20, 30], [40], None, None, [], [], [None], [None]],
                     type=pa.large_list(pa.int32())),
        ], ['f1', 'f2'])

        query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
        slices = query.Execute(record_batch)

        self.assertEqual(slices, expected_output)
Example #10
def _GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    int_type = pa.large_list(pa.int64())
    float_type = pa.large_list(pa.float32())
    bytes_type = pa.large_list(pa.large_binary())
  else:
    int_type = pa.list_(pa.int64())
    float_type = pa.list_(pa.float32())
    bytes_type = pa.list_(pa.binary())

  return {
      "int_feature": pa.array([[1], [2]], type=int_type),
      "float_feature": pa.array([[2.0], [3.0]], type=float_type),
      "string_feature": pa.array([[b"abc"], [b"xyz"]], type=bytes_type),
  }
Example #11
def _GetFeatureTypeToArrowTypeMapping(
    large_types: bool) -> Dict[int, pa.DataType]:
  if large_types:
    return {
        ColumnType.UNKNOWN: pa.null(),
        ColumnType.INT: pa.large_list(pa.int64()),
        ColumnType.FLOAT: pa.large_list(pa.float32()),
        ColumnType.STRING: pa.large_list(pa.large_binary())
    }
  return {
      ColumnType.UNKNOWN: pa.null(),
      ColumnType.INT: pa.list_(pa.int64()),
      ColumnType.FLOAT: pa.list_(pa.float32()),
      ColumnType.STRING: pa.list_(pa.binary())
  }
Example #12
    def _ValidateRecordBatch(self,
                             tfxio,
                             record_batch,
                             raw_record_column_name=None):
        self.assertIsInstance(record_batch, pa.RecordBatch)
        self.assertEqual(record_batch.num_rows, 3)
        expected_column_values = GetExpectedColumnValues(tfxio)
        for i, field in enumerate(record_batch.schema):
            if field.name == raw_record_column_name:
                continue
            self.assertTrue(
                record_batch.column(i).equals(
                    expected_column_values[field.name]),
                "Column {} did not match ({} vs {}).".format(
                    field.name, record_batch.column(i),
                    expected_column_values[field.name]))

        if raw_record_column_name is not None:
            if tfxio._can_produce_large_types:
                raw_record_column_type = pa.large_list(pa.large_binary())
            else:
                raw_record_column_type = pa.list_(pa.binary())
            self.assertEqual(record_batch.schema.names[-1],
                             raw_record_column_name)
            self.assertTrue(
                record_batch.columns[-1].type.equals(raw_record_column_type))
            self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                             _SERIALIZED_EXAMPLES)
Example #13
def _make_record_batch(num_cols, num_rows):
    columns = [
        pa.array([[b"kk"]] * num_rows, type=pa.large_list(pa.large_binary()))
        for _ in range(num_cols)
    ]
    column_names = ["col%d" % c for c in range(num_cols)]
    return pa.record_batch(columns, column_names)
Example #14
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #15
def get_many_types():
    # returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.large_string(), pa.large_binary(),
            pa.list_(pa.int32()), pa.large_list(pa.uint16()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_DENSE),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))
Example #16
    def test_csv_to_recordbatch_schema_features_subset_of_column_names(self):
        input_lines = ['1,2.0,hello', '5,12.34,world']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        schema = text_format.Parse(
            """feature { name: "int_feature" type: INT }""",
            schema_pb2.Schema())
        self.assertEqual(
            csv_decoder.GetArrowSchema(column_names, schema),
            pa.schema([pa.field('int_feature', pa.large_list(pa.int64()))]))

        def _check_record_batches(record_batches):
            self.assertLen(record_batches, 1)
            self.assertTrue(record_batches[0].equals(
                pa.RecordBatch.from_arrays(
                    [pa.array([[1], [5]], pa.large_list(pa.int64()))],
                    ['int_feature'])))

        with beam.Pipeline() as p:
            record_batches = (
                p
                | 'CreatingPColl' >> beam.Create(input_lines, reshuffle=False)
                | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
                    column_names=column_names,
                    delimiter=',',
                    desired_batch_size=1000,
                    schema=schema))
            beam_test_util.assert_that(record_batches,
                                       _check_record_batches,
                                       label='check_record_batches')
Example #17
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
    strict: bool = True,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)
        if dtype == Date32:
            pyseries = pyseries.cast(str(pl.Date32), True)
        elif dtype == Date64:
            pyseries = pyseries.cast(str(pl.Date64), True)
        return pyseries

    else:
        value = _get_first_non_none(values)
        dtype_ = type(value) if value is not None else float

        if dtype_ == date or dtype_ == datetime:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
                )
            return arrow_to_pyseries(name, pa.array(values))

        elif dtype_ == list or dtype_ == tuple or dtype_ == pl.Series:
            nested_value = _get_first_non_none(value)
            nested_dtype = type(nested_value) if value is not None else float

            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    f"'pyarrow' is required for converting a Sequence of {nested_dtype} to a PySeries."
                )

            try:
                nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
            except ValueError as e:
                raise ValueError(
                    f"Cannot construct Series from sequence of {nested_dtype}."
                ) from e

            try:
                arrow_values = pa.array(values,
                                        pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            # failure expected for mixed sequences like `[[12], "foo", 9]`
            except pa.lib.ArrowInvalid:
                return PySeries.new_object(name, values, strict)

        else:
            constructor = py_type_to_constructor(dtype_)
            return constructor(name, values, strict)
Example #18
def _AssertFn(record_batches):
    self.assertLen(record_batches, 1)
    record_batch = record_batches[0]
    self.assertEqual(record_batch.num_rows, 1)
    self.assertEqual(record_batch.num_columns, 1)
    self.assertTrue(record_batch[0].equals(
        pa.array([[123]], type=pa.large_list(pa.int64()))))
Example #19
  def _ValidateRecordBatch(self,
                           record_batch,
                           raw_record_column_name=None):
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 2)
    expected_schema = _EXPECTED_ARROW_SCHEMA
    if raw_record_column_name is not None:
      expected_schema = pa.schema(
          list(expected_schema) +
          [pa.field(raw_record_column_name, pa.large_list(pa.large_binary()))])
    self.assertTrue(
        record_batch.schema.equals(expected_schema),
        "Expected: {} ; got {}".format(expected_schema,
                                       record_batch.schema))
    for i, field in enumerate(record_batch.schema):
      if field.name == raw_record_column_name:
        continue
      self.assertTrue(
          record_batch.column(i).equals(_EXPECTED_COLUMN_VALUES[field.name]),
          "Column {} did not match ({} vs {}).".format(
              field.name, record_batch.column(i),
              _EXPECTED_COLUMN_VALUES[field.name]))

    if raw_record_column_name is not None:
      self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
      self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                       _RAW_RECORDS)
Example #20
def _LargeListCanBeConvertedToPandas() -> bool:
  """Returns True if a large_list can be converted to a pd.Series."""
  try:
    pa.array([], type=pa.large_list(pa.int32())).to_pandas()
  except:  # pylint:disable=bare-except
    return False
  return True
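A hedged sketch of how such a capability check could be used to pick an offset width; the selection logic below is illustrative and not part of the example above.

import pyarrow as pa

# Fall back to 32-bit-offset lists if large_list -> pandas is unsupported.
list_factory = pa.large_list if _LargeListCanBeConvertedToPandas() else pa.list_
int_feature_type = list_factory(pa.int64())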
Example #21
    def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
        self.assertIsInstance(record_batch, pa.RecordBatch)
        self.assertEqual(record_batch.num_rows, 3)
        for i, field in enumerate(record_batch.schema):
            if field.name == raw_record_column_name:
                continue
            if field.name == _SEQUENCE_COLUMN_NAME:
                self.assertTrue(pa.types.is_struct(field.type))
                for seq_column, seq_field in zip(
                        record_batch.column(i).flatten(), list(field.type)):
                    expected_array = _EXPECTED_COLUMN_VALUES[path.ColumnPath(
                        [_SEQUENCE_COLUMN_NAME, seq_field.name])]
                    self.assertTrue(
                        seq_column.equals(expected_array),
                        "Sequence column {} did not match ({} vs {})".format(
                            seq_field.name, seq_column, expected_array))
                continue
            self.assertTrue(
                record_batch.column(i).equals(
                    _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]),
                "Column {} did not match ({} vs {}).".format(
                    field.name, record_batch.column(i),
                    _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]))

        if raw_record_column_name is not None:
            self.assertEqual(record_batch.schema.names[-1],
                             raw_record_column_name)
            self.assertTrue(record_batch.columns[-1].type.equals(
                pa.large_list(pa.large_binary())))
            self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                             _SERIALIZED_EXAMPLES)
Example #22
    def testRecordBatchAndTensorAdapter(self):
        column_name = "raw_record"
        telemetry_descriptors = ["some", "component"]
        tfxio = raw_tf_record.RawTfRecordTFXIO(
            self._raw_record_file,
            column_name,
            telemetry_descriptors=telemetry_descriptors)
        expected_type = (pa.large_list(pa.large_binary()) if
                         _ProducesLargeTypes(tfxio) else pa.list_(pa.binary()))

        got_schema = tfxio.ArrowSchema()
        self.assertTrue(
            got_schema.equals(pa.schema([pa.field(column_name,
                                                  expected_type)])),
            "got: {}".format(got_schema))

        def _AssertFn(record_batches):
            self.assertLen(record_batches, 1)
            record_batch = record_batches[0]
            self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
            self.assertTrue(record_batch.columns[0].equals(
                pa.array([[r] for r in _RAW_RECORDS], type=expected_type)))
            tensor_adapter = tfxio.TensorAdapter()
            tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(tensors, 1)
            self.assertIn(column_name, tensors)

        p = beam.Pipeline()
        record_batch_pcoll = p | tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            telemetry_descriptors, "bytes",
                                            "tfrecords_gzip")
Example #23
    def testIsBinaryLike(self):
        for t in (pa.binary(), pa.large_binary(), pa.string(),
                  pa.large_string()):
            self.assertTrue(arrow_util.is_binary_like(t))

        for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
            self.assertFalse(arrow_util.is_binary_like(t))
Example #24
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # simplest solution is to cast to (large)-string arrays
    # this is copy and expensive
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pa.large_list() requires a value type; keep the existing one.
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
Example #25
def _AssertFn(record_batch_list):
    self.assertLen(record_batch_list, 1)
    record_batch = record_batch_list[0]
    expected_arrow_schema = pa.schema([
        pa.field("int_feature", pa.large_list(pa.int64())),
    ])
    self._ValidateRecordBatch(record_batch, expected_arrow_schema)
Example #26
def _make_2d_generic_sparse_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            values = tf.constant([b"1", b"2", b"3", b"4"], dtype=tf.string)
            expected_value_array = pa.array(
                [[b"1", b"2", b"3"], [], [b"4"], []],
                type=pa.large_list(arrow_type))
        else:
            values = tf.constant([1, 2, 3, 4], dtype=tf_type)
            expected_value_array = pa.array([[1, 2, 3], [], [4], []],
                                            type=pa.large_list(arrow_type))
        result.append(
            dict(testcase_name="2d_generic_sparse_tensor_%s" % tf_type.name,
                 type_specs={"sp1": tf.SparseTensorSpec([None, 5], tf_type)},
                 expected_schema={
                     "sp1$values": pa.large_list(arrow_type),
                     "sp1$index0": pa.large_list(pa.int64()),
                 },
                 expected_tensor_representations={
                     "sp1":
                     """sparse_tensor {
                dense_shape {
                  dim {
                    size: 5
                  }
                }
                value_column_name: "sp1$values"
                index_column_names: "sp1$index0"
                }""",
                 },
                 tensor_input={
                     "sp1":
                     tf.SparseTensor(values=values,
                                     indices=[[0, 0], [0, 2], [0, 4], [2, 1]],
                                     dense_shape=[4, 5]),
                 },
                 expected_record_batch={
                     "sp1$values":
                     expected_value_array,
                     "sp1$index0":
                     pa.array([[0, 2, 4], [], [1], []],
                              type=pa.large_list(pa.int64())),
                 },
                 options=tensor_to_arrow.TensorsToRecordBatchConverter.Options(
                     generic_sparse_tensor_names=frozenset({"sp1"}))))
    return result
Example #27
def _GetExpectedArrowSchema(tfxio, raw_record_column_name=None):
  if tfxio._can_produce_large_types:
    int_type = pa.large_list(pa.int64())
    float_type = pa.large_list(pa.float32())
    bytes_type = pa.large_list(pa.large_binary())
  else:
    int_type = pa.list_(pa.int64())
    float_type = pa.list_(pa.float32())
    bytes_type = pa.list_(pa.binary())
  fields = [
      pa.field("int_feature", int_type),
      pa.field("float_feature", float_type),
      pa.field("string_feature", bytes_type)
  ]
  if raw_record_column_name is not None:
    fields.append(pa.field(raw_record_column_name, bytes_type))
  return pa.schema(fields)
Example #28
def arrow_fields(self) -> List[pa.Field]:
    # TODO(b/159717195): clean up protected-access
    # pylint: disable=protected-access
    arrow_type = _tf_dtype_to_arrow_type(self._type_spec._dtype)
    for _ in range(self._type_spec._ragged_rank):
        arrow_type = pa.large_list(arrow_type)
    return [pa.field(self._tensor_name, arrow_type)]
Example #29
def GetExpectedColumnValues(tfxio):
    if tfxio._can_produce_large_types:
        int_type = pa.large_list(pa.int64())
        float_type = pa.large_list(pa.float32())
        bytes_type = pa.large_list(pa.large_binary())
    else:
        int_type = pa.list_(pa.int64())
        float_type = pa.list_(pa.float32())
        bytes_type = pa.list_(pa.binary())

    return {
        "int_feature":
        pa.array([[1], [2], [3]], type=int_type),
        "float_feature":
        pa.array([[1, 2, 3, 4], [2, 3, 4, 5], [4, 5, 6, 7]], type=float_type),
        "string_feature":
        pa.array([None, ["foo", "bar"], None], type=bytes_type),
    }
Example #30
    def test_simple(self, attach_raw_records):
        raw_record_column_name = "_raw_records" if attach_raw_records else None
        tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            self._input_path,
            self._decoder_path,
            _TELEMETRY_DESCRIPTORS,
            raw_record_column_name=raw_record_column_name)
        expected_fields = [
            pa.field("st1", pa.list_(pa.binary())),
            pa.field("st2", pa.list_(pa.binary())),
        ]
        if attach_raw_records:
            raw_record_column_type = (pa.large_list(pa.large_binary())
                                      if tfxio._can_produce_large_types else
                                      pa.list_(pa.binary()))
            expected_fields.append(
                pa.field(raw_record_column_name, raw_record_column_type))
        self.assertTrue(tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
                        tfxio.ArrowSchema())
        self.assertEqual(
            tfxio.TensorRepresentations(), {
                "st1":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st1" }""",
                    schema_pb2.TensorRepresentation()),
                "st2":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st2" }""",
                    schema_pb2.TensorRepresentation())
            })

        tensor_adapter = tfxio.TensorAdapter()
        self.assertEqual(tensor_adapter.TypeSpecs(),
                         _DecoderForTesting().output_type_specs())

        def _assert_fn(list_of_rb):
            self.assertLen(list_of_rb, 1)
            rb = list_of_rb[0]
            self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
            tensors = tensor_adapter.ToBatchTensors(rb)
            self.assertLen(tensors, 2)
            for tensor_name in ("st1", "st2"):
                self.assertIn(tensor_name, tensors)
                st = tensors[tensor_name]
                self.assertAllEqual(st.values, _RECORDS)
                self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
                self.assertAllEqual(st.dense_shape, [2, 1])

        p = beam.Pipeline()
        rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
        beam_testing_util.assert_that(rb_pcoll, _assert_fn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "tensor",
                                            "tfrecords_gzip")