Example #1
    def GetTensor(self, record_batch: pa.RecordBatch,
                  produce_eager_tensors: bool) -> Any:
        values_array = record_batch.column(self._value_column_index)
        values_parent_indices = array_util.GetFlattenedArrayParentIndices(
            values_array)
        indices_arrays = [np.asarray(values_parent_indices)]
        for index_column_index in self._index_column_indices:
            indices_arrays.append(
                np.asarray(record_batch.column(index_column_index).flatten()))
        flat_values_array = values_array.flatten()
        if self._convert_to_binary_fn is not None:
            flat_values_array = self._convert_to_binary_fn(flat_values_array)
        values_np = np.asarray(flat_values_array)
        coo_np = np.empty(shape=(len(values_np), self._coo_size),
                          dtype=np.int64)
        try:
            np.stack(indices_arrays, axis=1, out=coo_np)
        except ValueError as e:
            raise ValueError("Error constructing the COO for SparseTensor. "
                             "number of values: {}; "
                             "size of each index array: {}; "
                             "original error {}.".format(
                                 len(values_np),
                                 [len(i) for i in indices_arrays], e))

        dense_shape = [len(record_batch)] + self._shape

        if produce_eager_tensors:
            return tf.sparse.SparseTensor(
                indices=tf.convert_to_tensor(coo_np),
                dense_shape=tf.convert_to_tensor(dense_shape, dtype=tf.int64),
                values=tf.convert_to_tensor(values_np))
        return tf.compat.v1.SparseTensorValue(indices=coo_np,
                                              dense_shape=dense_shape,
                                              values=values_np)
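
A minimal sketch of the parent-index idea the snippet relies on; the import path for array_util is an assumption (the snippet only shows the call):

import pyarrow as pa
from tfx_bsl.arrow import array_util  # assumed module path

values = pa.array([[10, 11], [], [12]])
# Each flattened value's originating row: [0, 0, 2].
parent_indices = array_util.GetFlattenedArrayParentIndices(values)
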
Example #2
def generate_partial_statistics_in_memory(
    record_batch: pa.RecordBatch, options: stats_options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generates statistics for an in-memory list of examples.

  Args:
    record_batch: Arrow RecordBatch.
    options: Options for generating data statistics.
    stats_generators: A list of combiner statistics generators.

  Returns:
    A list of accumulators containing partial statistics.
  """
  result = []
  if options.feature_whitelist:
    schema = record_batch.schema
    whitelisted_columns = [
        record_batch.column(schema.get_field_index(f))
        for f in options.feature_whitelist
    ]
    record_batch = pa.RecordBatch.from_arrays(whitelisted_columns,
                                              list(options.feature_whitelist))
  for generator in stats_generators:
    result.append(
        generator.add_input(generator.create_accumulator(), record_batch))
  return result
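
A hedged usage sketch, assuming the stats_options module imported by the snippet; MyCombinerGenerator is a hypothetical placeholder for any CombinerStatsGenerator subclass:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1.0], [2.0, 3.0]])], ['feature_a'])
options = stats_options.StatsOptions()  # module as imported by the snippet
# MyCombinerGenerator stands in for any CombinerStatsGenerator subclass.
accumulators = generate_partial_statistics_in_memory(
    batch, options, [MyCombinerGenerator()])
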
Example #3
 def add_input(self, accumulator: List[float],
               examples: pa.RecordBatch) -> List[float]:
   accumulator[0] += examples.num_rows
   if self._weight_feature:
     weights_column = examples.column(
         examples.schema.get_field_index(self._weight_feature))
     accumulator[1] += np.sum(np.asarray(weights_column.flatten()))
   return accumulator
Example #4
def AppendRawRecordColumn(
    record_batch: pa.RecordBatch,
    column_name: Text,
    raw_records: List[bytes],
    produce_large_types: bool,
    record_index_column_name: Optional[Text] = None
) -> pa.RecordBatch:
  """Appends `raw_records` as a new column in `record_batch`.

  Args:
    record_batch: The RecordBatch to append to.
    column_name: The name of the column to be appended.
    raw_records: A list of bytes to be appended.
    produce_large_types: If True, the appended column will be of type
      large_list<large_binary>, otherwise list<binary>.
    record_index_column_name: If not specified, len(raw_records) must equal
      record_batch.num_rows. Otherwise, `record_batch` must contain a
      list_like<integer> column indicating which element in `raw_records`
      is the source of a row in `record_batch`. Specifically,
      record_index_column[i] == [j] means the i-th row came from the j-th
      element in `raw_records`. This column must not contain nulls, and all
      its elements must be single-element lists.

  Returns:
    A new RecordBatch whose last column is the raw record column, with the given name.
  """
  schema = record_batch.schema
  if record_index_column_name is None:
    assert record_batch.num_rows == len(raw_records)
  else:
    record_index_column_index = schema.get_field_index(
        record_index_column_name)
    assert record_index_column_index != -1, (
        "Record index column {} did not exist."
        .format(record_index_column_name))
    record_index_column = record_batch.column(record_index_column_index)
    assert record_index_column.null_count == 0, (
        "Record index column must not contain nulls: {} nulls".format(
            record_index_column.null_count))
    column_type = record_index_column.type
    assert ((pa.types.is_list(column_type) or
             pa.types.is_large_list(column_type)) and
            pa.types.is_integer(column_type.value_type)), (
                "Record index column {} must be of type list_like<integer>, "
                "but got: {}".format(record_index_column_name, column_type))
    record_indices = np.asarray(record_index_column.flatten())
    assert len(record_indices) == len(record_batch), (
        "Record indices must be aligned with the record batch, but got "
        "different lengths: {} vs {}".format(
            len(record_indices), len(record_batch)))
    raw_records = np.asarray(raw_records, dtype=object)[record_indices]
  assert schema.get_field_index(column_name) == -1
  raw_record_column = CreateRawRecordColumn(raw_records, produce_large_types)
  return pa.RecordBatch.from_arrays(
      list(record_batch.columns) + [raw_record_column],
      list(schema.names) + [column_name])
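
A minimal usage sketch for the simple case (no record_index_column_name), using only pyarrow and the function above; the column name '__raw__' is arbitrary:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array([[1], [2, 3]])], ['f1'])
raw = [b'serialized_example_0', b'serialized_example_1']
# Appends a list<binary> (or large_list<large_binary>) column named '__raw__'.
with_raw = AppendRawRecordColumn(
    batch, '__raw__', raw, produce_large_types=False)
assert with_raw.num_columns == 2
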
Example #5
def _fetch_raw_data_column(record_batch: pa.RecordBatch) -> np.ndarray:
  """Fetch the raw data column.

  Args:
    record_batch: An Arrow RecordBatch.

  Returns:
    Raw data column.
  """
  column_index = record_batch.schema.get_field_index(
      constants.ARROW_INPUT_COLUMN)
  assert column_index >= 0, 'Arrow input column not found.'
  return np.asarray(record_batch.column(column_index).flatten())
Example #6
    def GetTensor(self, record_batch: pa.RecordBatch,
                  produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
        column_path = self._path.suffix(1)
        column = record_batch.column(self._column_index)
        column_type = column.type
        if (self._row_partition_dtype ==
                schema_pb2.TensorRepresentation.RowPartitionDType.INT32):
            offsets_dtype = np.int32
        elif (self._row_partition_dtype
              == schema_pb2.TensorRepresentation.RowPartitionDType.INT64
              or self._row_partition_dtype ==
              schema_pb2.TensorRepresentation.RowPartitionDType.UNSPECIFIED):
            offsets_dtype = np.int64
        row_splits = []

        # Get row splits of each level in the record batch.
        while True:
            # TODO(b/156514075): add support for handling slices.
            if column.offset != 0:
                raise ValueError(
                    "This record batch is sliced. We currently do not handle converting"
                    " slices to RaggedTensors.")
            if pa.types.is_struct(column_type):
                column = column.field(column_path.initial_step())
                column_path = column_path.suffix(1)
                column_type = column.type
            elif _IsListLike(column_type):
                row_splits.append(
                    np.asarray(column.offsets, dtype=offsets_dtype))
                column = column.flatten()
                column_type = column.type
            else:
                break

        values = column
        if self._convert_to_binary_fn is not None:
            values = self._convert_to_binary_fn(values)
        values = np.asarray(values)

        if produce_eager_tensors:
            factory = tf.RaggedTensor.from_row_splits
        else:
            factory = tf.compat.v1.ragged.RaggedTensorValue

        result = values
        for row_split in reversed(row_splits):
            result = factory(values=result, row_splits=row_split)

        return result
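
To illustrate how stacking row splits reconstructs nesting, here is a small standalone sketch using the plain TF API (independent of the class above):

import numpy as np
import tensorflow as tf

values = np.array([1, 2, 3, 4])
inner = tf.RaggedTensor.from_row_splits(values, row_splits=[0, 2, 3, 4])
outer = tf.RaggedTensor.from_row_splits(inner, row_splits=[0, 1, 3])
# outer -> [[[1, 2]], [[3], [4]]]: the outermost splits are applied last,
# mirroring the reversed(row_splits) loop in GetTensor.
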
Example #7
  def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Any:
    array = record_batch.column(self._column_index)
    coo_array, dense_shape_array = array_util.CooFromListArray(array)
    dense_shape_np = dense_shape_array.to_numpy()
    values_np = np.asarray(array.flatten())
    coo_np = coo_array.to_numpy().reshape(values_np.size, 2)

    if produce_eager_tensors:
      return tf.sparse.SparseTensor(
          indices=tf.convert_to_tensor(coo_np),
          dense_shape=tf.convert_to_tensor(dense_shape_np),
          values=tf.convert_to_tensor(values_np))
    return tf.compat.v1.SparseTensorValue(
        indices=coo_np, dense_shape=dense_shape_np, values=values_np)
Example #8
def get_weight_feature(input_record_batch: pa.RecordBatch,
                       weight_column: Text) -> np.ndarray:
    """Gets the weight column from the input record batch.

  Args:
    input_record_batch: Input record batch.
    weight_column: Name of the column containing the weight.

  Returns:
    A numpy array containing the weights of the examples in the input
    record_batch.

  Raises:
    ValueError: If the weight feature is not present in the input record_batch
    or is not a valid weight feature (must be of numeric type and have a
    single value for each example).
  """
    weights_field_index = input_record_batch.schema.get_field_index(
        weight_column)
    if weights_field_index < 0:
        raise ValueError('Weight column "{}" not present in the input '
                         'record batch.'.format(weight_column))
    weights = input_record_batch.column(weights_field_index)

    if pa.types.is_null(weights.type):
        raise ValueError(
            'Weight column "{}" cannot be null.'.format(weight_column))
    # Before flattening, check that there is a single value for each example.
    weight_lengths = array_util.ListLengthsFromListArray(weights).to_numpy()
    if not np.all(weight_lengths == 1):
        raise ValueError(
            'Weight column "{}" must have exactly one value in each example.'.
            format(weight_column))
    flat_weights = weights.flatten()
    # Before converting to numpy view, check the type (cannot convert string and
    # binary arrays to numpy view).
    flat_weights_type = flat_weights.type
    if (not pa.types.is_floating(flat_weights_type)
            and not pa.types.is_integer(flat_weights_type)):
        raise ValueError(
            'Weight column "{}" must be of numeric type. Found {}.'.format(
                weight_column, flat_weights_type))
    return np.asarray(flat_weights)
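
A small usage sketch; note that each row must hold exactly one numeric weight value:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2, 3]]), pa.array([[0.5], [2.0]])], ['f1', 'weight'])
weights = get_weight_feature(batch, 'weight')
# weights -> array([0.5, 2.0])
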
Example #9
def _filter_features(
    record_batch: pa.RecordBatch,
    feature_whitelist: List[types.FeatureName]) -> pa.RecordBatch:
  """Removes features that are not whitelisted.

  Args:
    record_batch: Input Arrow RecordBatch.
    feature_whitelist: A list of feature names to whitelist.

  Returns:
    An Arrow RecordBatch containing only the whitelisted features of the input.
  """
  schema = record_batch.schema
  column_names = set(schema.names)
  columns_to_select = []
  column_names_to_select = []
  for feature_name in feature_whitelist:
    if feature_name in column_names:
      columns_to_select.append(
          record_batch.column(schema.get_field_index(feature_name)))
      column_names_to_select.append(feature_name)
  return pa.RecordBatch.from_arrays(columns_to_select, column_names_to_select)
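
A small usage sketch; whitelisted names absent from the batch are silently skipped, and columns not on the whitelist are dropped:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1]]), pa.array([['a']])], ['keep_me', 'drop_me'])
filtered = _filter_features(batch, ['keep_me', 'not_present'])
assert filtered.schema.names == ['keep_me']
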
Example #10
def get_column(record_batch: pa.RecordBatch,
               feature_name: types.FeatureName,
               missing_ok: bool = False) -> Optional[pa.Array]:
    """Get a column by feature name.

  Args:
    record_batch: A pa.RecordBatch.
    feature_name: The name of a feature (column) within record_batch.
    missing_ok: If True, returns None for missing feature names.

  Returns:
    The column with the specified name, or None if missing_ok is True and
    a column with the specified name is missing or more than one exists.

  Raises:
    KeyError: If a column with the specified name is missing or more than
    one exists, and missing_ok is False.
  """
    idx = record_batch.schema.get_field_index(feature_name)
    if idx < 0:
        if missing_ok:
            return None
        raise KeyError('missing column %s' % feature_name)
    return record_batch.column(idx)
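
A usage sketch showing both the strict and the missing_ok behavior:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array([[1], [2]])], ['f1'])
col = get_column(batch, 'f1')                      # the list<int64> column
maybe = get_column(batch, 'f2', missing_ok=True)   # None
# get_column(batch, 'f2') would raise KeyError.
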
Example #11
 def GetTensor(self, record_batch: pa.RecordBatch,
               produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
     column = record_batch.column(self._column_index)
     column = array_util.FillNullLists(column, self._default_fill)
     return self._ListArrayToTensor(column, produce_eager_tensors)
Example #12
 def GetTensor(self, record_batch: pa.RecordBatch,
               produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
     column = record_batch.column(self._column_index)
     return self._ListArrayToTensor(column, produce_eager_tensors)
Example #13
    def feature_value_slicer(
            record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
        """A function that generates sliced record batches.

    The naive approach would be to iterate over each row, identify the slice
    keys for the row, keep track of index ranges for each slice key, and then
    generate an Arrow record batch for each slice key based on those index
    ranges. This would be expensive because the slice keys are identified for
    each row individually, and the feature values would have to be looped over
    (and crossed) when slicing on multiple features. The current approach
    instead generates the slice keys for a batch by performing joins over the
    indices of individual features, and then groups the joined record batch by
    slice key to get the row indices corresponding to each slice.

    Args:
      record_batch: Arrow RecordBatch.

    Yields:
      Sliced record batch (slice_key, record_batch) where record_batch contains
      the rows corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            feature_array = record_batch.column(
                record_batch.schema.get_field_index(feature_name))
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feature_array, True)
            non_missing_values = np.asarray(flattened)
            # Create dataframe with feature value and parent index.
            df = DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.RecordBatchTake(
                       record_batch, pa.array(parent_indices.to_numpy())))
Example #14
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Retrieve a nested array (and optionally example indices) from RecordBatch.

  This function has the same assumption over `record_batch` as
  `enumerate_arrays()` does.

  If the provided path refers to a leaf in the `record_batch`, then a
  "nested_list" will be returned. If the provided path does not refer to a
  leaf, a "struct" will be returned.

  See `enumerate_arrays()` for definition of "nested_list" and "struct".

  Args:
    record_batch: The RecordBatch whose arrays to be visited.
    query_path: The FeaturePath to lookup in the record_batch.
    return_example_indices: Whether to return an additional array containing the
      example indices of the elements in the array corresponding to the
      query_path.
    wrap_flat_struct_in_list: if True, and if the query_path leads to a
      struct<[Ts]> array, it will be wrapped in a list array, where each
      sub-list contains one element. Caller can make use of this option to
      assume this function always returns a list<inner_type>.

  Returns:
    A tuple. The first term is the feature array and the second term is the
    example_indices array for the feature array (i.e., array[i] came from the
    example at row example_indices[i] in the record_batch).

  Raises:
    KeyError: When the query_path is empty, or cannot be found in the
    record_batch and its nested struct arrays.
  """

  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Recursion helper."""
    array_type = array.type
    if not query_path:
      if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
        array = array_util.ToSingletonListArray(array)
      return array, example_indices
    if not pa.types.is_struct(get_innermost_nested_type(array_type)):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a struct<...> or '
                     '(large_)list...<struct<...>>.'.format(
                         query_path, array_type))
    flat_struct_array, parent_indices = flatten_nested(
        array, example_indices is not None)
    flat_indices = None
    if example_indices is not None:
      flat_indices = example_indices[parent_indices]

    step = query_path.steps()[0]
    try:
      child_array = flat_struct_array.field(step)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(step))
    relative_path = types.FeaturePath(query_path.steps()[1:])
    return _recursion_helper(relative_path, child_array, flat_indices)

  if not query_path:
    raise KeyError('query_path must be non-empty.')
  column_name = query_path.steps()[0]
  field_index = record_batch.schema.get_field_index(column_name)
  if field_index < 0:
    raise KeyError('query_path step 0 "{}" not in record batch.'
                   .format(column_name))
  array = record_batch.column(field_index)
  array_path = types.FeaturePath(query_path.steps()[1:])

  example_indices = np.arange(
      record_batch.num_rows) if return_example_indices else None
  return _recursion_helper(array_path, array, example_indices)
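
A hedged usage sketch, assuming the `types.FeaturePath` class referenced by the snippet accepts a list of steps:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[{'x': [1, 2]}], [{'x': [3]}]])], ['f'])
leaf, example_indices = get_array(
    batch, types.FeaturePath(['f', 'x']), return_example_indices=True)
# leaf is the list<int64> array for 'x' (one entry per struct element);
# example_indices maps each entry back to its row in `batch`.
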
Example #15
  def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    if (self._row_partition_dtype ==
        schema_pb2.TensorRepresentation.RowPartitionDType.INT32):
      offsets_dtype = np.int32
    elif (self._row_partition_dtype ==
          schema_pb2.TensorRepresentation.RowPartitionDType.INT64 or
          self._row_partition_dtype ==
          schema_pb2.TensorRepresentation.RowPartitionDType.UNSPECIFIED):
      offsets_dtype = np.int64

    if produce_eager_tensors:
      factory = tf.RaggedTensor.from_row_splits
    else:
      factory = tf.compat.v1.ragged.RaggedTensorValue

    # A RaggedTensor is composed by the following dimensions:
    # [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P]
    #
    # These dimensions belong to different categories:
    # * B: Batch size dimension
    # * D_n: Dimensions specified by the nested structure from the schema and
    # the column path to the values. n >= 1.
    # * P_m: Dimensions specified by the partitions that do not specify a fixed
    # dimension size. m >= 0.
    # * U_p: Dimensions specified by the inner uniform row length partitions
    # that make the inner dimensions fixed. p>=0.

    # Get row splits of each level in the record batch.
    # Store the row splits for the Dn dimensions that store the representation
    # of the nested structure on the dataset schema.
    outer_row_splits = []

    column_path = self._value_path.suffix(1)
    column = record_batch.column(self._column_index)
    column_type = column.type
    # Keep track of an accessor for the parent struct, so we can access other
    # fields required to get future dimensions row splits.
    parent_field_accessor = lambda field: record_batch.column(  # pylint:disable=g-long-lambda
        record_batch.schema.get_field_index(field))

    while True:
      # TODO(b/156514075): add support for handling slices.
      if column.offset != 0:
        raise ValueError(
            "This record batch is sliced. We currently do not handle converting"
            " slices to RaggedTensors.")
      if pa.types.is_struct(column_type):
        parent_column = column
        parent_field_accessor = parent_column.field
        column = column.field(column_path.initial_step())
        column_path = column_path.suffix(1)
        column_type = column.type
      elif _IsListLike(column_type):
        outer_row_splits.append(np.asarray(column.offsets, dtype=offsets_dtype))
        column = column.flatten()
        column_type = column.type
      else:
        break

    # Now that we have stored the row splits for the D_n dimensions, let's
    # start the construction of the RaggedTensor from the inner dimensions to
    # the outermost.

    # Take the values and set the shape for the innermost dimensions (U_p).
    values = column
    if self._convert_to_binary_fn is not None:
      values = self._convert_to_binary_fn(values)
    values = np.asarray(values)
    values = np.reshape(values, self._values_fixed_shape)

    ragged_tensor = values

    # Build the RaggedTensor from the values and the specified partitions.

    # Now iterate from the innermost partitions to the outermost.
    # But first we need to pop the last row split from the outer dimensions (D_n)
    # and scale it given the number of elements in the inner fixed dimensions.
    try:
      outer_last_row_split = _FloorDivide(outer_row_splits.pop(),
                                          self._inferred_dimensions_elements)
    except RuntimeError as e:
      raise ValueError(
          ("The values features lenghts cannot support "
           "the claimed fixed shape {}").format(self._inner_fixed_shape)) from e

    # Keep track of the previous dimension to help build row splits when a
    # uniform row length partition is found.
    prev_dimension = values.shape[0]
    for partition in reversed(self._ragged_partitions):
      if partition.HasField("uniform_row_length"):
        # If a uniform row length partition is found, we need to scale down the
        # last outer dimension row split.
        try:
          outer_last_row_split = _FloorDivide(outer_last_row_split,
                                              partition.uniform_row_length)
        except RuntimeError as e:
          raise ValueError(("The values features lengths cannnot support the "
                            "specified uniform row length of size {}").format(
                                partition.uniform_row_length)) from e

        row_splits = np.arange(
            0,
            prev_dimension + 1,
            partition.uniform_row_length,
            dtype=offsets_dtype)

        ragged_tensor = factory(ragged_tensor, row_splits=row_splits)
        try:
          prev_dimension = _FloorDivide(prev_dimension,
                                        partition.uniform_row_length)
        except RuntimeError as e:
          raise ValueError(
              ("The previous ragged partitions contained {} elements, "
               "which are not valid with the specified uniform row length: {}"
              ).format(prev_dimension, partition.uniform_row_length)) from e

      elif partition.HasField("row_length"):
        row_length_array = parent_field_accessor(partition.row_length)

        # When the outermost dimension specified by the partitions (P_0) comes
        # from an array other than values, we need to update the last
        # dimension row splits defined by the nested structure (D_n) given the
        # offsets of the array.
        outer_last_row_split = np.asarray(
            row_length_array.offsets, dtype=offsets_dtype)

        # Build row splits.
        row_length = np.asarray(row_length_array.flatten())
        row_splits = np.zeros(len(row_length) + 1, dtype=offsets_dtype)
        np.cumsum(row_length, out=row_splits[1:])

        if prev_dimension != row_splits[-1]:
          raise ValueError(
              ("The sum of row lengts provided in '{}' do not match "
               "with previous dimension found {}.").format(
                   partition.row_length, prev_dimension))

        ragged_tensor = factory(ragged_tensor, row_splits=row_splits)
        prev_dimension = len(row_length)

      else:
        raise ValueError("Empty partition found.")

    # Add back the last row split from the outer dimensions (D_n).
    outer_row_splits.append(outer_last_row_split)

    # Apply the outer ragged dimensions to the resulting tensor.
    # Now that the RaggedTensor is built up to the P_0 dimension, we need to
    # specify the row splits for the D_n dimensions.
    for row_split in reversed(outer_row_splits):
      ragged_tensor = factory(ragged_tensor, row_splits=row_split)

    return ragged_tensor
Example #16
    def __call__(self, records: pa.RecordBatch) -> pa.RecordBatch:
        """Transformation logic for HashRedact action.

        Args:
            records (pa.RecordBatch): record batch to transform

        Returns:
            pa.RecordBatch: transformed record batch
        """
        columns = [column for column in self.columns if column in records.schema.names]
        indices = [records.schema.get_field_index(c) for c in columns]
        new_columns = records.columns
        algo = self.hash_algo.lower()
        hashFunc = hashlib.md5
        if algo == "md5":
            hashFunc = hashlib.md5
        elif algo == "sha256":
            hashFunc = hashlib.sha256
        elif algo == "sha512":
            hashFunc = hashlib.sha512
        else:
            raise ValueError(f"Algorithm {algo} is not supported!")
        for i in indices:
            new_columns[i] = pa.array([hashFunc(v.as_py().encode()).hexdigest() for v in records.column(i)])

        new_schema = self.schema(records.schema)
        return pa.RecordBatch.from_arrays(new_columns, schema=new_schema)
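
A standalone sketch of the core redaction step (the surrounding action class with self.columns, self.hash_algo, and self.schema is not shown above):

import hashlib
import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array(['alice', 'bob'])], ['email'])
idx = batch.schema.get_field_index('email')
# Replace each value with its SHA-256 hex digest.
hashed = pa.array(
    [hashlib.sha256(v.as_py().encode()).hexdigest() for v in batch.column(idx)])
redacted = pa.RecordBatch.from_arrays([hashed], ['email'])
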
Example #17
def record_batch_to_tensor_values(
    record_batch: pa.RecordBatch,
    tensor_representations: Optional[Mapping[
        str, schema_pb2.TensorRepresentation]] = None
) -> types.TensorValueMaybeMultiLevelDict:
    """Returns tensor values extracted from given record batch.

  Args:
    record_batch: Record batch to extract features from.
    tensor_representations: Tensor representations to use when extracting the
      features. If a representation is not found for a given column name, a
      default representation will be used where possible, otherwise an exception
      will be raised.

  Returns:
    Features dict.

  Raises:
    ValueError: If a tensor value cannot be determined for a given column in the
    record batch.
  """
    if tensor_representations is None:
        tensor_representations = {}

    def _shape(value: Any) -> List[int]:
        """Returns the shape associated with given value."""
        if hasattr(value, '__len__'):
            return [len(value)] + _shape(value[0]) if value else [len(value)]
        else:
            return []

    features = {}
    updated_tensor_representations = {}
    for i, col in enumerate(record_batch.schema):
        if col.name in tensor_representations:
            updated_tensor_representations[col.name] = (
                tensor_representations[col.name])
        else:
            col_sizes = record_batch.column(i).value_lengths().unique()
            if len(col_sizes) != 1:
                # Assume VarLenSparseTensor
                tensor_representation = schema_pb2.TensorRepresentation()
                tensor_representation.varlen_sparse_tensor.column_name = col.name
                updated_tensor_representations[
                    col.name] = tensor_representation
            elif not np.all(record_batch[i].is_valid()):
                # Features that are missing some values can't be parsed using a default
                # tensor representation. Convert to numpy arrays containing None values.
                features[col.name] = record_batch[i].to_numpy(
                    zero_copy_only=False)
            else:
                tensor_representation = schema_pb2.TensorRepresentation()
                tensor_representation.dense_tensor.column_name = col.name
                dims = _shape(record_batch[i])
                # Convert dims of the form (..., n, 1) to (..., n).
                if len(dims) > 1 and dims[-1] == 1:
                    dims = dims[:-1]
                if len(dims) > 1:
                    for dim in dims[1:]:  # Skip batch dimension
                        tensor_representation.dense_tensor.shape.dim.append(
                            schema_pb2.FixedShape.Dim(size=dim))
                updated_tensor_representations[
                    col.name] = tensor_representation
    if updated_tensor_representations:
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                arrow_schema=record_batch.schema,
                tensor_representations=updated_tensor_representations))
        try:
            for k, v in adapter.ToBatchTensors(
                    record_batch, produce_eager_tensors=False).items():
                if isinstance(v, tf.compat.v1.ragged.RaggedTensorValue):
                    features[k] = to_ragged_tensor_value(v)
                elif isinstance(v, tf.compat.v1.SparseTensorValue):
                    kind = updated_tensor_representations[k].WhichOneof('kind')
                    if kind == 'sparse_tensor':
                        features[k] = to_sparse_tensor_value(v)
                    elif kind == 'varlen_sparse_tensor':
                        features[k] = to_varlen_sparse_tensor_value(v)
                    else:
                        raise ValueError(
                            f'Unexpected tensor representation kind ({kind}) '
                            f'for tensor of type: {type(v)}')
                else:
                    features[k] = v
        except Exception as e:
            raise ValueError(e, updated_tensor_representations,
                             record_batch) from e
    return features
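
A hedged usage sketch with no explicit tensor representations, relying on the defaults inferred above:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1.0], [2.0]]), pa.array([[1], [2, 3]])], ['dense', 'varlen'])
values = record_batch_to_tensor_values(batch)
# 'dense' (one value per row) gets a default dense representation;
# 'varlen' (ragged lengths) falls back to a varlen_sparse_tensor representation.
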
Example #18
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Retrieve a nested array (and optionally example indices) from RecordBatch.

  It assumes `record_batch` contains only arrays of the following supported
  types:
    - list<primitive>
    - list<struct<[Ts]>> where Ts are the types of the fields in the struct
      type, and they can only be one of the supported types
      (recursion intended).

  If the provided path refers to a leaf in the record_batch, then a ListArray
  with a primitive element type will be returned. If the provided path does not
  refer to a leaf, a ListArray with a StructArray element type will be returned.

  Args:
    record_batch: The RecordBatch whose arrays to be visited.
    query_path: The FeaturePath to lookup in the record_batch.
    return_example_indices: Whether to return an additional array containing the
      example indices of the elements in the array corresponding to the
      query_path.

  Returns:
    A tuple. The first term is the feature array and the second term is the
    example_indices array for the feature array (i.e., array[i] came from the
    example at row example_indices[i] in the record_batch).

  Raises:
    KeyError: When the query_path is empty, or cannot be found in the
    record_batch and its nested struct arrays.
  """

  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Recursion helper."""
    if not query_path:
      return array, example_indices
    array_type = array.type
    if (not is_list_like(array_type) or
        not pa.types.is_struct(array_type.value_type)):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a (large_)list<struct<...>>.'.format(
                         query_path, array_type))
    flat_struct_array = array.flatten()
    flat_indices = None
    if example_indices is not None:
      flat_indices = example_indices[
          array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

    step = query_path.steps()[0]
    try:
      child_array = flat_struct_array.field(step)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(step))
    relative_path = types.FeaturePath(query_path.steps()[1:])
    return _recursion_helper(relative_path, child_array, flat_indices)

  if not query_path:
    raise KeyError('query_path must be non-empty.')
  column_name = query_path.steps()[0]
  field_index = record_batch.schema.get_field_index(column_name)
  if field_index < 0:
    raise KeyError('query_path step 0 "{}" not in record batch.'
                   .format(column_name))
  array = record_batch.column(field_index)
  array_path = types.FeaturePath(query_path.steps()[1:])

  example_indices = np.arange(
      record_batch.num_rows) if return_example_indices else None
  return _recursion_helper(array_path, array, example_indices)