Example #1
  def test_get_array_null_bitmap_as_byte_array(self):
    array = pa.array([], type=pa.int32())
    null_masks = array_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([], type=pa.uint8())))

    array = pa.array([1, 2, None, 3, None], type=pa.int32())
    null_masks = array_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(
        null_masks.equals(pa.array([0, 0, 1, 0, 1], type=pa.uint8())))

    array = pa.array([1, 2, 3])
    null_masks = array_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([0, 0, 0], type=pa.uint8())))

    array = pa.array([None, None, None], type=pa.int32())
    null_masks = array_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([1, 1, 1], type=pa.uint8())))
    # Demonstrate that the returned array can be converted to a numpy boolean
    # array w/o copying
    np.testing.assert_equal(
        np.array([True, True, True]), null_masks.to_numpy().view(np.bool_))
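
For readers without tfx_bsl at hand, the same byte-per-element mask can be
approximated with stock pyarrow compute kernels. A minimal sketch, assuming a
pyarrow version that ships pyarrow.compute.is_null; null_bitmap_as_bytes is a
hypothetical helper name, not part of array_util:

import pyarrow as pa
import pyarrow.compute as pc

def null_bitmap_as_bytes(array: pa.Array) -> pa.Array:
  # pc.is_null yields a BooleanArray (True where the element is null);
  # casting to uint8 mirrors the byte-per-element layout returned by
  # GetArrayNullBitmapAsByteArray.
  return pc.is_null(array).cast(pa.uint8())

assert null_bitmap_as_bytes(pa.array([1, None, 3], type=pa.int32())).equals(
    pa.array([0, 1, 0], type=pa.uint8()))
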
Example #2
    def update(self,
               feature_path: types.FeaturePath,
               feature_array: pa.Array,
               feature_type: types.FeatureNameStatisticsType,
               num_values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial common statistics using the input value."""
        # All the values in this column are null, so we cannot deduce the type
        # of the feature. This is not an error, as the feature might have
        # values in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_path, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays, and there is
        # nothing to collect in this case.
        if not feature_array:
            return

        num_values = arrow_util.primitive_array_to_numpy(
            array_util.ListLengthsFromListArray(feature_array))
        none_mask = arrow_util.primitive_array_to_numpy(
            array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
                np.bool_)

        self.num_non_missing += len(feature_array) - feature_array.null_count
        num_values_not_none = num_values[~none_mask]
        # We do this check to avoid failing in np.min/max with an empty array.
        if num_values_not_none.size == 0:
            return
        # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
        # once we upgrade to numpy 1.16
        self.max_num_values = max(np.max(num_values_not_none),
                                  self.max_num_values)
        self.min_num_values = min(np.min(num_values_not_none),
                                  self.min_num_values)
        self.total_num_values += np.sum(num_values_not_none)
        self.num_values_summary = num_values_quantiles_combiner.add_input(
            self.num_values_summary, [num_values_not_none])

        if weights is not None:
            if weights.size != num_values.size:
                raise ValueError('Weight feature must not be missing.')
            self.weighted_total_num_values += np.sum(num_values * weights)
            self.weighted_num_non_missing += np.sum(weights[~none_mask])
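
The interplay between num_values and none_mask above can be reproduced with
plain numpy. A small sketch with invented inputs standing in for what
ListLengthsFromListArray and GetArrayNullBitmapAsByteArray would return for a
four-row list array:

import numpy as np

# Invented per-row list lengths and null-mask bytes (null rows have
# length 0 by convention).
num_values = np.array([2, 0, 1, 3], dtype=np.int64)
none_mask = np.array([0, 1, 0, 0], dtype=np.uint8).view(np.bool_)

# Null rows are dropped before min/max/sum, exactly as in update().
num_values_not_none = num_values[~none_mask]
print(num_values_not_none.min(), num_values_not_none.max(),
      num_values_not_none.sum())  # -> 1 3 6
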
Example #3
 def _UpdateNumColumnsDist(self, record_batch: pa.RecordBatch) -> None:
   # Define the number of columns of a row as the number of cells in that row
   # whose values are not null. It can be computed by summing up, element-wise,
   # the negated null flags (converted to integers) of all the arrays.
   null_bitmaps = [
       np.asarray(array_util.GetArrayNullBitmapAsByteArray(c)).view(np.bool_)
       for c in record_batch]
   indicators = [(~bitmap).view(np.uint8) for bitmap in null_bitmaps]
   sum_indicators = np.zeros(record_batch.num_rows, dtype=np.int64)
   for indicator in indicators:
     np.add(sum_indicators, indicator, out=sum_indicators)
   for num_column in sum_indicators.tolist():
     self._num_columns_dist.update(num_column)
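
The same row-wise "number of columns" computation can be written with
pyarrow's built-in kernels instead of the tfx_bsl one. A sketch using
pyarrow.compute.is_valid on an invented RecordBatch:

import pyarrow as pa
import pyarrow.compute as pc

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, None, 3]), pa.array(['a', 'b', None])], ['x', 'y'])
# Sum the per-column validity indicators element-wise to get the number
# of non-null cells in each row.
num_columns = sum(
    pc.is_valid(col).cast(pa.int64()).to_numpy() for col in batch.columns)
print(num_columns.tolist())  # -> [2, 1, 1]
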
Example #4
  def update(self,
             feature_path: types.FeaturePath,
             feature_array: pa.Array,
             feature_type: types.FeatureNameStatisticsType,
             make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
             weights: Optional[np.ndarray] = None) -> None:
    """Update the partial common statistics using the input value."""
    if self.type is None:
      self.type = feature_type  # pytype: disable=annotation-type-mismatch
    elif feature_type is not None and self.type != feature_type:
      raise TypeError('Cannot determine the type of feature %s. '
                      'Found values of types %s and %s.' %
                      (feature_path, self.type, feature_type))

    nest_level = arrow_util.get_nest_level(feature_array.type)
    if self.presence_and_valency_stats is None:
      self.presence_and_valency_stats = [
          _PresenceAndValencyStats(make_quantiles_sketch_fn)
          for _ in range(nest_level)
      ]
    elif nest_level != len(self.presence_and_valency_stats):
      raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
          feature_path, nest_level, len(self.presence_and_valency_stats)))

    # There is nothing we can collect from an empty array.
    if not feature_array:
      return

    level = 0
    while arrow_util.is_list_like(feature_array.type):
      presence_mask = ~np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool_)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      num_values_not_none = num_values[presence_mask]
      self.presence_and_valency_stats[level].update(feature_array,
                                                    presence_mask, num_values,
                                                    num_values_not_none,
                                                    weights)
      flattened = feature_array.flatten()
      if weights is not None:
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weights = weights[parent_indices]
      feature_array = flattened
      level += 1
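
The per-level weight bookkeeping in the loop above can be mimicked with
pyarrow alone. A sketch assuming pyarrow.compute.list_parent_indices plays
the role of GetFlattenedArrayParentIndices; the array and weights are
invented:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

array = pa.array([[[1], [2, 3]], None, [[4]]])
weights = np.array([0.5, 1.0, 2.0])
level = 0
while pa.types.is_list(array.type):
  # Each flatten() removes one level of nesting (dropping nulls at that
  # level); parent indices realign the row weights to the children.
  parent_indices = pc.list_parent_indices(array).to_numpy()
  array = array.flatten()
  weights = weights[parent_indices]
  level += 1
print(level, array.to_pylist(), weights.tolist())
# -> 2 [1, 2, 3, 4] [0.5, 0.5, 0.5, 2.0]
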
Example #5
  def null_mask(self, path: types.FeaturePath) -> np.ndarray:
    """Returns a boolean mask of rows which are null in the referenced array.

    If the requested path cannot be found in the table, it will be considered
    null in all rows in the table.

    Args:
      path: The path corresponding to the array from which to generate the null
        mask.
    """
    try:
      array, _ = arrow_util.get_array(
          self._table, path, broadcast_column_name=None)
      # GetArrayNullBitmapAsByteArray is only useful for non-null type arrays.
      if pa.types.is_null(array.type):
        return np.full(self._table.num_rows, True)
      return np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(array), dtype=bool)
    except KeyError:
      return np.full(self._table.num_rows, True)
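
The fallback behavior is easy to exercise on a flat table. A simplified
analogue in stock pyarrow (null_mask_for_column is a made-up helper; the
real method resolves nested FeaturePaths through arrow_util.get_array):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def null_mask_for_column(table: pa.Table, column: str) -> np.ndarray:
  # A column that is absent (or of null type) counts as null in every row.
  if column not in table.column_names:
    return np.full(table.num_rows, True)
  return np.asarray(pc.is_null(table.column(column)))

table = pa.table({'x': [1, None, 3]})
print(null_mask_for_column(table, 'x').tolist())  # -> [False, True, False]
print(null_mask_for_column(table, 'y').tolist())  # -> [True, True, True]
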
Example #6
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

        Args:
          accumulator: The current accumulator.
          input_table: An Arrow Table whose columns are features and rows are
            examples.

        Returns:
          The accumulator after updating the statistics for the batch of inputs.
        """
        feature_value_list_lengths = dict()
        feature_is_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and whether the feature is missing for every feature
        # that is an index or value feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                if pa.types.is_null(leaf_array.type):
                    # If the column is a NullArray, it is missing from the entire batch
                    # (missing features have value list lengths of 0).
                    feature_value_list_lengths[feature_path] = np.full(
                        batch_example_count, 0)
                    feature_is_missing[feature_path] = np.full(
                        batch_example_count, True)
                else:
                    feature_value_list_lengths[feature_path] = np.asarray(
                        array_util.ListLengthsFromListArray(leaf_array))
                    feature_is_missing[feature_path] = np.asarray(
                        array_util.GetArrayNullBitmapAsByteArray(leaf_array))

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and feature missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features

            # Create a filter identifying examples in which the entire sparse
            # feature is missing, since those examples should not contribute
            # to missing counts or length differences.
            component_features_missing = np.array([
                feature_is_missing.get(path,
                                       np.full(batch_example_count, True))
                for path in itertools.chain([value_feature_path],
                                            index_feature_paths)
            ])
            entire_sparse_feature_missing = np.all(component_features_missing,
                                                   axis=0)
            num_examples_missing_sparse_feature = np.sum(
                entire_sparse_feature_missing)

            # If all examples in the batch are missing the sparse feature, do not
            # update the accumulator with the partial stats for that sparse feature.
            if num_examples_missing_sparse_feature == batch_example_count:
                continue

            is_missing_value_feature = feature_is_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if is_missing_value_feature is None:
                missing_value_count = batch_example_count
                feature_value_list_lengths[value_feature_path] = np.full(
                    batch_example_count, 0)
            else:
                missing_value_count = np.sum(is_missing_value_feature)
            # Do not include examples that are entirely missing the sparse feature in
            # the missing value count.
            missing_value_count -= num_examples_missing_sparse_feature

            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                is_missing_index_feature = feature_is_missing.get(
                    index_feature_path)
                if is_missing_index_feature is None:
                    # If this batch does not have this index feature at all,
                    # missing_index_count for that index feature is the number of
                    # examples in the batch.
                    missing_index_count = batch_example_count
                    # Populate the value list lengths for the index feature
                    # with all 0s, since a missing feature is considered to
                    # have a value list length of 0.
                    feature_value_list_lengths[index_feature_path] = np.full(
                        batch_example_count, 0)
                else:
                    missing_index_count = np.sum(is_missing_index_feature)
                # Do not include examples that are entirely missing the sparse
                # feature in the missing index count.
                missing_index_counts[index_feature_path] = (
                    missing_index_count - num_examples_missing_sparse_feature)

                length_differences = np.subtract(
                    feature_value_list_lengths[index_feature_path],
                    feature_value_list_lengths[value_feature_path])

                # Do not include examples that are entirely missing the sparse feature
                # in determining the min and max length differences.
                filtered_length_differences = length_differences[
                    ~entire_sparse_feature_missing]
                # This generator should not get to this point if the current sparse
                # feature is missing from all examples in the batch (which would cause
                # filtered_length_differences to be empty).
                assert filtered_length_differences.size != 0
                min_length_diff[index_feature_path] = np.min(
                    filtered_length_differences)
                max_length_diff[index_feature_path] = np.max(
                    filtered_length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator
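
To make the missing-count arithmetic concrete, here is a numpy sketch with
invented masks for one sparse feature (one value feature, two index
features):

import numpy as np

# Invented per-example missing flags (True = component absent in that row).
feature_is_missing = {
    'value': np.array([True, False, True]),
    'index_a': np.array([True, False, False]),
    'index_b': np.array([True, True, False]),
}
component_features_missing = np.array(list(feature_is_missing.values()))
# Row 0 lacks every component, so the sparse feature as a whole is absent
# there and that row is excluded from the missing-value count.
entire_sparse_feature_missing = np.all(component_features_missing, axis=0)
missing_value_count = (np.sum(feature_is_missing['value'])
                       - np.sum(entire_sparse_feature_missing))
print(entire_sparse_feature_missing.tolist(), missing_value_count)
# -> [True, False, False] 1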