Code example #1
    def test_get_flattened_array_parent_indices(self):
        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([], type=pa.list_(pa.int32())))
        self.assertTrue(indices.equals(pa.array([], type=pa.int32())))

        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([[1.], [2.], [], [3.]]))
        self.assertTrue(indices.equals(pa.array([0, 1, 3], type=pa.int32())))
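
The test above pins down the semantics of GetFlattenedArrayParentIndices: entry i of the result is the row of the input list array that the i-th flattened value came from. As an illustration only (not how the kernel is implemented), the same indices can be derived with plain numpy from the list offsets, assuming a non-sliced, null-free list array:

import numpy as np
import pyarrow as pa

arr = pa.array([[1.], [2.], [], [3.]])
# Each row contributes len(row) consecutive entries pointing back at that row.
lengths = np.diff(arr.offsets.to_numpy())                 # [1, 1, 0, 1]
parent_indices = np.repeat(np.arange(len(arr)), lengths)
print(parent_indices)                                     # [0 1 3]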
Code example #2
File: arrow_util.py Project: Bobgy/data-validation
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Recursion helper."""
        if not query_path:
            return array, example_indices
        array_type = array.type
        if (not is_list_like(array_type)
                or not pa.types.is_struct(array_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, array_type))
        flat_struct_array = array.flatten()
        flat_indices = None
        if example_indices is not None:
            flat_indices = example_indices[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()]

        step = query_path.steps()[0]
        try:
            child_array = flat_struct_array.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        relative_path = types.FeaturePath(query_path.steps()[1:])
        return _recursion_helper(relative_path, child_array, flat_indices)
Code example #3
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Flattens all the list arrays nesting an array.

  If `array` is not list-like, it is returned as is.

  Args:
    array: pa.Array to flatten.
    return_parent_indices: If True, also returns the parent indices array.

  Returns:
    A tuple. The first term is the flattened array. The second term is None
    if `return_parent_indices` is False; otherwise it's a parent indices array
    parallel to the flattened array: if parent_indices[i] = j, then
    flattened_array[i] belongs to the j-th element of the input array.
  """
  parent_indices = None

  while is_list_like(array.type):
    if return_parent_indices:
      cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
          array).to_numpy()
      if parent_indices is None:
        parent_indices = cur_parent_indices
      else:
        parent_indices = parent_indices[cur_parent_indices]
    array = array.flatten()

  # The array was not nested in the first place.
  if return_parent_indices and parent_indices is None:
    parent_indices = np.arange(len(array))
  return array, parent_indices
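
A hypothetical usage sketch of flatten_nested on a doubly nested array, assuming the function above and its array_util / is_list_like dependencies are importable; it shows how the parent indices are composed across the two levels of flattening:

import pyarrow as pa

nested = pa.array([[[1, 2], [3]], [], [[4]]])
flat, parents = flatten_nested(nested, return_parent_indices=True)
print(flat.to_pylist())   # [1, 2, 3, 4]
print(parents)            # [0 0 0 2]: values 1, 2 and 3 come from row 0; value 4 from row 2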
Code example #4
File: arrow_util.py Project: Bobgy/data-validation
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Recursion helper."""
     array_type = array.type
     if is_list_like(array_type) and pa.types.is_struct(
             array_type.value_type):
         if not enumerate_leaves_only:
             yield (feature_path, array, weights)
         flat_struct_array = array.flatten()
         flat_weights = None
         if weights is not None:
             flat_weights = weights[
                 array_util.GetFlattenedArrayParentIndices(
                     array).to_numpy()]
         for field in flat_struct_array.type:
             field_name = field.name
             # use "yield from" after PY 3.3.
             for e in _recursion_helper(feature_path.child(field_name),
                                        flat_struct_array.field(field_name),
                                        flat_weights):
                 yield e
     else:
         yield (feature_path, array, weights)
Code example #5
    def GetTensor(self, record_batch: pa.RecordBatch,
                  produce_eager_tensors: bool) -> Any:
        values_array = record_batch.column(self._value_column_index)
        values_parent_indices = array_util.GetFlattenedArrayParentIndices(
            values_array)
        indices_arrays = [np.asarray(values_parent_indices)]
        for index_column_index in self._index_column_indices:
            indices_arrays.append(
                np.asarray(record_batch.column(index_column_index).flatten()))
        flat_values_array = values_array.flatten()
        if self._convert_to_binary_fn is not None:
            flat_values_array = self._convert_to_binary_fn(flat_values_array)
        values_np = np.asarray(flat_values_array)
        coo_np = np.empty(shape=(len(values_np), self._coo_size),
                          dtype=np.int64)
        try:
            np.stack(indices_arrays, axis=1, out=coo_np)
        except ValueError as e:
            raise ValueError("Error constructing the COO for SparseTensor. "
                             "number of values: {}; "
                             "size of each index array: {}; "
                             "original error {}.".format(
                                 len(values_np),
                                 [len(i) for i in indices_arrays], e))

        dense_shape = [len(record_batch)] + self._shape

        if produce_eager_tensors:
            return tf.sparse.SparseTensor(
                indices=tf.convert_to_tensor(coo_np),
                dense_shape=tf.convert_to_tensor(dense_shape, dtype=tf.int64),
                values=tf.convert_to_tensor(values_np))
        return tf.compat.v1.SparseTensorValue(indices=coo_np,
                                              dense_shape=dense_shape,
                                              values=values_np)
Code example #6
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array = feature_array.flatten()
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = np.asarray(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]

        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        curr_min = np.min(values_no_nan)
        curr_max = np.max(values_no_nan)
        self.min = min(self.min, curr_min)
        self.max = max(self.max, curr_max)
        if curr_min == float('-inf') or curr_max == float('inf'):
            finite_values = values_no_nan[np.isfinite(values_no_nan)]
            if finite_values.size > 0:
                self.finite_min = min(self.finite_min, np.min(finite_values))
                self.finite_max = max(self.finite_max, np.max(finite_values))

        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = np.asarray(
                array_util.GetFlattenedArrayParentIndices(feature_array))
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
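
A minimal sketch (assumed inputs) of the weight broadcast used above: per-example weights are repeated once per value by indexing them with the flattened array's parent indices, here derived from the list offsets rather than array_util:

import numpy as np
import pyarrow as pa

feature_array = pa.array([[1.0, 2.0], [], [3.0]])
weights = np.array([0.5, 2.0, 4.0])     # one weight per example
parents = np.repeat(np.arange(len(feature_array)),
                    np.diff(feature_array.offsets.to_numpy()))   # [0, 0, 2]
flat_weights = weights[parents]         # [0.5, 0.5, 4.0], aligned with the flattened values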
Code example #7
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # to_pandas returns a readonly array. Create a copy as we will be imputing
        # the NaN values.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            array_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features
        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            result_dtype = np.object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
        num_values = arrow_util.primitive_array_to_numpy(
            array_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]
        if feature_path in categorical_features:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = non_missing_values[~nan_mask].max() * 10
            non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
        flattened_array[non_missing_parent_indices] = non_missing_values
        if missing_parent_indices.any():
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
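
A toy walk-through (made-up inputs) of the imputation rule described in the docstring: for a numeric feature, rows with no value and NaN values are both filled with 10 * max(observed values):

import numpy as np

num_rows = 4
non_missing_values = np.array([2.0, 5.0, np.nan])    # flattened values; row 1 has none
non_missing_parent_indices = np.array([0, 2, 3])
fill = np.nanmax(non_missing_values) * 10            # 10 * max observed value = 50.0
non_missing_values[np.isnan(non_missing_values)] = fill
flattened = np.full(num_rows, fill)
flattened[non_missing_parent_indices] = non_missing_values
print(flattened)   # [ 2. 50.  5. 50.]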
Code example #8
File: array_util_test.py Project: tensorflow/tfx-bsl
    def test_get_flattened_array_parent_indices(self, list_type_factory,
                                                parent_indices_type):
        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([], type=list_type_factory(pa.int32())))
        self.assertTrue(indices.equals(pa.array([], type=parent_indices_type)))

        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([[1.], [2.], [], [3., 4.]],
                     type=list_type_factory(pa.float32())))
        self.assertTrue(
            indices.equals(pa.array([0, 1, 3, 3], type=parent_indices_type)))

        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([[1.], [2.], [], [3., 4.]],
                     type=list_type_factory(pa.float32())).slice(1))
        self.assertTrue(
            indices.equals(pa.array([0, 2, 2], type=parent_indices_type)))

        indices = array_util.GetFlattenedArrayParentIndices(
            pa.array([list(range(1024))], type=list_type_factory(pa.int64())))
        self.assertTrue(
            indices.equals(pa.array([0] * 1024, type=parent_indices_type)))
Code example #9
def _get_example_value_presence(
        table: pa.Table, path: types.FeaturePath,
        boundaries: Optional[Iterable[float]]) -> Optional[pd.Series]:
    """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow table with the
  two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be
  pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

  If the array retrieved from get_array is null, this function returns None.

  Args:
    table: The table in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the array
      values.

  Returns:
    A Pandas Series containing distinct pairs of array values and example
    indices. The series values will be the array values, and the series index
    values will be the example indices.
  """
    arr, example_indices = arrow_util.get_array(table,
                                                path,
                                                return_example_indices=True)
    if pa.types.is_null(arr.type):
        return None

    arr_flat = arr.flatten()
    example_indices_flat = example_indices[
        array_util.GetFlattenedArrayParentIndices(arr).to_numpy()]
    if boundaries is not None:
        element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
        df = pd.DataFrame({
            'example_indices':
            example_indices_flat[element_indices],
            'values':
            bins
        })
    else:
        df = pd.DataFrame({
            'example_indices': example_indices_flat,
            'values': np.asarray(arr_flat)
        })
    df_unique = df.drop_duplicates()
    return df_unique.set_index('example_indices')['values']
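
Restating the docstring's example as a standalone sketch (hypothetical; uses list offsets directly instead of array_util): the flattened values are mapped back to example indices via parent indices and then deduplicated per example:

import numpy as np
import pandas as pd
import pyarrow as pa

arr = pa.array([['a', 'a', 'b'], ['a']])
example_indices = np.array([0, 1])
parents = np.repeat(np.arange(len(arr)), np.diff(arr.offsets.to_numpy()))
df = pd.DataFrame({'example_indices': example_indices[parents],
                   'values': np.asarray(arr.flatten())})
result = df.drop_duplicates().set_index('example_indices')['values']
# result holds 'a' and 'b' for example 0, and 'a' for example 1.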
Code example #10
 def _RecursionHelper(row_indices, array):
   """Flattens `array` while maintains the `row_indices`."""
   array_type = array.type
   if _IsListLike(array_type):
     parent_indices = np.asarray(
         array_util.GetFlattenedArrayParentIndices(array))
     _RecursionHelper(row_indices[parent_indices], array.flatten())
   elif pa.types.is_struct(array_type):
     for child in array.flatten():
       _RecursionHelper(row_indices, child)
   else:
     value_type = _GetValueType(array.type)
     dist_by_type = self._num_feature_values_dist_by_type[value_type]
     for num_values in np.bincount(row_indices, minlength=num_rows).tolist():
       dist_by_type.update(num_values)
       self._num_feature_values_dist.update(num_values)
Code example #11
  def update(self,
             feature_path: types.FeaturePath,
             feature_array: pa.Array,
             feature_type: types.FeatureNameStatisticsType,
             make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
             weights: Optional[np.ndarray] = None) -> None:
    """Update the partial common statistics using the input value."""
    if self.type is None:
      self.type = feature_type  # pytype: disable=annotation-type-mismatch
    elif feature_type is not None and self.type != feature_type:
      raise TypeError('Cannot determine the type of feature %s. '
                      'Found values of types %s and %s.' %
                      (feature_path, self.type, feature_type))

    nest_level = arrow_util.get_nest_level(feature_array.type)
    if self.presence_and_valency_stats is None:
      self.presence_and_valency_stats = [
          _PresenceAndValencyStats(make_quantiles_sketch_fn)
          for _ in range(nest_level)
      ]
    elif nest_level != len(self.presence_and_valency_stats):
      raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
          feature_path, nest_level, len(self.presence_and_valency_stats)))

    # And there's nothing we can collect in this case.
    if not feature_array:
      return

    level = 0
    while arrow_util.is_list_like(feature_array.type):
      presence_mask = ~np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      num_values_not_none = num_values[presence_mask]
      self.presence_and_valency_stats[level].update(feature_array,
                                                    presence_mask, num_values,
                                                    num_values_not_none,
                                                    weights)
      flattened = feature_array.flatten()
      if weights is not None:
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weights = weights[parent_indices]
      feature_array = flattened
      level += 1
Code example #12
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            if feature_type is None:
                continue
            # If it's neither a categorical feature nor a string feature, we don't
            # bother with top-k stats.
            if (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = array_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    parent_indices = array_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[np.asarray(parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Code example #13
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                array_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
Code example #14
    def _get_univalent_values_with_parent_indices(
            self,
            examples: pa.RecordBatch) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_name, feat_arr in zip(examples.schema.names,
                                          examples.columns):
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feat_arr.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (None,
                                statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            value_lengths = np.asarray(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = np.asarray(feat_arr.flatten())
            value_parent_indices = np.asarray(
                array_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
Code example #15
def _to_topk_tuples(
    sliced_table: Tuple[types.SliceKey, pa.Table],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[
    Tuple[Tuple[types.SliceKey, FeaturePathTuple, Any],
          Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      table,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    if pa.types.is_null(feature_array_type):
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(
            feature_path,
            feature_array_type) == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = feature_array.flatten()
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        parent_indices = (
            np.asarray(
                array_util.GetFlattenedArrayParentIndices(feature_array)))
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
Code example #16
def _get_example_value_presence(
        record_batch: pa.RecordBatch, path: types.FeaturePath,
        boundaries: Optional[Iterable[float]],
        weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch with
  the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be a DataFrame with
  values ['a', 'b', 'a'] indexed by example indices [0, 0, 1].

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the array
      values.
    weight_column_name: Optionally, a weight column to return in addition to the
      value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices, along with the corresponding flattened example weights. The index
    will be the example indices and the values will be stored in a column named
    'values'. If weight_column_name is provided, a second column named 'weights'
    will be returned, containing the weight of the example from which each value
    came.
  """
    arr, example_indices = arrow_util.get_array(record_batch,
                                                path,
                                                return_example_indices=True)
    if pa.types.is_null(arr.type):
        return None

    arr_flat = arr.flatten()
    is_binary_like = arrow_util.is_binary_like(arr_flat.type)
    assert boundaries is None or not is_binary_like, (
        'Boundaries can only be applied to numeric columns')
    if is_binary_like:
        # use dictionary_encode so we can use np.unique on object arrays
        dict_array = arr_flat.dictionary_encode()
        arr_flat = dict_array.indices
        arr_flat_dict = np.asarray(dict_array.dictionary)
    parent_indices = array_util.GetFlattenedArrayParentIndices(arr).to_numpy()
    example_indices_flat = example_indices[parent_indices]
    if boundaries is not None:
        element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
        rows = np.vstack([example_indices_flat[element_indices], bins])
    else:
        rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
    if not rows.size:
        return None
    # Deduplicate values which show up more than once in the same example. This
    # makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
    unique_rows = np.unique(rows, axis=1)
    example_indices = unique_rows[0, :]
    values = unique_rows[1, :]
    if is_binary_like:
        # Return binary-like values as a pd.Categorical wrapped in a Series. This
        # makes subsequent operations like pd.merge cheaper.
        values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
    columns = {'example_indices': example_indices, 'values': values}
    if weight_column_name:
        weights = arrow_util.get_weight_feature(record_batch,
                                                weight_column_name)
        columns['weights'] = np.asarray(weights)[example_indices]
    df = pd.DataFrame(columns)
    return df.set_index('example_indices')
Code example #17
    def feature_value_slicer(table):
        """A function that generates sliced tables.

    The naive approach would be to iterate over each row, identify the slice keys
    for that row, keep track of index ranges for each slice key, and then generate
    an arrow table for each slice key based on the index ranges. This would be
    expensive, as we would identify the slice keys for each row individually and
    would have to loop over the feature values, including crossing them when
    slicing on multiple features. The current approach instead generates the slice
    keys for a batch by performing joins over the indices of individual features,
    and then groups the joined table by slice key to get the row indices
    corresponding to a slice.

    Args:
      table: Arrow table.

    Yields:
      Sliced table (slice_key, Arrow table) where the table contains the rows
      corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            column = table.column(feature_name)
            # Assume we have a single chunk.
            feature_array = column.data.chunk(0)
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feature_array.flatten())
            value_parent_indices = array_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            # Create dataframe with feature value and parent index.
            df = pd.DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.SliceTableByRowIndices(
                       table, pa.array(parent_indices.to_numpy())))
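
A tiny illustration (made-up data) of the inner join described in the docstring: the per-feature (value, parent index) frames are merged on the parent index, so only rows whose values match every slicing feature survive, and the sorted parent-index order is preserved:

import pandas as pd

df_country = pd.DataFrame({'country': ['US', 'CA', 'US'], 'parent_index': [0, 1, 3]})
df_lang = pd.DataFrame({'lang': ['en', 'en'], 'parent_index': [0, 3]})
merged = pd.merge(df_country, df_lang, how='inner', on='parent_index')
print(merged)
#   country  parent_index lang
# 0      US             0   en
# 1      US             3   en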