    def add_input(self, accumulator, input_table):

        weight_ndarrays = []
        if self._weight_feature is not None:
            for a in input_table.column(
                    self._weight_feature).data.iterchunks():
                weight_array = arrow_util.FlattenListArray(a)
                if len(weight_array) != len(a):
                    raise ValueError(
                        'If weight is specified, then each example must have a weight '
                        'feature of length 1.')
                # to_numpy() can only be called against a non-empty arrow array.
                if weight_array:
                    weight_ndarrays.append(weight_array.to_numpy())
                else:
                    weight_ndarrays.append(
                        np.array([], dtype=weight_array.to_pandas_dtype()))

        for column in input_table.columns:
            feature_name = column.name
            if feature_name == self._weight_feature:
                continue
            unweighted_counts = collections.Counter()
            weighted_counts = _WeightedCounter()
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, column.type)
            if not (feature_name in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue

            for feature_array, weight_ndarray in six.moves.zip_longest(
                    column.data.iterchunks(), weight_ndarrays, fillvalue=None):
                flattened_values_array = arrow_util.FlattenListArray(
                    feature_array)
                # to_numpy() cannot be called if the array is empty.
                if not flattened_values_array:
                    continue
                if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
                    values_ndarray = flattened_values_array.to_pandas()
                else:
                    values_ndarray = flattened_values_array.to_numpy()
                value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                    feature_array).to_numpy()
                unweighted_counts.update(values_ndarray)
                if weight_ndarray is not None:
                    weight_per_value = weight_ndarray[value_parent_indices]
                    weighted_counts.weighted_update(values_ndarray,
                                                    weight_per_value)

            if feature_name not in accumulator:
                accumulator[feature_name] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_name].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_name].weighted_counts.update(
                    weighted_counts)
        return accumulator
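
The _WeightedCounter used above is a helper whose implementation is not shown in these examples. A minimal sketch that is consistent with how it is called here (weighted_update(values, weights) and update(other)) might look like the following; the actual TFDV implementation may differ.

import collections


class _WeightedCounter(collections.defaultdict):
    """Hypothetical sketch: a dict that accumulates a float weight per value."""

    def __init__(self):
        super(_WeightedCounter, self).__init__(float)

    def weighted_update(self, values, weights):
        # Add the weight of each occurrence to the running total for its value.
        for value, weight in zip(values, weights):
            self[value] += weight

    def update(self, other):
        # Merge another weighted counter into this one.
        for value, weight in other.items():
            self[value] += weight
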
    def test_flatten_list_array(self):
        flattened = arrow_util.FlattenListArray(
            pa.array([], type=pa.list_(pa.int64())))
        self.assertTrue(flattened.equals(pa.array([], type=pa.int64())))

        flattened = arrow_util.FlattenListArray(
            pa.array([[1.], [2.], [], [3.]]))
        self.assertTrue(flattened.equals(pa.array([1., 2., 3.])))
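
On recent pyarrow versions, the behaviour exercised by this test can be approximated with pyarrow's built-in ListArray.flatten(), which concatenates the values of all non-null sub-lists. This is only a rough stand-in for arrow_util.FlattenListArray, not its actual implementation.

import pyarrow as pa


def flatten_list_array(list_array):
    # Concatenate the values of all sub-lists into a single flat array.
    return list_array.flatten()


assert flatten_list_array(
    pa.array([[1.], [2.], [], [3.]])).equals(pa.array([1., 2., 3.]))
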
    def update(self,
               feature_column,
               values_quantiles_combiner,
               weight_column=None):
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_column:
            return

        weight_chunks = (
            weight_column.data.iterchunks() if weight_column else [])
        for feature_array, weight_array in six.moves.zip_longest(
                feature_column.data.iterchunks(), weight_chunks,
                fillvalue=None):
            flattened_value_array = arrow_util.FlattenListArray(feature_array)
            # Note: to_numpy will fail if flattened_value_array is empty.
            if not flattened_value_array:
                continue
            values = flattened_value_array.to_numpy()
            nan_mask = np.isnan(values)
            non_nan_mask = ~nan_mask
            values_no_nan = values[non_nan_mask]
            # This is to avoid integer overflow when computing sum or sum of squares.
            values_no_nan_as_double = values_no_nan.astype(np.float64)
            self.num_nan += np.sum(nan_mask)
            self.sum += np.sum(values_no_nan_as_double)
            self.sum_of_squares += np.sum(values_no_nan_as_double *
                                          values_no_nan_as_double)
            self.min = min(self.min, np.min(values_no_nan))
            self.max = max(self.max, np.max(values_no_nan))
            self.num_zeros += values_no_nan.size - np.count_nonzero(
                values_no_nan)
            self.quantiles_summary = values_quantiles_combiner.add_input(
                self.quantiles_summary,
                [values_no_nan, np.ones_like(values_no_nan)])

            if weight_array:
                example_weights = arrow_util.FlattenListArray(
                    weight_array).to_numpy().astype(np.float32, copy=False)

                if example_weights.size != len(weight_array):
                    raise ValueError('Weight feature must not be missing.')
                value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                    feature_array).to_numpy()
                weights = example_weights[value_parent_indices]
                weights_no_nan = weights[non_nan_mask]
                weighted_values = weights_no_nan * values_no_nan
                self.weighted_sum += np.sum(weighted_values)
                self.weighted_sum_of_squares += np.sum(weighted_values *
                                                       values_no_nan)
                self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                    self.weighted_quantiles_summary,
                    [values_no_nan, weights_no_nan])
                self.weighted_total_num_values += np.sum(weights_no_nan)
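
GetFlattenedArrayParentIndices maps each flattened value back to the index of the example (list) it came from, which is what allows per-example weights to be broadcast to per-value weights above. A rough pure-Python equivalent, assuming the list-array semantics shown in these examples, is:

import numpy as np
import pyarrow as pa


def get_flattened_array_parent_indices(list_array):
    # For each value in the flattened array, return the index of the list
    # (i.e. the example) that contained it; null lists contribute no values.
    lengths = [len(v) if v is not None else 0 for v in list_array.to_pylist()]
    return np.repeat(np.arange(len(lengths)), lengths)


# [[1, 2], None, [3]] flattens to [1, 2, 3] with parent indices [0, 0, 2].
assert get_flattened_array_parent_indices(
    pa.array([[1, 2], None, [3]])).tolist() == [0, 0, 2]
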
Example #4
    def add_input(self, accumulator, input_table):
        weight_column = (input_table.column(self._weight_feature)
                         if self._weight_feature else None)
        weight_array = weight_column.data.chunk(0) if weight_column else []
        if weight_array:
            flattened_weights = arrow_util.FlattenListArray(
                weight_array).to_numpy()

        for column in input_table.columns:
            feature_name = column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, column.type)
            # If it's neither a categorical feature nor a string feature, we don't
            # bother with top-k stats.
            if not (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue
            value_array = column.data.chunk(0)
            flattened_values = arrow_util.FlattenListArray(value_array)
            unweighted_counts = collections.Counter()
            # Compute unweighted counts.
            value_counts = arrow_util.ValueCounts(flattened_values)
            for value_count in value_counts:
                value_count = value_count.as_py()
                unweighted_counts[
                    value_count['values']] = value_count['counts']

            # Compute weighted counts if a weight feature is specified.
            weighted_counts = _WeightedCounter()
            if weight_array:
                if (pa.types.is_binary(flattened_values.type)
                        or pa.types.is_string(flattened_values.type)):
                    # No zero-copy conversion for binary/string types.
                    flattened_values_np = flattened_values.to_pandas()
                else:
                    flattened_values_np = flattened_values.to_numpy()
                indices = arrow_util.GetFlattenedArrayParentIndices(
                    value_array)
                weighted_counts.weighted_update(
                    flattened_values_np, flattened_weights[indices.to_numpy()])

            if feature_path not in accumulator:
                accumulator[feature_path] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_path].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_path].weighted_counts.update(
                    weighted_counts)
        return accumulator
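
arrow_util.ValueCounts, as used above, yields struct elements with 'values' and 'counts' fields. On recent pyarrow versions, pyarrow.compute.value_counts produces a result of the same shape; the snippet below is only an illustration of that interface, not TFDV's implementation.

import pyarrow as pa
import pyarrow.compute as pc

value_counts = pc.value_counts(pa.array(['a', 'b', 'a']))
# Each element is a struct with a 'values' field and a 'counts' field.
counts = {vc['values']: vc['counts'] for vc in value_counts.to_pylist()}
assert counts == {'a': 2, 'b': 1}
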
    def update(self,
               feature_column,
               feature_type,
               num_values_quantiles_combiner,
               weight_column=None):
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type of
        # the feature. This is not an error, as this feature might have some
        # values in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_column.name, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_column:
            return

        if weight_column and (feature_column.data.num_chunks !=
                              weight_column.data.num_chunks):
            raise ValueError(
                'Expected the feature column {} and weight column {} to have the '
                'same number of chunks.'.format(feature_column.name,
                                                weight_column.name))

        weight_chunks = (
            weight_column.data.iterchunks() if weight_column else [])
        for feature_array, weight_array in six.moves.zip_longest(
                feature_column.data.iterchunks(), weight_chunks,
                fillvalue=None):
            num_values = arrow_util.ListLengthsFromListArray(
                feature_array).to_numpy()
            none_mask = arrow_util.GetArrayNullBitmapAsByteArray(
                feature_array).to_numpy().view(np.bool)

            num_values_not_none = num_values[~none_mask]
            self.num_non_missing += len(
                feature_array) - feature_array.null_count
            self.max_num_values = max(np.max(num_values_not_none),
                                      self.max_num_values)
            self.min_num_values = min(np.min(num_values_not_none),
                                      self.min_num_values)
            self.total_num_values += np.sum(num_values_not_none)
            self.num_values_summary = num_values_quantiles_combiner.add_input(
                self.num_values_summary, [num_values_not_none])

            if weight_array:
                weights = (arrow_util.FlattenListArray(
                    weight_array).to_numpy().astype(np.float32, copy=False))
                if weights.size != num_values.size:
                    raise ValueError('Weight feature must not be missing.')
                self.weighted_total_num_values += np.sum(num_values * weights)
                self.weighted_num_non_missing += np.sum(weights[~none_mask])

    def add_input(self, accumulator, examples_table):
        accumulator[0] += examples_table.num_rows
        if self._weight_feature:
            weights_column = examples_table.column(self._weight_feature)
            for weight_array in weights_column.data.iterchunks():
                accumulator[1] += np.sum(
                    arrow_util.FlattenListArray(weight_array).to_numpy())
        return accumulator
Example #7
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table
    weight_column = table.column(weight_feature) if weight_feature else None
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()

    for feature_column in table.columns:
        feature_name = feature_column.name
        # Skip the weight feature.
        if feature_name == weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        # If it's neither a categorical feature nor a string feature, we don't
        # bother with top-k stats.
        if not (feature_path in categorical_features
                or feature_column.type.equals(pa.list_(pa.binary()))
                or feature_column.type.equals(pa.list_(pa.string()))):
            continue
        value_array = feature_column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)

        if weight_array and flattened_values:
            if (pa.types.is_binary(flattened_values.type)
                    or pa.types.is_string(flattened_values.type)):
                # No zero-copy conversion for binary/string types.
                flattened_values_np = flattened_values.to_pandas()
            else:
                flattened_values_np = flattened_values.to_numpy()
            indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
            weights_ndarray = flattened_weights[indices.to_numpy()]
            for value, count, weight in _weighted_unique(
                    flattened_values_np, weights_ndarray):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            value_counts = arrow_util.ValueCounts(flattened_values)
            values = value_counts.field('values').to_pylist()
            counts = value_counts.field('counts').to_pylist()
            for value, count in six.moves.zip(values, counts):
                yield ((slice_key, feature_path.steps(), value), count)
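
_weighted_unique is not defined in these snippets. Given how it is called above, a plausible sketch that yields (value, count, total_weight) per distinct value could look like this (hypothetical, not the actual helper):

import numpy as np


def _weighted_unique(values, weights):
    # For each distinct value, yield (value, occurrence_count, total_weight).
    unique_values, inverse, counts = np.unique(
        values, return_inverse=True, return_counts=True)
    weight_sums = np.bincount(inverse, weights=weights)
    for value, count, weight in zip(unique_values, counts, weight_sums):
        yield value, int(count), float(weight)
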
Example #8
  def add_input(self, accumulator,
                input_column):
    """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      input_column: An arrow column representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    if accumulator.invalidate:
      return accumulator
    feature_type = stats_util.get_feature_type_from_arrow_type(
        input_column.name, input_column.type)
    # Ignore null array.
    if feature_type is None:
      return accumulator
    # If we see a different type, invalidate.
    if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
      accumulator.invalidate = True
      return accumulator

    for feature_array in input_column.data.iterchunks():
      # Consider using memoryview to avoid copying after upgrading to
      # arrow 0.12. Note that this would involve modifying the subsequent logic
      # to iterate over the values in a loop.
      values = arrow_util.FlattenListArray(feature_array).to_pandas()
      accumulator.total_num_values += values.size
      image_formats = self._image_decoder.get_formats(values)
      valid_mask = ~pd.isnull(image_formats)
      valid_formats = image_formats[valid_mask]
      format_counts = np.unique(valid_formats, return_counts=True)
      for (image_format, count) in zip(*format_counts):
        accumulator.counter_by_format[image_format] += count
      unknown_count = image_formats.size - valid_formats.size
      if unknown_count > 0:
        accumulator.counter_by_format[''] += unknown_count

      if self._enable_size_stats:
        # Get image height and width.
        image_sizes = self._image_decoder.get_sizes(values[valid_mask])
        if image_sizes.any():
          max_sizes = np.max(image_sizes, axis=0)
          # Update the max image height/width with all image values.
          accumulator.max_height = max(accumulator.max_height, max_sizes[0])
          accumulator.max_width = max(accumulator.max_width, max_sizes[1])
    return accumulator
Example #9
    def update(self, feature_column, values_quantiles_combiner, weights=None):
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_column:
            return

        feature_array = feature_column.data.chunk(0)
        flattened_value_array = arrow_util.FlattenListArray(
            feature_column.data.chunk(0))
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = flattened_value_array.to_numpy()
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]
        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        self.min = min(self.min, np.min(values_no_nan))
        self.max = max(self.max, np.max(values_no_nan))
        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
Example #10
    def add_input(self, accumulator, examples_table):

        weights = None
        if self._weight_feature:
            weights = (arrow_util.FlattenListArray(
                examples_table.column(
                    self._weight_feature).data.chunk(0)).to_numpy())
            if len(weights) != len(examples_table):
                raise ValueError('Expected exactly one weight per example.')

        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            is_categorical_feature = feature_path in self._categorical_features

            # If we encounter this feature for the first time, create a
            # new partial basic stats.
            stats_for_feature = accumulator.get(feature_path)
            if stats_for_feature is None:
                stats_for_feature = _PartialBasicStats(
                    self._weight_feature is not None)
                # Store empty summary.
                stats_for_feature.common_stats.num_values_summary = (
                    self._num_values_quantiles_combiner.create_accumulator())
                stats_for_feature.numeric_stats.quantiles_summary = (
                    self._values_quantiles_combiner.create_accumulator())
                accumulator[feature_path] = stats_for_feature

            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, feature_column.type)
            stats_for_feature.common_stats.update(
                feature_column, feature_type,
                self._num_values_quantiles_combiner, weights)
            if (is_categorical_feature or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                stats_for_feature.string_stats.update(feature_column)
            elif feature_type is not None:
                stats_for_feature.numeric_stats.update(
                    feature_column, self._values_quantiles_combiner, weights)
        return accumulator
Example #11
    def update(self, feature_column):
        """Update the partial string statistics using the input value."""
        # Iterate through the value array and update the partial stats.
        value_array = feature_column.data.chunk(0)
        flattened_values_array = arrow_util.FlattenListArray(value_array)
        if pa.types.is_binary(
                flattened_values_array.type) or pa.types.is_unicode(
                    flattened_values_array.type):
            # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
            # with Python 3). To make sure we do cheaper integer arithmetic in
            # Python 2, we first convert it to int.
            self.total_bytes_length += int(
                arrow_util.GetBinaryArrayTotalByteSize(flattened_values_array))
        elif flattened_values_array:
            # We can only do flattened_values_array.to_numpy() when it's not empty.
            # This could be computed faster by taking log10 of the integer.
            def _len_after_conv(s):
                return len(str(s))

            self.total_bytes_length += np.sum(
                np.vectorize(_len_after_conv,
                             otypes=[np.int32
                                     ])(flattened_values_array.to_numpy()))
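
GetBinaryArrayTotalByteSize is the total number of bytes across all values of a binary/string array. On recent pyarrow versions, a rough equivalent can be built from pyarrow.compute; this is only a sketch of the semantics assumed above, not the original arrow_util routine.

import pyarrow as pa
import pyarrow.compute as pc


def binary_array_total_byte_size(array):
    # Sum the byte lengths of all (non-null) values in a binary/string array.
    return pc.sum(pc.binary_length(array)).as_py()


assert binary_array_total_byte_size(pa.array([b'ab', b'', b'cde'])) == 5
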
    def feature_value_slicer(table):
        """A function that generates sliced tables.

        The naive approach would be to iterate over each row, identify the slice
        keys for that row, and keep track of index ranges for each slice key,
        then generate an Arrow table for each slice key based on those index
        ranges. This would be expensive, as we would identify the slice keys for
        each row individually and would have to loop over the feature values,
        including crossing them when slicing on multiple features. The current
        approach generates the slice keys for a batch by performing joins over
        the indices of individual features, and then groups the joined table by
        slice key to get the row indices corresponding to each slice.

        Args:
          table: Arrow table.

        Yields:
          Sliced table (slice_key, Arrow table) where the table contains the rows
          corresponding to a slice.
        """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            column = table.column(feature_name)
            # Assume we have a single chunk.
            feature_array = column.data.chunk(0)
            non_missing_values = arrow_util.FlattenListArray(
                feature_array).to_pandas()
            value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            # Create dataframe with feature value and parent index.
            df = pd.DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   merge.SliceTableByRowIndices(table,
                                                parent_indices.to_numpy()))
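
The join step described in the docstring above can be illustrated with two tiny per-feature dataframes: an inner merge on the parent-index column keeps exactly the rows that satisfy every slice feature (illustrative column names and data only).

import pandas as pd

# Each per-feature frame maps feature values to the rows (parent indices)
# where they occur.
df_country = pd.DataFrame({'country': ['US', 'CA', 'US'],
                           'parent_index': [0, 1, 2]})
df_device = pd.DataFrame({'device': ['web', 'web'],
                          'parent_index': [0, 2]})
merged = pd.merge(df_country, df_device, how='inner', on='parent_index')
# Only rows 0 and 2 appear for both features, so only they survive the join.
assert merged['parent_index'].tolist() == [0, 2]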