コード例 #1
0
    def update(self, value_list):
        """Fold the values of one example into the partial time statistics.

        Falsy values are skipped. The first bytes value rejected by
        `stats_util.is_valid_utf8` sets `self.invalidated` and ends the
        update immediately. Every other value increments `self.considered`
        and increments `self.matching_formats` for each format in
        `_TIME_RE_LIST` whose regex matches the value.

        Args:
          value_list: A list of the values in an example.
        """
        # filter(None, ...) lazily drops falsy entries, so an early return
        # stops consuming value_list at the offending value.
        for candidate in filter(None, value_list):
            undecodable = (isinstance(candidate, bytes)
                           and not stats_util.is_valid_utf8(candidate))
            if undecodable:
                # A single invalid UTF-8 value invalidates these statistics.
                self.invalidated = True
                return
            self.considered += 1
            for fmt, pattern in _TIME_RE_LIST:
                if pattern.match(candidate):
                    self.matching_formats[fmt] += 1
コード例 #2
0
    def add_input(self, accumulator, input_batch):
        """Fold a batch of feature value_lists into the accumulator.

        An accumulator that is already invalidated is returned untouched.
        Otherwise each non-empty value_list is inspected: a value_list whose
        feature type is not STRING, or a bytes value that is not valid UTF-8,
        invalidates the accumulator and stops processing. Every other
        non-falsy value increments `considered` and adds the classifier's
        result to `matched`.

        Args:
          accumulator: The current accumulator.
          input_batch: A list representing a batch of feature value_lists
            (one per example) which should be added to the accumulator.

        Returns:
          The accumulator after updating the statistics for the batch of
          inputs.
        """
        if accumulator.invalidate:
            return accumulator

        for values in input_batch:
            # Missing or empty value_lists contribute nothing.
            if values is None or values.size == 0:
                continue

            # Anything that is not a STRING feature invalidates the stats.
            feature_type = stats_util.get_feature_type(values.dtype)
            if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
                accumulator.invalidate = True
                return accumulator

            # Run the classifier heuristic on every non-falsy value.
            for item in filter(None, values):
                undecodable = (isinstance(item, bytes)
                               and not stats_util.is_valid_utf8(item))
                if undecodable:
                    accumulator.invalidate = True
                    return accumulator
                accumulator.considered += 1
                accumulator.matched += self._classifier.classify(item)

        return accumulator
コード例 #3
0
def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical, is_weighted_stats,
                                             num_top_values,
                                             frequency_threshold,
                                             num_rank_histogram_buckets):
    """Build a FeatureNameStatistics proto holding the top-k stats.

    Note that `top_k_value_count_list` is sorted in place (descending by
    count, ties broken by the larger feature value).

    Args:
      feature_name: The feature name.
      top_k_value_count_list: A list of FeatureValueCount tuples.
      is_categorical: Whether the feature is categorical.
      is_weighted_stats: Whether top_k_value_count_list incorporates weights.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      frequency_threshold: The minimum number of examples (possibly weighted)
        the most frequent values must be present in.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Highest count first; among equal counts the 'larger' feature value
    # ranks first.
    top_k_value_count_list.sort(
        key=lambda entry: (entry[1], entry[0]), reverse=True)

    proto_cls = statistics_pb2.FeatureNameStatistics
    result = proto_cls()
    result.name = feature_name
    # A categorical feature keeps its original INT type.
    if is_categorical:
        result.type = proto_cls.INT
    else:
        result.type = proto_cls.STRING

    # Weighted stats are written to the nested weighted_string_stats message.
    string_stats = result.string_stats
    if is_weighted_stats:
        string_stats = string_stats.weighted_string_stats

    for rank, (value, count) in enumerate(top_k_value_count_list):
        # The list is sorted by count, so every later entry is below the
        # threshold as well.
        if count < frequency_threshold:
            break

        # Bytes that are not valid UTF-8 are replaced by a default invalid
        # string value.
        if isinstance(value, bytes) and not is_valid_utf8(value):
            logging.warning(
                'Feature "%s" has bytes value "%s" which cannot be '
                'decoded as a UTF-8 string.', feature_name, value)
            value = _INVALID_STRING

        if rank < num_top_values:
            top_value = string_stats.top_values.add()
            top_value.value = value
            top_value.frequency = count
        if rank < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = rank
            bucket.high_rank = rank
            bucket.sample_count = count
            bucket.label = value

    return result
コード例 #4
0
 def test_is_valid_utf8(self):
     """Checks is_valid_utf8 on one accepted and one rejected byte string."""
     accepted = b'This is valid'
     # A lone 0xF0 byte is expected to be rejected.
     rejected = b'\xF0'
     self.assertTrue(stats_util.is_valid_utf8(accepted))
     self.assertFalse(stats_util.is_valid_utf8(rejected))