def test_basic_stats_generator_categorical_feature(self):
     batches = [
         pa.Table.from_arrays([pa.array([[1, 5, 10], [0]])], ['c']),
         pa.Table.from_arrays([pa.array([[1, 1, 1, 5, 15], [-1]])], ['c']),
     ]
     expected_result = {
         types.FeaturePath(['c']):
         text_format.Parse(
             """
         path {
           step: 'c'
         }
         string_stats {
           common_stats {
             num_non_missing: 4
             min_num_values: 1
             max_num_values: 5
             avg_num_values: 2.5
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 1.3333333
               }
               buckets {
                 low_value: 1.0
                 high_value: 3.0
                 sample_count: 1.3333333
               }
               buckets {
                 low_value: 3.0
                 high_value: 5.0
                 sample_count: 1.3333333
               }
               type: QUANTILES
             }
             tot_num_values: 10
           }
           avg_length: 1.29999995232
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     schema = text_format.Parse(
         """
     feature {
       name: "c"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = basic_stats_generator.BasicStatsGenerator(
         schema=schema,
         num_values_histogram_buckets=3,
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
def _merge_dataset_feature_stats_protos(
    stats_protos: Iterable[statistics_pb2.DatasetFeatureStatistics]
) -> statistics_pb2.DatasetFeatureStatistics:
  """Merges together a list of DatasetFeatureStatistics protos.

  Args:
    stats_protos: An iterable of DatasetFeatureStatistics protos to merge.

  Returns:
    The merged DatasetFeatureStatistics proto.
  """
  stats_per_feature = {}
  # Create a new DatasetFeatureStatistics proto.
  result = statistics_pb2.DatasetFeatureStatistics()
  # Iterate over each DatasetFeatureStatistics proto and merge the
  # FeatureNameStatistics protos per feature and add the cross feature stats.
  for stats_proto in stats_protos:
    if stats_proto.cross_features:
      result.cross_features.extend(stats_proto.cross_features)
    for feature_stats_proto in stats_proto.features:
      feature_path = types.FeaturePath.from_proto(feature_stats_proto.path)
      if feature_path not in stats_per_feature:
        # Make a copy for the "cache" since we are modifying it in 'else' below.
        new_feature_stats_proto = statistics_pb2.FeatureNameStatistics()
        new_feature_stats_proto.CopyFrom(feature_stats_proto)
        stats_per_feature[feature_path] = new_feature_stats_proto
      else:
        stats_for_feature = stats_per_feature[feature_path]
        # MergeFrom would concatenate repeated fields which is not what we want
        # for path.step.
        del stats_for_feature.path.step[:]
        stats_for_feature.MergeFrom(feature_stats_proto)

  num_examples = None
  for feature_stats_proto in six.itervalues(stats_per_feature):
    # Add the merged FeatureNameStatistics proto for the feature
    # into the DatasetFeatureStatistics proto.
    new_feature_stats_proto = result.features.add()
    new_feature_stats_proto.CopyFrom(feature_stats_proto)

    # Get the number of examples from one of the features that
    # has common stats.
    if num_examples is None:
      stats_type = feature_stats_proto.WhichOneof('stats')
      stats_proto = None
      if stats_type == 'num_stats':
        stats_proto = feature_stats_proto.num_stats
      else:
        stats_proto = feature_stats_proto.string_stats

      if stats_proto.HasField('common_stats'):
        num_examples = (stats_proto.common_stats.num_non_missing +
                        stats_proto.common_stats.num_missing)

  # Set the num_examples field.
  if num_examples is not None:
    result.num_examples = num_examples
  return result
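
# A minimal usage sketch for the merge helper above, assuming the module's
# existing imports (text_format, statistics_pb2). The two shard protos are
# invented; the assertion follows from the code: num_examples is recovered
# from common_stats as num_non_missing + num_missing = 2 + 1 = 3.
def _example_merge_usage():
  shard_a = text_format.Parse(
      """
      features {
        path { step: 'f' }
        num_stats {
          common_stats { num_non_missing: 2 num_missing: 1 }
        }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  shard_b = text_format.Parse(
      """
      features {
        path { step: 'f' }
        num_stats { mean: 3.5 }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  merged = _merge_dataset_feature_stats_protos([shard_a, shard_b])
  assert len(merged.features) == 1
  assert merged.num_examples == 3
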
def _get_test_stats_with_mi(feature_paths):
    """Get stats proto for MI test."""
    result = statistics_pb2.DatasetFeatureStatistics()
    for feature_path in feature_paths:
        feature_proto = text_format.Parse(
            """
                custom_stats {
                  name: "max_sklearn_adjusted_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "max_sklearn_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "mean_sklearn_adjusted_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "mean_sklearn_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "median_sklearn_adjusted_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "median_sklearn_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "min_sklearn_adjusted_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "min_sklearn_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "num_partitions_sklearn_adjusted_mutual_information"
                  num: 2.0
                }
                custom_stats {
                  name: "num_partitions_sklearn_mutual_information"
                  num: 2.0
                }
                custom_stats {
                  name: "std_dev_sklearn_adjusted_mutual_information"
                  num: 0.0
                }
                custom_stats {
                  name: "std_dev_sklearn_mutual_information"
                  num: 0.0
                }
        """, statistics_pb2.FeatureNameStatistics())
        feature_proto.path.CopyFrom(feature_path.to_proto())
        result.features.add().CopyFrom(feature_proto)
    return result
def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical, is_weighted_stats,
                                             num_top_values,
                                             num_rank_histogram_buckets):
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    Args:
      feature_name: The feature name.
      top_k_value_count_list: A list of FeatureValueCount tuples.
      is_categorical: Whether the feature is categorical.
      is_weighted_stats: Whether top_k_value_count_list incorporates weights.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Sort the top_k_value_count_list in descending order by count. Where
    # multiple feature values have the same count, consider the feature with the
    # 'larger' feature value to be larger for purposes of breaking the tie.
    top_k_value_count_list.sort(key=lambda counts: (counts[1], counts[0]),
                                reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)

    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats
    else:
        string_stats = result.string_stats

    for i in range(len(top_k_value_count_list)):
        value, count = top_k_value_count_list[i]
        # Check if we have a valid utf-8 string. If not, assign a default invalid
        # string value.
        if isinstance(value, bytes) and not _is_valid_utf8(value):
            logging.warning(
                'Feature "%s" has bytes value "%s" which cannot be '
                'decoded as a UTF-8 string.', feature_name, value)
            value = _INVALID_STRING

        if i < num_top_values:
            freq_and_value = string_stats.top_values.add()
            freq_and_value.value = value
            freq_and_value.frequency = count
        if i < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = i
            bucket.high_rank = i
            bucket.sample_count = count
            bucket.label = value
    return result
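
# Hypothetical usage sketch for the helper above (value counts invented).
# With num_top_values=2, only 'red' and 'blue' land in top_values, and the
# 2-bucket rank histogram records the same two values at ranks 0 and 1.
def _example_topk_usage():
    proto = make_feature_stats_proto_with_topk_stats(
        feature_name='color',
        top_k_value_count_list=[('red', 5), ('blue', 3), ('green', 1)],
        is_categorical=False,
        is_weighted_stats=False,
        num_top_values=2,
        num_rank_histogram_buckets=2)
    assert [v.value for v in proto.string_stats.top_values] == ['red', 'blue']
    assert proto.string_stats.rank_histogram.buckets[1].label == 'blue'
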
 def test_cases_with_no_image_stats(self, batches):
     """Test cases that should not generate image statistics."""
     image_decoder = FakeImageDecoder()
     generator = image_stats_generator.ImageStatsGenerator(
         image_decoder=image_decoder,
         values_threshold=1,
         enable_size_stats=True)
     self.assertCombinerOutputEqual(batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
    def extract_output(
            self, accumulator: _PartialNLStats
    ) -> statistics_pb2.FeatureNameStatistics:
        """Return result of converting accumulator into the output value.

        Args:
          accumulator: The final accumulator value.

        Returns:
          A proto representing the result of this stats generator.
        """
        result = statistics_pb2.FeatureNameStatistics()
        if accumulator.invalidate:
            return result

        nls = statistics_pb2.NaturalLanguageStatistics()
        if accumulator.total_num_tokens:
            nls.feature_coverage = (float(accumulator.num_in_vocab_tokens) /
                                    accumulator.total_num_tokens)
            result.custom_stats.add(name='nl_feature_coverage',
                                    num=nls.feature_coverage)
        if accumulator.num_in_vocab_tokens:
            nls.avg_token_length = (
                float(accumulator.sum_in_vocab_token_lengths) /
                accumulator.num_in_vocab_tokens)
            result.custom_stats.add(name='nl_avg_token_length',
                                    num=nls.avg_token_length)
        if self._num_quantiles_histogram_buckets:
            _populate_token_length_histogram(
                nls, accumulator, self._num_quantiles_histogram_buckets)
            if nls.token_length_histogram.buckets:
                result.custom_stats.add(name='nl_token_length_histogram',
                                        histogram=nls.token_length_histogram)
        if self._num_rank_histogram_buckets:
            _populate_token_rank_histogram(nls, accumulator,
                                           self._num_rank_histogram_buckets)
            if nls.rank_histogram.buckets:
                result.custom_stats.add(name='nl_rank_tokens',
                                        rank_histogram=nls.rank_histogram)
        if accumulator.token_statistics:
            for name, stats in accumulator.token_statistics.items():
                _populate_token_statistics(name, self._num_histogram_buckets,
                                           accumulator.num_examples,
                                           nls.token_statistics.add(), stats,
                                           result)

        for r in (accumulator.reported_sequences_coverage +
                  accumulator.reported_sequences_avg_token_length):
            str_seq = str(r[0])
            nls.reported_sequences.append(str_seq)
        if nls.reported_sequences:
            reported_sequences = '\n'.join(nls.reported_sequences)
            result.custom_stats.add(name='nl_reported_sequences',
                                    str=reported_sequences)
        nl_any = any_pb2.Any()
        # Any.Pack() modifies the Any in place and returns None, so pack
        # first and then attach the populated message.
        nl_any.Pack(nls)
        result.custom_stats.add(name='nl_statistics', any=nl_any)
        return result
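
# A sketch of the consumer side, using the standard protobuf Any.Unpack API.
# `feature_stats` is a hypothetical FeatureNameStatistics produced by
# extract_output above.
def _unpack_nl_statistics(feature_stats):
    for custom_stat in feature_stats.custom_stats:
        if custom_stat.name == 'nl_statistics':
            nls = statistics_pb2.NaturalLanguageStatistics()
            custom_stat.any.Unpack(nls)
            return nls
    return None
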
 def assert_on_unequal_feature_protos(self):
     expected = text_format.Parse(
         """
       name: 'a'
       custom_stats {
         name: 'MI'
         num: 2.5
       }
      """, statistics_pb2.FeatureNameStatistics())
     actual = text_format.Parse(
         """
       name: 'a'
       custom_stats {
         name: 'MI'
         num: 2.0
       }
      """, statistics_pb2.FeatureNameStatistics())
     test_util.assert_feature_proto_equal(self, actual, expected)
 def test_common_stats_generator_with_weight_feature(self):
   # Input with two batches: each batch has two examples, and one example in
   # the second batch is missing feature 'a'.
   batches = [{'a': np.array([np.array([1.0, 2.0]),
                              np.array([3.0, 4.0, 5.0])]),
               'w': np.array([np.array([1.0]), np.array([2.0])])},
              {'a': np.array([np.array([1.0,]), None]),
               'w': np.array([np.array([3.0]), np.array([2.0])])}]
   expected_result = {
       'a': text_format.Parse(
           """
           name: 'a'
           type: FLOAT
           num_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 1
               min_num_values: 1
               max_num_values: 3
               avg_num_values: 2.0
               tot_num_values: 6
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 1.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 3.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 3.0
                   high_value: 3.0
                   sample_count: 0.75
                 }
                 type: QUANTILES
               }
               weighted_common_stats {
                 num_non_missing: 6.0
                 num_missing: 2.0
                 avg_num_values: 1.83333333
                 tot_num_values: 11.0
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics())}
   generator = common_stats_generator.CommonStatsGenerator(
       weight_feature='w',
       num_values_histogram_buckets=4)
   self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_common_stats_generator_categorical_feature(self):
     batches = [{
         'c': np.array([np.array([1, 5, 10]),
                        np.array([0])])
     }, {
         'c': np.array([np.array([1, 1, 1, 5, 15])])
     }]
     expected_result = {
         'c':
         text_format.Parse(
             """
         name: 'c'
         type: INT
         string_stats {
           common_stats {
             num_non_missing: 3
             num_missing: 0
             min_num_values: 1
             max_num_values: 5
             avg_num_values: 3.0
             tot_num_values: 9
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 3.0
                 sample_count: 1.0
               }
               buckets {
                 low_value: 3.0
                 high_value: 5.0
                 sample_count: 1.0
               }
               buckets {
                 low_value: 5.0
                 high_value: 5.0
                 sample_count: 1.0
               }
               type: QUANTILES
             }
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     schema = text_format.Parse(
         """
     feature {
       name: "c"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = common_stats_generator.CommonStatsGenerator(
         schema=schema, num_values_histogram_buckets=3)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats."""
  # Sort (a copy of) the top_k_values_pairs in descending order by count.
  # Where multiple feature values have the same count, consider the feature with
  # the 'larger' feature value to be larger for purposes of breaking the tie.

  top_k_values_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)

  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)

  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats

  for i in range(len(top_k_values_pairs)):
    value, count = top_k_values_pairs[i]
    if count < frequency_threshold:
      break
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, six.binary_type):
      decoded_value = stats_util.maybe_get_utf8(value)
      if decoded_value is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded_value
    elif not isinstance(value, six.text_type):
      value = str(value)

    if i < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if i < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = i
      bucket.high_rank = i
      bucket.sample_count = count
      bucket.label = value
  return result
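
# Hypothetical usage sketch for the path-based top-k helper above.
# FeatureValueCount is assumed to be a (feature_value, count) namedtuple, as
# constructed in the tests in this section; the pairs are invented.
def _example_topk_path_usage():
  pairs = [FeatureValueCount('a', 4), FeatureValueCount('b', 2)]
  proto = _make_feature_stats_proto_topk(
      types.FeaturePath(['fa']),
      pairs,
      is_categorical=False,
      is_weighted_stats=False,
      num_top_values=2,
      frequency_threshold=1,
      num_rank_histogram_buckets=2)
  assert proto.string_stats.top_values[0].value == 'a'
  assert proto.string_stats.top_values[0].frequency == 4
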
    def test_nl_generator_avg_word_heuristic_non_match(self):
        """Tests generator with avg word length heuristic."""
        generator = nlsg.NLDomainInferringStatsGenerator(values_threshold=2)
        input_batches = [
            pa.array([['abc' * 10, 'xxxxxxxxx'], ['xosuhddsofuhg123fdgosh']]),
            pa.array([['Only one valid text?']]),
        ]

        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())
def _make_feature_stats_proto(feature_name, count, is_categorical):
    """Makes a FeatureNameStatistics proto containing the uniques stats."""
    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)
    result.string_stats.unique = count
    return result
 def test_get_custom_stats_numeric(self):
     stats = text_format.Parse(
         """
         name: 'feature'
         custom_stats {
           name: 'abc'
           num: 100.0
         }
     """, statistics_pb2.FeatureNameStatistics())
     self.assertEqual(stats_util.get_custom_stats(stats, 'abc'), 100.0)
 def test_time_stats_generator_non_time_integers(self):
     """Tests that the generator handles integers that are not times."""
     # None of these numbers is a valid time.
     input_batches = [
         pa.array([[1, 2]]),
     ]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
 def assert_on_two_protos_within_valid_error_but_different_name(self):
     expected = text_format.Parse(
         """
       name: 'a'
       custom_stats {
         name: 'MI'
         num: 2.5
       }
          """, statistics_pb2.FeatureNameStatistics())
     actual = text_format.Parse(
         """
       name: 'b'
       custom_stats {
         name: 'MI'
         num: 2.45
       }
          """, statistics_pb2.FeatureNameStatistics())
     test_util.assert_feature_proto_equal_with_error_on_custom_stats(
         self, actual, expected)
 def test_get_custom_stats_string(self):
     stats = text_format.Parse(
         """
         name: 'feature'
         custom_stats {
           name: 'abc'
           str: 'xyz'
         }
     """, statistics_pb2.FeatureNameStatistics())
     self.assertEqual(stats_util.get_custom_stats(stats, 'abc'), 'xyz')
def _make_feature_stats_proto(string_stats, feature_name, is_categorical):
    """Convert the partial string statistics into FeatureNameStatistics proto."""
    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)
    result.string_stats.avg_length = (string_stats.total_bytes_length /
                                      string_stats.total_num_values)
    return result
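
# A worked check tying this helper to the first test in this section: the
# values [1, 5, 10], [0], [1, 1, 1, 5, 15], [-1] have string lengths summing
# to 13 bytes over 10 values, so avg_length = 13 / 10 = 1.3, stored as the
# float32 value 1.29999995232 in the expected proto. The stand-in class below
# is hypothetical; the real partial-stats class lives elsewhere in the module.
class _PartialStringStatsForDemo(object):
    def __init__(self, total_bytes_length, total_num_values):
        self.total_bytes_length = total_bytes_length
        self.total_num_values = total_num_values


def _example_avg_length_usage():
    proto = _make_feature_stats_proto(
        _PartialStringStatsForDemo(13.0, 10), 'c', is_categorical=True)
    assert abs(proto.string_stats.avg_length - 1.3) < 1e-6
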
 def test_get_custom_stats_not_found(self):
   stats = text_format.Parse(
       """
           name: 'feature'
           custom_stats {
             name: 'abc'
             num: 100.0
           }
       """, statistics_pb2.FeatureNameStatistics())
   with self.assertRaisesRegexp(ValueError, 'Custom statistics.*not found'):
     stats_util.get_custom_stats(stats, 'xyz')
 def test_basic_stats_generator_no_value_in_batch(self):
     batches = [
         pa.Table.from_arrays(
             [pa.array([[], [], []], type=pa.list_(pa.int64()))], ['a'])
     ]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: 'a'
         }
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               buckets {
                 sample_count: 0.3
               }
               type: QUANTILES
             }
           }
         }""", statistics_pb2.FeatureNameStatistics())
     }
     generator = basic_stats_generator.BasicStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
def _make_feature_stats_proto(numeric_stats, feature_name, quantiles_combiner,
                              num_histogram_buckets,
                              num_quantiles_histogram_buckets):
    """Convert the partial numeric statistics into FeatureNameStatistics proto."""
    numeric_stats_proto = statistics_pb2.NumericStatistics()

    # Set the stats in the proto only if we have at least one value for the
    # feature.
    if numeric_stats.total_num_values > 0:
        mean = numeric_stats.sum / numeric_stats.total_num_values
        variance = max(
            0,
            (numeric_stats.sum_of_squares / numeric_stats.total_num_values) -
            mean * mean)
        numeric_stats_proto.mean = float(mean)
        numeric_stats_proto.std_dev = math.sqrt(variance)
        numeric_stats_proto.num_zeros = numeric_stats.num_zeros
        numeric_stats_proto.min = float(numeric_stats.min)
        numeric_stats_proto.max = float(numeric_stats.max)

        # Extract the quantiles from the summary.
        quantiles = quantiles_combiner.extract_output(
            numeric_stats.quantiles_summary)

        # Find the median from the quantiles and update the numeric stats proto.
        numeric_stats_proto.median = float(
            quantiles_util.find_median(quantiles))

        # Construct the equi-width histogram from the quantiles and add it to the
        # numeric stats proto.
        std_histogram = quantiles_util.generate_equi_width_histogram(
            quantiles, numeric_stats.min, numeric_stats.max,
            numeric_stats.total_num_values, num_histogram_buckets)
        std_histogram.num_nan = numeric_stats.num_nan
        new_std_histogram = numeric_stats_proto.histograms.add()
        new_std_histogram.CopyFrom(std_histogram)

        # Construct the quantiles histogram from the quantiles and add it to the
        # numeric stats proto.
        q_histogram = quantiles_util.generate_quantiles_histogram(
            quantiles, numeric_stats.min, numeric_stats.max,
            numeric_stats.total_num_values, num_quantiles_histogram_buckets)
        q_histogram.num_nan = numeric_stats.num_nan
        new_q_histogram = numeric_stats_proto.histograms.add()
        new_q_histogram.CopyFrom(q_histogram)

    # Create a new FeatureNameStatistics proto.
    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    result.type = numeric_stats.type
    result.num_stats.CopyFrom(numeric_stats_proto)

    return result
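
# A worked check of the mean/variance arithmetic used above (invented
# values). The helper computes population statistics from running sums:
def _example_mean_variance_check():
    values = [1.0, 2.0, 3.0, 4.0]
    total = sum(values)                          # 10.0
    sum_of_squares = sum(v * v for v in values)  # 30.0
    mean = total / len(values)                   # 2.5
    # E[x^2] - mean^2, clamped at zero to guard against rounding, as above.
    variance = max(0, sum_of_squares / len(values) - mean * mean)  # 1.25
    std_dev = math.sqrt(variance)                # ~1.11803
    assert abs(std_dev - math.sqrt(1.25)) < 1e-9
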
def _make_feature_stats_proto_uniques(
        feature_path: types.FeaturePath, num_uniques: int,
        is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing the uniques stats."""
    result = statistics_pb2.FeatureNameStatistics()
    result.path.CopyFrom(feature_path.to_proto())
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)
    result.string_stats.unique = num_uniques
    return result
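
# Hypothetical usage sketch for the uniques helper above: a categorical INT
# feature keeps its INT type while the unique count lands in string_stats.
def _example_uniques_usage():
    proto = _make_feature_stats_proto_uniques(
        types.FeaturePath(['fa']), num_uniques=4, is_categorical=True)
    assert proto.type == statistics_pb2.FeatureNameStatistics.INT
    assert proto.string_stats.unique == 4
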
 def test_time_stats_generator_no_valid_formats(self):
     """Tests that the generator handles batches that contain no valid values."""
     # None of these values is a valid format.
     input_batches = [
         pa.array([['', '2018-Nov-30', '20183011']]),
         pa.array([['all/invalid', '2018-11-30invalid']]),
         pa.array([['invalid2018-11-30', 'invalid\n2018-11-30']])
     ]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
    def test_make_feature_stats_proto_topk_uniques_unordered(self):
        expected_result = text_format.Parse(
            """
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          unique: 4
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
          }
    }""", statistics_pb2.FeatureNameStatistics())

        value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
        top_k_value_count_list = [
            top_k_uniques_stats_util.FeatureValueCount(value_count[0],
                                                       value_count[1])
            for value_count in value_counts
        ]
        result = (
            top_k_uniques_stats_util.make_feature_stats_proto_topk_uniques(
                types.FeaturePath(['fa']),
                is_categorical=True,
                num_top_values=3,
                frequency_threshold=1,
                num_rank_histogram_buckets=2,
                num_unique=4,
                value_count_list=top_k_value_count_list))
        test_util.assert_feature_proto_equal(self, result, expected_result)
 def test_nl_generator_invalidation_check(self):
     """Tests generator invalidation with fake heuristic."""
     # Absent invalidation, the 'MATCH' values would give 6 matches.
     input_batches = [
         pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
         pa.array([['MATCH', 'MATCH']]),
         # Incorrect type invalidates accumulator.
         pa.array([[42]]),
     ]
     # No domain_info is generated because the incorrectly typed value 42
     # invalidates the stats.
     generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
 def test_nl_generator_utf8_check(self):
     """Tests generator utf8 check with fake heuristic."""
     # Absent invalidation, the 'MATCH' values would give 6 matches.
     input_batches = [
         pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
         pa.array([['MATCH', 'MATCH']]),
         # Non utf-8 string invalidates accumulator.
         pa.array([[b'\xF0']]),
     ]
     # Even with values_threshold=1, no stats should be generated because the
     # non utf-8 value invalidates the accumulator.
     generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
 def test_basic_stats_generator_only_nan(self):
     b1 = pa.Table.from_arrays(
         [pa.array([[np.nan]], type=pa.list_(pa.float32()))], ['a'])
     batches = [b1]
     expected_result = {
         types.FeaturePath(['a']):
         text_format.Parse(
             """
         path {
           step: 'a'
         }
         type: FLOAT
         num_stats {
           common_stats {
             num_non_missing: 1
             min_num_values: 1
             max_num_values: 1
             avg_num_values: 1.0
             tot_num_values: 1
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.5
               }
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 0.5
               }
               type: QUANTILES
             }
           }
           histograms {
             num_nan: 1
             type: STANDARD
           }
           histograms {
             num_nan: 1
             type: QUANTILES
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = basic_stats_generator.BasicStatsGenerator(
         num_values_histogram_buckets=2,
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_make_feature_stats_proto_with_topk_stats_weighted(self):
   expected_result = text_format.Parse(
       """
       path {
         step: 'fa'
       }
       type: STRING
       string_stats {
         weighted_string_stats {
           top_values {
             value: 'a'
             frequency: 4
           }
           top_values {
             value: 'c'
             frequency: 3
           }
           top_values {
             value: 'd'
             frequency: 2
           }
           rank_histogram {
             buckets {
               low_rank: 0
               high_rank: 0
               label: "a"
               sample_count: 4.0
             }
             buckets {
               low_rank: 1
               high_rank: 1
               label: "c"
               sample_count: 3.0
             }
           }
         }
   }""", statistics_pb2.FeatureNameStatistics())
   value_counts = [('a', 4), ('c', 3), ('d', 2), ('b', 2)]
   top_k_value_count_list = [
       top_k_uniques_stats_generator.FeatureValueCount(
           value_count[0], value_count[1])
       for value_count in value_counts
   ]
   result = (
       top_k_uniques_stats_generator
       .make_feature_stats_proto_with_topk_stats(
           types.FeaturePath(['fa']),
           top_k_value_count_list, False, True, 3, 1, 2))
   compare.assertProtoEqual(self, result, expected_result)
    def test_nl_generator_values_threshold_check(self):
        """Tests generator values threshold with fake heuristic."""
        # Expected to give 6 matches.
        input_batches = [
            pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
            pa.array([['MATCH', 'MATCH']]),
            # Nones should be ignored.
            pa.array([None, None]),
        ]
        # Try generators with values_threshold=7 (which should not create
        # stats) and values_threshold=6 (which should).
        generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=7)
        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())

        generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=6)
        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info', str='natural_language_domain {}'),
                statistics_pb2.CustomStatistic(
                    name='natural_language_match_rate', num=1.0)
            ]))
    def extract_output(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats]
    ) -> statistics_pb2.DatasetFeatureStatistics:
        result = statistics_pb2.DatasetFeatureStatistics()
        for feature_path, partial_stats in accumulator.items():
            feature_result = statistics_pb2.FeatureNameStatistics()
            feature_result.path.CopyFrom(feature_path.to_proto())
            feature_result.custom_stats.add(
                name='missing_value',
                num=partial_stats.value_feature_num_missing)
            index_features_num_missing_histogram = (
                statistics_pb2.RankHistogram())
            max_length_diff_histogram = statistics_pb2.RankHistogram()
            min_length_diff_histogram = statistics_pb2.RankHistogram()
            # Sort to get deterministic ordering of the buckets in the custom stat.
            for index_feature in sorted(
                    partial_stats.index_features_num_missing):
                # The label is the last step in the feature path (and shares the parent
                # with the sparse feature).
                label = index_feature.steps()[-1]
                missing_bucket = (
                    index_features_num_missing_histogram.buckets.add())
                missing_bucket.label = label
                missing_bucket.sample_count = (
                    partial_stats.index_features_num_missing[index_feature])
                max_length_bucket = max_length_diff_histogram.buckets.add()
                max_length_bucket.label = label
                max_length_bucket.sample_count = (
                    partial_stats.index_features_max_length_diff[index_feature]
                )
                min_length_bucket = min_length_diff_histogram.buckets.add()
                min_length_bucket.label = label
                min_length_bucket.sample_count = (
                    partial_stats.index_features_min_length_diff[index_feature]
                )
            feature_result.custom_stats.add(
                name='missing_index',
                rank_histogram=index_features_num_missing_histogram)
            feature_result.custom_stats.add(
                name='max_length_diff',
                rank_histogram=max_length_diff_histogram)
            feature_result.custom_stats.add(
                name='min_length_diff',
                rank_histogram=min_length_diff_histogram)

            new_feature_stats_proto = result.features.add()
            new_feature_stats_proto.CopyFrom(feature_result)
        return result
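
# For each sparse feature, the proto emitted above carries four custom stats
# (names taken from the code; bucket contents are invented for illustration):
#
#   custom_stats { name: 'missing_value' num: 1 }
#   custom_stats {
#     name: 'missing_index'
#     rank_histogram { buckets { label: 'index_feature' sample_count: 2.0 } }
#   }
#   custom_stats { name: 'max_length_diff' rank_histogram { ... } }
#   custom_stats { name: 'min_length_diff' rank_histogram { ... } }
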
 def test_time_stats_generator_inconsistent_type_invalidation_check(self):
     """Tests that generator invalidates stats if inconsistent types are used."""
     # Absent invalidation, this is expected to give 6 matches.
     input_batches = [
         pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                   ['2018-11-30']]),
         pa.array([['2018-11-30', '2018-11-30']]),
         pa.array([[1.0]]),
     ]
     # No domain_info should be generated as the incorrect type of the 1.0 value
     # should invalidate the stats. Absent this type issue, these examples would
     # satisfy the specified match_ratio and values_threshold.
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.5,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())