def test_common_stats_generator_invalid_weight_feature(self):
    """Expects a ValueError when the weight feature is absent from the input."""
    stats_gen = common_stats_generator.CommonStatsGenerator(
        weight_feature='w')
    # The batch only contains feature 'a'; the weight feature 'w' is missing.
    input_batches = [{'a': np.array([np.array([1])])}]
    # NOTE(review): assertRaisesRegexp is a deprecated unittest alias (removed
    # in Python 3.12); switch to assertRaisesRegex once Python 2 is not needed.
    expected_error = 'Weight feature.*not present.*'
    with self.assertRaisesRegexp(ValueError, expected_error):
        self.assertCombinerOutputEqual(input_batches, stats_gen, None)
 def test_common_stats_generator_with_weight_feature(self):
   """Checks plain and weighted common stats when a weight feature is set.

   The expected proto carries both the unweighted common stats of feature 'a'
   and a weighted_common_stats section derived from the per-example weights
   in feature 'w'.
   """
   # Input with two batches of two examples each; the last example of the
   # second batch has no value list for feature 'a'.
   # The value lists of 'a' differ in length (or are None), so the outer array
   # is created with an explicit object dtype: NumPy >= 1.24 refuses to build
   # ragged arrays implicitly, while older releases produced the same object
   # array without being asked.
   batches = [{'a': np.array([np.array([1.0, 2.0]),
                              np.array([3.0, 4.0, 5.0])], dtype=object),
               'w': np.array([np.array([1.0]), np.array([2.0])])},
              {'a': np.array([np.array([1.0,]), None], dtype=object),
               'w': np.array([np.array([3.0]), np.array([2.0])])}]
   expected_result = {
       'a': text_format.Parse(
           """
           name: 'a'
           type: FLOAT
           num_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 1
               min_num_values: 1
               max_num_values: 3
               avg_num_values: 2.0
               tot_num_values: 6
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 1.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 3.0
                   sample_count: 0.75
                 }
                 buckets {
                   low_value: 3.0
                   high_value: 3.0
                   sample_count: 0.75
                 }
                 type: QUANTILES
               }
               weighted_common_stats {
                 num_non_missing: 6.0
                 num_missing: 2.0
                 avg_num_values: 1.83333333
                 tot_num_values: 11.0
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics())}
   generator = common_stats_generator.CommonStatsGenerator(
       weight_feature='w',
       num_values_histogram_buckets=4)
   self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_common_stats_generator_categorical_feature(self):
     batches = [{
         'c': np.array([np.array([1, 5, 10]),
                        np.array([0])])
     }, {
         'c': np.array([np.array([1, 1, 1, 5, 15])])
     }]
     expected_result = {
         'c':
         text_format.Parse(
             """
         name: 'c'
         type: INT
         string_stats {
           common_stats {
             num_non_missing: 3
             num_missing: 0
             min_num_values: 1
             max_num_values: 5
             avg_num_values: 3.0
             tot_num_values: 9
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 3.0
                 sample_count: 1.0
               }
               buckets {
                 low_value: 3.0
                 high_value: 5.0
                 sample_count: 1.0
               }
               buckets {
                 low_value: 5.0
                 high_value: 5.0
                 sample_count: 1.0
               }
               type: QUANTILES
             }
           }
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     schema = text_format.Parse(
         """
     feature {
       name: "c"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = common_stats_generator.CommonStatsGenerator(
         schema=schema, num_values_histogram_buckets=3)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_common_stats_generator_weight_feature_multiple_values(self):
     batches = [{
         'a': np.array([np.array([1])]),
         'w': np.array([np.array([2, 3])])
     }]
     generator = common_stats_generator.CommonStatsGenerator(
         weight_feature='w')
     with self.assertRaisesRegexp(ValueError,
                                  'Weight feature.*single value.*'):
         self.assertCombinerOutputEqual(batches, generator, None)
 def test_common_stats_generator_weight_feature_string_type(self):
     batches = [{
         'a': np.array([np.array([1])]),
         'w': np.array([np.array(['a'])])
     }]
     generator = common_stats_generator.CommonStatsGenerator(
         weight_feature='w')
     with self.assertRaisesRegexp(ValueError,
                                  'Weight feature.*numeric type.*'):
         self.assertCombinerOutputEqual(batches, generator, None)
Ejemplo n.º 6
0
    def expand(self, dataset):
        """Profiles, batches and optionally filters `dataset`, then runs stats.

        Args:
          dataset: Input collection; it is piped through
            `profile_util.Profile` and `batch_util.BatchExamples` before the
            generators are applied.

        Returns:
          The result of applying `stats_impl.GenerateStatisticsImpl`, set up
          with the default generators followed by any custom generators taken
          from the options.
        """
        opts = self._options

        # Default generators, in order: common, numeric, string, top-k and
        # uniques.
        generators = [
            common_stats_generator.CommonStatsGenerator(
                schema=opts.schema,
                num_values_histogram_buckets=(
                    opts.num_values_histogram_buckets),
                epsilon=opts.epsilon),
            numeric_stats_generator.NumericStatsGenerator(
                schema=opts.schema,
                num_histogram_buckets=opts.num_histogram_buckets,
                num_quantiles_histogram_buckets=(
                    opts.num_quantiles_histogram_buckets),
                epsilon=opts.epsilon),
            string_stats_generator.StringStatsGenerator(schema=opts.schema),
            top_k_stats_generator.TopKStatsGenerator(
                schema=opts.schema,
                num_top_values=opts.num_top_values,
                num_rank_histogram_buckets=opts.num_rank_histogram_buckets),
            uniques_stats_generator.UniquesStatsGenerator(schema=opts.schema),
        ]
        # Custom generators, when supplied, are appended after the defaults.
        if opts.generators is not None:
            generators.extend(opts.generators)

        # Profile and then batch the input examples.
        prepared = (dataset
                    | 'Profile' >> profile_util.Profile()
                    | 'BatchInputs' >> batch_util.BatchExamples())

        # When a feature whitelist is configured, keep only those features.
        whitelist = opts.feature_whitelist
        if whitelist:
            prepared = (
                prepared | 'RemoveNonWhitelistedFeatures' >> beam.Map(
                    _filter_features, feature_whitelist=whitelist))

        return (prepared | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(generators))
Ejemplo n.º 7
0
def _get_default_generators(options, in_memory=False):
  """Builds the default list of stats generators for `options`.

  The common, numeric and string generators are always included. The top-k /
  uniques part depends on `in_memory`: a single combined combiner generator
  when True, separate top-k and uniques generators otherwise.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
  ]

  # Both top-k flavours are configured with exactly the same keyword set.
  top_k_kwargs = dict(
      schema=options.schema,
      weight_feature=options.weight_feature,
      num_top_values=options.num_top_values,
      num_rank_histogram_buckets=options.num_rank_histogram_buckets)

  if in_memory:
    generators.append(
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(**top_k_kwargs))
    return generators

  generators.append(top_k_stats_generator.TopKStatsGenerator(**top_k_kwargs))
  generators.append(
      uniques_stats_generator.UniquesStatsGenerator(schema=options.schema))
  return generators
 def test_common_stats_generator_empty_batch(self):
   """An empty feature array yields common stats whose counts are all zero."""
   stats_gen = common_stats_generator.CommonStatsGenerator()
   # Feature 'a' is present in the batch but holds no examples.
   input_batches = [{'a': np.array([])}]
   # The expected proto reports the feature as STRING with zeroed counts.
   expected_stats = text_format.Parse(
           """
           name: 'a'
           type: STRING
           string_stats {
             common_stats {
               num_non_missing: 0
               num_missing: 0
               tot_num_values: 0
             }
           }
           """, statistics_pb2.FeatureNameStatistics())
   self.assertCombinerOutputEqual(
       input_batches, stats_gen, {'a': expected_stats})
Ejemplo n.º 9
0
    def expand(self, dataset):
        """Profiles, optionally samples, batches and filters, then runs stats.

        Args:
          dataset: Input collection of examples.

        Returns:
          The result of applying `stats_impl.GenerateStatisticsImpl`, set up
          with the default generators followed by any custom generators taken
          from the options.
        """
        opts = self._options

        # Default generators, in order: common, numeric, string, top-k and
        # uniques.
        generators = [
            common_stats_generator.CommonStatsGenerator(
                schema=opts.schema,
                num_values_histogram_buckets=(
                    opts.num_values_histogram_buckets),
                epsilon=opts.epsilon),
            numeric_stats_generator.NumericStatsGenerator(
                schema=opts.schema,
                num_histogram_buckets=opts.num_histogram_buckets,
                num_quantiles_histogram_buckets=(
                    opts.num_quantiles_histogram_buckets),
                epsilon=opts.epsilon),
            string_stats_generator.StringStatsGenerator(schema=opts.schema),
            top_k_stats_generator.TopKStatsGenerator(
                schema=opts.schema,
                num_top_values=opts.num_top_values,
                num_rank_histogram_buckets=opts.num_rank_histogram_buckets),
            uniques_stats_generator.UniquesStatsGenerator(schema=opts.schema),
        ]
        # Custom generators, when supplied, are appended after the defaults.
        if opts.generators is not None:
            generators.extend(opts.generators)

        # Profile the input examples.
        dataset |= 'ProfileExamples' >> profile_util.Profile()

        # Sampling: a fixed sample count takes precedence over a sample rate.
        sample_count = opts.sample_count
        if sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[types.Example]], which is then flattened back
            # into a PCollection[types.Example].
            dataset |= (
                'SampleExamples(%s)' % sample_count >>
                beam.combiners.Sample.FixedSizeGlobally(sample_count)
                | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif opts.sample_rate is not None:
            dataset |= (
                'SampleExamplesAtRate(%s)' % opts.sample_rate >>
                beam.FlatMap(_sample_at_rate, sample_rate=opts.sample_rate))

        # Batch the input examples. The sample count (None when sampling by
        # count is disabled) is passed on as the desired batch size.
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=sample_count))

        # When a feature whitelist is configured, keep only those features.
        whitelist = opts.feature_whitelist
        if whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features, feature_whitelist=whitelist))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(generators))
 def test_common_stats_generator_invalid_value_numpy_dtype(self):
   """Expects a TypeError for a feature whose values are complex numbers."""
   stats_gen = common_stats_generator.CommonStatsGenerator()
   # The only value of feature 'a' is the complex number 1+2j.
   input_batches = [{'a': np.array([np.array([1+2j])])}]
   with self.assertRaises(TypeError):
     self.assertCombinerOutputEqual(input_batches, stats_gen, None)
 def test_common_stats_generator_empty_list(self):
   """With no input batches at all, the expected combiner output is empty."""
   no_batches = []
   no_stats = {}
   self.assertCombinerOutputEqual(
       no_batches, common_stats_generator.CommonStatsGenerator(), no_stats)
 def test_common_stats_generator_with_multiple_features(self):
   """Checks common stats for float, string and int features side by side."""
   # Input with two batches: first batch has two examples and second batch
   # has a single example.
   # In the first batch every feature has value lists of different lengths,
   # so the outer arrays are created with an explicit object dtype:
   # NumPy >= 1.24 refuses to build ragged arrays implicitly, while older
   # releases produced the same object arrays without being asked.
   batches = [{'a': np.array([np.array([1.0, 2.0]),
                              np.array([3.0, 4.0, 5.0])], dtype=object),
               'b': np.array([np.array(['x', 'y', 'z', 'w']),
                              np.array(['qwe', 'abc'])], dtype=object),
               'c': np.array([np.array([1, 5, 10]), np.array([0])],
                             dtype=object)},
              {'a': np.array([np.array([1.0])]),
               'b': np.array([np.array(['ab'])]),
               'c': np.array([np.array([1, 1, 1, 5, 15])])}]
   expected_result = {
       'a': text_format.Parse(
           """
           name: 'a'
           type: FLOAT
           num_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 0
               min_num_values: 1
               max_num_values: 3
               avg_num_values: 2.0
               tot_num_values: 6
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 3.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 3.0
                   high_value: 3.0
                   sample_count: 1.0
                 }
                 type: QUANTILES
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics()),
       'b': text_format.Parse(
           """
           name: 'b'
           type: STRING
           string_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 0
               min_num_values: 1
               max_num_values: 4
               avg_num_values: 2.33333333
               tot_num_values: 7
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 4.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 4.0
                   high_value: 4.0
                   sample_count: 1.0
                 }
                 type: QUANTILES
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics()),
       'c': text_format.Parse(
           """
           name: 'c'
           type: INT
           num_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 0
               min_num_values: 1
               max_num_values: 5
               avg_num_values: 3.0
               tot_num_values: 9
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 3.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 3.0
                   high_value: 5.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 5.0
                   high_value: 5.0
                   sample_count: 1.0
                 }
                 type: QUANTILES
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics())}
   generator = common_stats_generator.CommonStatsGenerator(
       num_values_histogram_buckets=3, epsilon=0.001)
   self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_common_stats_generator_with_entire_feature_value_list_missing(self):
   """Checks that examples with a None value list are counted as missing."""
   # Input with two batches: first batch has three examples and second batch
   # has two examples.
   # The builtin `object` is used as dtype: `np.object` was only an alias for
   # it and has been removed from NumPy (1.24), so the builtin works on every
   # release.
   batches = [{'a': np.array([np.array([1.0, 2.0]), None,
                              np.array([3.0, 4.0, 5.0])], dtype=object),
               'b': np.array([np.array(['x', 'y', 'z', 'w']), None,
                              np.array(['qwe', 'abc'])], dtype=object)},
              {'a': np.array([np.array([1.0]), None], dtype=object),
               'b': np.array([None, np.array(['qwe'])], dtype=object)}]
   expected_result = {
       'a': text_format.Parse(
           """
           name: 'a'
           type: FLOAT
           num_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 2
               min_num_values: 1
               max_num_values: 3
               avg_num_values: 2.0
               tot_num_values: 6
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 3.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 3.0
                   high_value: 3.0
                   sample_count: 1.0
                 }
                 type: QUANTILES
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics()),
       'b': text_format.Parse(
           """
           name: 'b'
           type: STRING
           string_stats {
             common_stats {
               num_non_missing: 3
               num_missing: 2
               min_num_values: 1
               max_num_values: 4
               avg_num_values: 2.33333333
               tot_num_values: 7
               num_values_histogram {
                 buckets {
                   low_value: 1.0
                   high_value: 2.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 2.0
                   high_value: 4.0
                   sample_count: 1.0
                 }
                 buckets {
                   low_value: 4.0
                   high_value: 4.0
                   sample_count: 1.0
                 }
                 type: QUANTILES
               }
             }
           }
           """, statistics_pb2.FeatureNameStatistics())}
   generator = common_stats_generator.CommonStatsGenerator(
       num_values_histogram_buckets=3)
   self.assertCombinerOutputEqual(batches, generator, expected_result)
Ejemplo n.º 14
0
  def test_tfdv_telemetry(self):
    """Checks the Beam counters emitted while combining common stats.

    Three batches are run through a CommonStatsGenerator wrapped in
    `stats_impl._CombineFnWrapper`; afterwards the counters registered under
    `constants.METRICS_NAMESPACE` are compared with the expected values.
    """
    # NumPy compatibility notes for the fixtures below:
    #  * `object` replaces the removed `np.object` alias (same type).
    #  * `np.nan` replaces the `np.NaN` alias removed in NumPy 2.0.
    #  * `np.float64` / `np.int_` replace the abstract `np.floating` /
    #    `np.integer` classes, whose use as a dtype is deprecated; these are
    #    the concrete dtypes the abstract classes used to resolve to.
    #  * Outer arrays whose value lists differ in length get an explicit
    #    object dtype, which NumPy >= 1.24 requires for ragged input.
    batches = [
        {
            'a': np.array([
                np.array([1.0, 2.0], dtype=np.float64),
                np.array([3.0, 4.0, np.nan, 5.0], dtype=np.float64)],
                          dtype=object),
            'b': np.array([
                np.array(['a', 'b', 'c', 'e'], dtype=object),
                np.array(['d', 'e', 'f'], dtype=object)],
                          dtype=object),
            'c': np.array([None, None])
        },
        {
            'a': np.array([None]),
            'b': np.array([np.array(['a', 'b', 'c'], dtype=object)]),
            'c': np.array([np.array([10, 20, 30], dtype=np.int_)])
        },
        {
            'a': np.array([np.array([5.0], dtype=np.float64)]),
            'b': np.array([np.array(['d', 'e', 'f'], dtype=object)]),
            'c': np.array([np.array([1], dtype=np.int_)])
        }
    ]

    p = beam.Pipeline()
    _ = (p
         | 'CreateBatches' >> beam.Create(batches)
         | 'CommonStatsCombiner' >> beam.CombineGlobally(
             stats_impl._CombineFnWrapper(
                 common_stats_generator.CommonStatsGenerator())))

    # Run the pipeline to completion before reading its metrics.
    result = p.run()
    result.wait_until_finish()
    result_metrics = result.metrics()

    expected_result = {
        'num_instances': 4,
        'num_missing_feature_values': 3,
        'num_int_feature_values': 2,
        'int_feature_values_min_count': 1,
        'int_feature_values_max_count': 3,
        'int_feature_values_mean_count': 2,
        'num_float_feature_values': 3,
        'float_feature_values_min_count': 1,
        'float_feature_values_max_count': 4,
        'float_feature_values_mean_count': 2,
        'num_string_feature_values': 4,
        'string_feature_values_min_count': 3,
        'string_feature_values_max_count': 4,
        'string_feature_values_mean_count': 3,
    }
    # Check number of counters: exactly one per expected entry (14) within
    # the TFDV metrics namespace.
    actual_metrics = result_metrics.query(
        beam.metrics.metric.MetricsFilter().with_namespace(
            constants.METRICS_NAMESPACE))['counters']
    self.assertLen(actual_metrics, len(expected_result))

    # Check each counter.
    for counter_name, expected_value in expected_result.items():
      actual_counter = result_metrics.query(
          beam.metrics.metric.MetricsFilter().with_name(counter_name)
          )['counters']
      self.assertLen(actual_counter, 1)
      self.assertEqual(actual_counter[0].committed, expected_value)
Ejemplo n.º 15
0
    def expand(self, dataset):
        """Batches `dataset`, runs every stats generator and merges the output.

        Args:
          dataset: Input collection of examples.

        Returns:
          The collection produced by flattening the per-generator outputs,
          merging them with `_merge_dataset_feature_stats_protos` and mapping
          the result through `_make_dataset_feature_statistics_list_proto`.

        Raises:
          TypeError: If a generator extends neither CombinerStatsGenerator nor
            TransformStatsGenerator.
        """
        opts = self._options

        # Default generators, in order: common, numeric, string, top-k and
        # uniques.
        generators = [
            common_stats_generator.CommonStatsGenerator(
                schema=opts.schema,
                weight_feature=opts.weight_feature,
                num_values_histogram_buckets=(
                    opts.num_values_histogram_buckets),
                epsilon=opts.epsilon),
            numeric_stats_generator.NumericStatsGenerator(
                schema=opts.schema,
                weight_feature=opts.weight_feature,
                num_histogram_buckets=opts.num_histogram_buckets,
                num_quantiles_histogram_buckets=(
                    opts.num_quantiles_histogram_buckets),
                epsilon=opts.epsilon),
            string_stats_generator.StringStatsGenerator(schema=opts.schema),
            top_k_stats_generator.TopKStatsGenerator(
                schema=opts.schema,
                weight_feature=opts.weight_feature,
                num_top_values=opts.num_top_values,
                num_rank_histogram_buckets=opts.num_rank_histogram_buckets),
            uniques_stats_generator.UniquesStatsGenerator(schema=opts.schema),
        ]
        # Custom generators, when supplied, are appended after the defaults.
        if opts.generators is not None:
            generators.extend(opts.generators)

        # Batch the input examples. The sample count (possibly None) is
        # passed on as the desired batch size.
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=opts.sample_count))

        # When a feature whitelist is configured, keep only those features.
        whitelist = opts.feature_whitelist
        if whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features, feature_whitelist=whitelist))

        # Pick the Beam transform matching each generator's kind:
        #   a) a CombinerStatsGenerator is wrapped as a beam.CombineFn;
        #   b) a TransformStatsGenerator supplies its own beam.PTransform.
        # Any other kind of object is rejected.
        result_protos = []
        for generator in generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                transform = beam.CombineGlobally(_CombineFnWrapper(generator))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                transform = generator.ptransform
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)
            result_protos.append(dataset | generator.name >> transform)

        # Each stats generator outputs a PCollection of
        # DatasetFeatureStatistics protos. Flatten the list of PCollections
        # into a single PCollection, then merge its protos into a single
        # DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
Ejemplo n.º 16
0
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics over an in-memory list of examples.

    All examples are merged into one batch, every generator processes that
    batch once, and the per-generator outputs are merged into the result.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      TypeError: If one of `options.generators` does not extend
        CombinerStatsGenerator.
    """
    # Default in-memory generators: common, numeric, string and the combined
    # top-k / uniques combiner.
    stats_generators = [
        common_stats_generator.CommonStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_values_histogram_buckets=(
                options.num_values_histogram_buckets),
            epsilon=options.epsilon),
        numeric_stats_generator.NumericStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_histogram_buckets=options.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                options.num_quantiles_histogram_buckets),
            epsilon=options.epsilon),
        string_stats_generator.StringStatsGenerator(schema=options.schema),
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets),
    ]

    # Custom generators are accepted only when they are combiner generators.
    if options.generators is not None:
        for generator in options.generators:
            if not isinstance(generator,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of type '
                    '%s.' % generator.__class__.__name__)
            stats_generators.append(generator)

    merged_batch = batch_util.merge_single_batch(examples)

    # If whitelist features are provided, keep only those features.
    whitelist = options.feature_whitelist
    if whitelist:
        merged_batch = {name: merged_batch[name] for name in whitelist}

    # Run each generator on the single batch: create an accumulator, feed it
    # the batch, then extract that generator's output.
    outputs = []
    for generator in stats_generators:
        accumulator = generator.add_input(
            generator.create_accumulator(), merged_batch)
        outputs.append(generator.extract_output(accumulator))

    merged_stats = _merge_dataset_feature_stats_protos(outputs)
    return _make_dataset_feature_statistics_list_proto(merged_stats)