Example #1
 def test_string_stats_generator_with_multiple_features(self):
     # input with two batches: first batch has two examples and second batch
     # has a single example.
     batches = [{
         'a':
         np.array([np.array(['xyz']), np.array(['qwe'])]),
         'b':
         np.array([np.array(['hello', 'world']),
                   np.array(['foo', 'bar'])])
     }, {
         'a': np.array([np.array(['ab'])]),
         'b': np.array([np.array(['zzz', 'aaa', 'ddd'])])
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: STRING
         string_stats {
           avg_length: 2.66666666
         }
         """, statistics_pb2.FeatureNameStatistics()),
         'b':
         text_format.Parse(
             """
         name: 'b'
         type: STRING
         string_stats {
           avg_length: 3.57142857
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = string_stats_generator.StringStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
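The expected avg_length values above follow directly from averaging the string lengths over all values in both batches. A quick illustrative check (plain Python, not part of the test):

# Illustrative only: reproduce the expected avg_length values by hand.
a_values = ['xyz', 'qwe', 'ab']                                   # all values of 'a' across both batches
b_values = ['hello', 'world', 'foo', 'bar', 'zzz', 'aaa', 'ddd']  # all values of 'b' across both batches
print(sum(len(v) for v in a_values) / len(a_values))  # (3 + 3 + 2) / 3 = 2.6666...
print(sum(len(v) for v in b_values) / len(b_values))  # 25 / 7 = 3.5714...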
Example #2
 def test_string_stats_generator_categorical_feature(self):
     # input with two batches: first batch has two examples and second batch
     # has a single example.
     batches = [{
         'a': np.array([np.array([123]), np.array([45])])
     }, {
         'a': np.array([np.array([456])])
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: INT
         string_stats {
           avg_length: 2.66666666
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     schema = text_format.Parse(
         """
     feature {
       name: "a"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     generator = string_stats_generator.StringStatsGenerator(schema=schema)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
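For a categorical INT feature, the string stats are computed over the string representations of the integer values, which is why the expected avg_length above is the mean number of digits. Illustrative check:

# Illustrative only: avg_length for the categorical INT feature counts digits.
values = [123, 45, 456]
print(sum(len(str(v)) for v in values) / len(values))  # (3 + 2 + 3) / 3 = 2.6666...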
Example #3
 def test_string_stats_generator_with_one_numeric_feature(self):
     # input with two batches: first batch has two examples and second batch
     # has a single example.
     batches = [{
         'a':
         np.array([np.array(['xyz']), np.array(['qwe'])]),
         'b':
         np.array([np.array([1.0, 2.0, 3.0]),
                   np.array([4.0, 5.0])])
     }, {
         'a': np.array([np.array(['ab'])]),
         'b': np.array([np.array([5.0, 6.0])])
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: STRING
         string_stats {
           avg_length: 2.66666666
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = string_stats_generator.StringStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #4
  def test_generate_stats_impl(self):
    # input with two batches: first batch has two examples and second batch
    # has a single example.
    batches = [{'a': np.array([np.array(['xyz']), np.array(['qwe'])])},
               {'a': np.array([np.array(['ab'])])}]

    generator1 = string_stats_generator.StringStatsGenerator()
    generator2 = uniques_stats_generator.UniquesStatsGenerator()

    expected_result = text_format.Parse(
        """
        datasets {
          features {
            name: 'a'
            type: STRING
            string_stats {
              avg_length: 2.66666666
              unique: 3
            }

          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

    with beam.Pipeline() as p:
      result = (p | beam.Create(batches) |
                stats_impl.GenerateStatisticsImpl(
                    generators=[generator1, generator2]))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
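The same pipeline shape can also be used outside a test to materialize the merged proto. A minimal sketch, assuming a text dump is acceptable; the output path is a placeholder:

# Minimal sketch (not from the test above): run both generators and write the
# merged DatasetFeatureStatisticsList proto as text. '/tmp/stats' is a
# placeholder output prefix.
with beam.Pipeline() as p:
    _ = (p
         | 'CreateBatches' >> beam.Create(batches)
         | 'GenerateStats' >> stats_impl.GenerateStatisticsImpl(
             generators=[generator1, generator2])
         | 'ToText' >> beam.Map(text_format.MessageToString)
         | 'WriteStats' >> beam.io.WriteToText('/tmp/stats'))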
Example #5
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile and then batch input examples.
        batched_dataset = (dataset
                           | 'Profile' >> profile_util.Profile()
                           | 'BatchInputs' >> batch_util.BatchExamples())

        # If a set of whitelist features are provided, keep only those features.
        filtered_dataset = batched_dataset
        if self._options.feature_whitelist:
            filtered_dataset = (
                batched_dataset | 'RemoveNonWhitelistedFeatures' >> beam.Map(
                    _filter_features,
                    feature_whitelist=self._options.feature_whitelist))

        return (filtered_dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
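_filter_features is referenced but not shown in this snippet. A plausible sketch of such a filter, mirroring the whitelist dict comprehension used by generate_statistics_in_memory further below (the real implementation may differ):

# Hypothetical sketch of the filter behind the 'RemoveNonWhitelistedFeatures'
# step: keep only whitelisted features that are present in the batch.
def _filter_features_sketch(batch, feature_whitelist):
    return {
        feature_name: values
        for feature_name, values in batch.items()
        if feature_name in feature_whitelist
    }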
Example #6
def _get_default_generators(options, in_memory=False):
  """Initialize default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=\
            options.num_quantiles_histogram_buckets,
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(
          schema=options.schema)
  ]
  if in_memory:
    stats_generators.append(
        top_k_uniques_combiner_stats_generator.
        TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
  else:
    stats_generators.extend([
        top_k_stats_generator.TopKStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets),
        uniques_stats_generator.UniquesStatsGenerator(schema=options.schema)
    ])
  return stats_generators
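A usage sketch for the helper above, assuming stats_options.StatsOptions() provides usable defaults (as it does in generate_statistics_in_memory further below):

# Illustrative only: pick the generator set that matches the execution mode.
options = stats_options.StatsOptions()
beam_generators = _get_default_generators(options)  # separate TopK and Uniques generators
in_memory_generators = _get_default_generators(options, in_memory=True)  # single combined TopK/Uniques combiner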
Example #7
 def test_string_stats_generator_with_missing_values(self):
     # input with two batches: first batch has three examples and second batch
     # has two examples.
     batches = [{
         'a':
         np.array([np.array(['xyz']), None,
                   np.array(['qwe'])],
                  dtype=object)
     }, {
         'a': np.array([np.array(['ab']), None], dtype=object)
     }]
     expected_result = {
         'a':
         text_format.Parse(
             """
         name: 'a'
         type: STRING
         string_stats {
           avg_length: 2.66666666
         }
         """, statistics_pb2.FeatureNameStatistics())
     }
     generator = string_stats_generator.StringStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
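The None entries above mark examples where 'a' is missing entirely; they are dropped before averaging rather than counted as zero-length values, so the expectation stays at 8 / 3. Illustrative check:

# Illustrative only: missing examples (None) contribute neither to the total
# length nor to the value count.
rows = [['xyz'], None, ['qwe'], ['ab'], None]
values = [v for row in rows if row is not None for v in row]
print(sum(len(v) for v in values) / len(values))  # (3 + 3 + 2) / 3 = 2.6666...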
Example #8
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile the input examples.
        dataset |= 'ProfileExamples' >> profile_util.Profile()

        # Sample input data if sample_count option is provided.
        if self._options.sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[types.Example]], which we then flatten to get a
            # PCollection[types.Example].
            dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
                        beam.combiners.Sample.FixedSizeGlobally(
                            self._options.sample_count)
                        | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif self._options.sample_rate is not None:
            dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate
                        >> beam.FlatMap(_sample_at_rate,
                                        sample_rate=self._options.sample_rate))

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
Example #9
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Batch the input examples.
        desired_batch_size = (None if self._options.sample_count is None else
                              self._options.sample_count)
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        result_protos = []
        # Iterate over the stats generators. For each generator,
        #   a) if it is a CombinerStatsGenerator, wrap it as a beam.CombineFn
        #      and run it.
        #   b) if it is a TransformStatsGenerator, wrap it as a beam.PTransform
        #      and run it.
        for generator in stats_generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> beam.CombineGlobally(
                                         _CombineFnWrapper(generator)))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> generator.ptransform)
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)

        # Each stats generator will output a PCollection of DatasetFeatureStatistics
        # protos. We now flatten the list of PCollections into a single PCollection,
        # then merge the DatasetFeatureStatistics protos in the PCollection into a
        # single DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
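_CombineFnWrapper is not shown in this snippet. A minimal sketch of what such a wrapper could look like, assuming the CombinerStatsGenerator interface exposes the create_accumulator/add_input/extract_output methods used by generate_statistics_in_memory below plus an accumulator merge step (the real wrapper may differ):

# Hypothetical sketch: delegate the beam.CombineFn lifecycle to a
# CombinerStatsGenerator so it can run via beam.CombineGlobally.
class _CombineFnWrapperSketch(beam.CombineFn):

    def __init__(self, generator):
        self._generator = generator

    def create_accumulator(self):
        return self._generator.create_accumulator()

    def add_input(self, accumulator, batch):
        return self._generator.add_input(accumulator, batch)

    def merge_accumulators(self, accumulators):
        # Assumes the generator provides a merge_accumulators method.
        return self._generator.merge_accumulators(accumulators)

    def extract_output(self, accumulator):
        return self._generator.extract_output(accumulator)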
Example #10
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """

    stats_generators = [
        common_stats_generator.CommonStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_values_histogram_buckets=\
              options.num_values_histogram_buckets,
            epsilon=options.epsilon),

        numeric_stats_generator.NumericStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_histogram_buckets=options.num_histogram_buckets,
            num_quantiles_histogram_buckets=\
              options.num_quantiles_histogram_buckets,
            epsilon=options.epsilon),

        string_stats_generator.StringStatsGenerator(schema=options.schema),

        top_k_uniques_combiner_stats_generator.TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets),
    ]

    if options.generators is not None:
        for generator in options.generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                stats_generators.append(generator)
            else:
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of type '
                    '%s.' % generator.__class__.__name__)

    batch = batch_util.merge_single_batch(examples)

    # If whitelist features are provided, keep only those features.
    if options.feature_whitelist:
        batch = {
            feature_name: batch[feature_name]
            for feature_name in options.feature_whitelist
        }

    outputs = [
        generator.extract_output(
            generator.add_input(generator.create_accumulator(), batch))
        for generator in stats_generators
    ]

    return _make_dataset_feature_statistics_list_proto(
        _merge_dataset_feature_stats_protos(outputs))
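A usage sketch for generate_statistics_in_memory, assuming each example is a dict mapping feature names to numpy arrays of values, consistent with the batch format used in the tests above:

# Illustrative only: default options, one string feature across three examples.
examples = [{'a': np.array(['xyz'])},
            {'a': np.array(['qwe'])},
            {'a': np.array(['ab'])}]
stats = generate_statistics_in_memory(examples)
# stats is a DatasetFeatureStatisticsList proto; with a single string feature,
# string_stats.avg_length should come out near 8 / 3 = 2.67.
print(stats.datasets[0].features[0].string_stats.avg_length)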
Example #11
 def test_string_stats_generator_empty_list(self):
     batches = []
     expected_result = {}
     generator = string_stats_generator.StringStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #12
 def test_string_stats_generator_empty_batch(self):
     batches = [{'a': np.array([])}]
     expected_result = {}
     generator = string_stats_generator.StringStatsGenerator()
     self.assertCombinerOutputEqual(batches, generator, expected_result)