# Common imports assumed by the snippets below. The module paths follow the
# tensorflow_data_validation source layout these examples were written
# against; treat them as illustrative and adjust to your version.
import numpy as np
import apache_beam as beam
from apache_beam.testing import util
from google.protobuf import text_format
from tensorflow_data_validation.statistics import stats_impl
from tensorflow_data_validation.statistics.generators import string_stats_generator
from tensorflow_data_validation.statistics.generators import uniques_stats_generator
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2


def test_all_string_features(self):
   # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
   # fb: 1 'a', 2 'b', 3 'c'
   batches = [{'fa': np.array([np.array(['a', 'b', 'c', 'e']), None,
                               np.array(['a', 'c', 'd'])], dtype=object),
               'fb': np.array([np.array(['a', 'c', 'c']), np.array(['b']),
                               None], dtype=object)},
              {'fa': np.array([np.array(['a', 'a', 'b', 'c', 'd']), None],
                              dtype=object),
               'fb': np.array([None, np.array(['b', 'c'])],
                              dtype=object)}]
   expected_result_fa = text_format.Parse(
       """
     features {
       name: 'fa'
       type: STRING
       string_stats {
         unique: 5
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   expected_result_fb = text_format.Parse(
       """
     features {
       name: 'fb'
       type: STRING
       string_stats {
         unique: 3
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   generator = uniques_stats_generator.UniquesStatsGenerator()
   self.assertTransformOutputEqual(batches, generator,
                                   [expected_result_fa, expected_result_fb])

 def test_with_categorical_feature(self):
   # Categorical INT features are treated like strings, so uniques are
   # computed over the integer values.
   batches = [{'fa': np.array([np.array([12, 23, 34, 12]),
                               np.array([45, 23])], dtype=object)},
              {'fa': np.array([np.array([12, 12, 34, 45])], dtype=object)}]
   expected_result_fa = text_format.Parse(
       """
     features {
       name: 'fa'
       type: INT
       string_stats {
         unique: 4
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   schema = text_format.Parse(
       """
       feature {
         name: "fa"
         type: INT
         int_domain {
           is_categorical: true
         }
       }
       """, schema_pb2.Schema())
   generator = uniques_stats_generator.UniquesStatsGenerator(schema=schema)
   self.assertTransformOutputEqual(batches, generator, [expected_result_fa])

 def test_one_numeric_feature(self):
      # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
      # fb is numeric, so UniquesStatsGenerator should emit no stats for it.
     examples = [{
         'fa': np.array(['a', 'b', 'c', 'e']),
         'fb': np.array([1.0, 2.0, 3.0])
     }, {
         'fa': None,
         'fb': np.array([4.0, 5.0])
     }, {
         'fa': np.array(['a', 'c', 'd']),
         'fb': None
     }, {
         'fa': np.array(['a', 'a', 'b', 'c', 'd']),
         'fb': None
     }]
     expected_result_fa = text_format.Parse(
         """
   features {
     name: 'fa'
     type: STRING
     string_stats {
       unique: 5
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     generator = uniques_stats_generator.UniquesStatsGenerator()
     self.assertTransformOutputEqual(examples, generator,
                                     [expected_result_fa])

  def test_generate_stats_impl(self):
    # input with two batches: first batch has two examples and second batch
    # has a single example.
    batches = [{'a': np.array([np.array(['xyz']), np.array(['qwe'])])},
               {'a': np.array([np.array(['ab'])])}]

    generator1 = string_stats_generator.StringStatsGenerator()
    generator2 = uniques_stats_generator.UniquesStatsGenerator()

    expected_result = text_format.Parse(
        """
        datasets {
          features {
            name: 'a'
            type: STRING
            string_stats {
              avg_length: 2.66666666
              unique: 3
            }

          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

    with beam.Pipeline() as p:
      result = (p | beam.Create(batches) |
                stats_impl.GenerateStatisticsImpl(
                    generators=[generator1, generator2]))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
    def test_single_unicode_feature(self):
        # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
        examples = [{
            'fa': np.array(['a', 'b', 'c', 'e'], dtype=np.str_)
        }, {
            'fa': np.array(['a', 'c', 'd', 'a'], dtype=np.str_)
        }, {
            'fa': np.array(['a', 'b', 'c', 'd'], dtype=np.str_)
        }]

        expected_result = [
            text_format.Parse(
                """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          unique: 5
        }
    }""", statistics_pb2.DatasetFeatureStatistics())
        ]

        generator = uniques_stats_generator.UniquesStatsGenerator()
        self.assertSlicingAwareTransformOutputEqual(
            examples,
            generator,
            expected_result,
            add_default_slice_key_to_input=True,
            add_default_slice_key_to_output=True)
    def test_unique_stats_with_slicing(self):
        examples = [('slice1', {
            'fa': np.array(['a', 'b', 'a', 'e']),
            'fb': np.array(['1', '1', '0'])
        }),
                    ('slice2', {
                        'fa': np.array(['a', 'a', 'a']),
                        'fb': np.array(['0', '1', '2', '3', '0'])
                    }), ('slice1', {
                        'fa': None,
                        'fb': np.array(['2', '0'])
                    }), ('slice2', {
                        'fa': np.array(['b', 'a']),
                        'fb': None
                    })]

        expected_result = [('slice1',
                            text_format.Parse(
                                """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          unique: 3
        }
    }""", statistics_pb2.DatasetFeatureStatistics())),
                           ('slice1',
                            text_format.Parse(
                                """
      features {
        name: 'fb'
        type: STRING
        string_stats {
          unique: 3
        }
    }""", statistics_pb2.DatasetFeatureStatistics())),
                           ('slice2',
                            text_format.Parse(
                                """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          unique: 2
        }
    }""", statistics_pb2.DatasetFeatureStatistics())),
                           ('slice2',
                            text_format.Parse(
                                """
      features {
        name: 'fb'
        type: STRING
        string_stats {
          unique: 4
        }
    }""", statistics_pb2.DatasetFeatureStatistics()))]

        generator = uniques_stats_generator.UniquesStatsGenerator()
        self.assertSlicingAwareTransformOutputEqual(examples, generator,
                                                    expected_result)
 def test_with_empty_dict(self):
     examples = [{}]
     expected_result = []
     generator = uniques_stats_generator.UniquesStatsGenerator()
     self.assertSlicingAwareTransformOutputEqual(
         examples,
         generator,
         expected_result,
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile and then batch input examples.
        batched_dataset = (dataset
                           | 'Profile' >> profile_util.Profile()
                           | 'BatchInputs' >> batch_util.BatchExamples())

        # If a set of whitelist features are provided, keep only those features.
        filtered_dataset = batched_dataset
        if self._options.feature_whitelist:
            filtered_dataset = (
                batched_dataset | 'RemoveNonWhitelistedFeatures' >> beam.Map(
                    _filter_features,
                    feature_whitelist=self._options.feature_whitelist))

        return (filtered_dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
def _get_default_generators(options, in_memory=False):
  """Initialize default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=\
            options.num_quantiles_histogram_buckets,
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(
          schema=options.schema)
  ]
  if in_memory:
    stats_generators.append(
        top_k_uniques_combiner_stats_generator.
        TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
  else:
    stats_generators.extend([
        top_k_stats_generator.TopKStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets),
        uniques_stats_generator.UniquesStatsGenerator(schema=options.schema)
    ])
  return stats_generators
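
For illustration, a hypothetical caller of _get_default_generators might look like the following; the stats_options.StatsOptions construction and the batches input are assumptions, not part of the source above:

options = stats_options.StatsOptions(schema=None)
generators = _get_default_generators(options, in_memory=False)
with beam.Pipeline() as p:
  _ = (p
       | beam.Create(batches)
       | stats_impl.GenerateStatisticsImpl(generators=generators))
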
 def test_single_unicode_feature(self):
   # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
   batches = [{'fa': np.array([np.array(['a', 'b', 'c', 'e']),
                               np.array(['a', 'c', 'd', 'a'])],
                              dtype=np.str_)},
              {'fa': np.array([np.array(['a', 'b', 'c', 'd'])],
                              dtype=np.str_)}]
   expected_result_fa = text_format.Parse(
       """
     features {
       name: 'fa'
       type: STRING
       string_stats {
         unique: 5
       }
     }""", statistics_pb2.DatasetFeatureStatistics())
   generator = uniques_stats_generator.UniquesStatsGenerator()
   self.assertTransformOutputEqual(batches, generator, [expected_result_fa])
    def test_with_categorical_feature(self):
        examples = [{
            'fa': np.array([12, 23, 34, 12])
        }, {
            'fa': np.array([45, 23])
        }, {
            'fa': np.array([12, 12, 34, 45])
        }]

        expected_result = [
            text_format.Parse(
                """
      features {
        name: 'fa'
        type: INT
        string_stats {
          unique: 4
        }
    }""", statistics_pb2.DatasetFeatureStatistics())
        ]

        schema = text_format.Parse(
            """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        """, schema_pb2.Schema())
        generator = uniques_stats_generator.UniquesStatsGenerator(
            schema=schema)
        self.assertSlicingAwareTransformOutputEqual(
            examples,
            generator,
            expected_result,
            add_default_slice_key_to_input=True,
            add_default_slice_key_to_output=True)

 def test_with_missing_feature(self):
     # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
     # fb: 1 'a', 1 'b', 2 'c'
     examples = [{
         'fa': np.array(['a', 'b', 'c', 'e']),
         'fb': np.array(['a', 'c', 'c'])
     }, {
         'fa': None,
         'fb': np.array(['b'])
     }, {
         'fa': np.array(['a', 'c', 'd']),
         'fb': None
     }, {
         'fa': np.array(['a', 'a', 'b', 'c', 'd'])
     }, {
         'fa': None
     }]
     expected_result_fa = text_format.Parse(
         """
   features {
     name: 'fa'
     type: STRING
     string_stats {
       unique: 5
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     expected_result_fb = text_format.Parse(
         """
   features {
     name: 'fb'
     type: STRING
     string_stats {
       unique: 3
     }
   }""", statistics_pb2.DatasetFeatureStatistics())
     generator = uniques_stats_generator.UniquesStatsGenerator()
     self.assertTransformOutputEqual(
         examples, generator, [expected_result_fa, expected_result_fb])

    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Profile the input examples.
        dataset |= 'ProfileExamples' >> profile_util.Profile()

        # Sample input data if sample_count option is provided.
        if self._options.sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[types.Example]], which we then flatten to get a
            # PCollection[types.Example].
            dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
                        beam.combiners.Sample.FixedSizeGlobally(
                            self._options.sample_count)
                        | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif self._options.sample_rate is not None:
            dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate
                        >> beam.FlatMap(_sample_at_rate,
                                        sample_rate=self._options.sample_rate))

        # Batch the input examples.
        desired_batch_size = self._options.sample_count
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(stats_generators))
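
The _sample_at_rate function referenced in the sampling branch above is not shown. A minimal sketch, assuming independent per-example Bernoulli sampling (the real helper may differ):

import random

def _sample_at_rate(example, sample_rate):
    # Used with beam.FlatMap: yield the example with probability
    # sample_rate; yielding nothing drops the example.
    if random.random() <= sample_rate:
        yield example
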
 def test_with_empty_list(self):
   batches = []
   expected_result = []
   generator = uniques_stats_generator.UniquesStatsGenerator()
   self.assertTransformOutputEqual(batches, generator, expected_result)

 def test_with_empty_dict(self):
     examples = [{}]
     expected_result = []
     generator = uniques_stats_generator.UniquesStatsGenerator()
     self.assertTransformOutputEqual(examples, generator, expected_result)

    def expand(self, dataset):
        # Initialize a list of stats generators to run.
        stats_generators = [
            # Create common stats generator.
            common_stats_generator.CommonStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_values_histogram_buckets=\
                    self._options.num_values_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create numeric stats generator.
            numeric_stats_generator.NumericStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_histogram_buckets=self._options.num_histogram_buckets,
                num_quantiles_histogram_buckets=\
                    self._options.num_quantiles_histogram_buckets,
                epsilon=self._options.epsilon),

            # Create string stats generator.
            string_stats_generator.StringStatsGenerator(
                schema=self._options.schema),

            # Create topk stats generator.
            top_k_stats_generator.TopKStatsGenerator(
                schema=self._options.schema,
                weight_feature=self._options.weight_feature,
                num_top_values=self._options.num_top_values,
                num_rank_histogram_buckets=\
                    self._options.num_rank_histogram_buckets),

            # Create uniques stats generator.
            uniques_stats_generator.UniquesStatsGenerator(
                schema=self._options.schema)
        ]
        if self._options.generators is not None:
            # Add custom stats generators.
            stats_generators.extend(self._options.generators)

        # Batch the input examples.
        desired_batch_size = self._options.sample_count
        dataset = (dataset | 'BatchExamples' >> batch_util.BatchExamples(
            desired_batch_size=desired_batch_size))

        # If a set of whitelist features are provided, keep only those features.
        if self._options.feature_whitelist:
            dataset |= ('RemoveNonWhitelistedFeatures' >> beam.Map(
                _filter_features,
                feature_whitelist=self._options.feature_whitelist))

        result_protos = []
        # Iterate over the stats generators. For each generator,
        #   a) if it is a CombinerStatsGenerator, wrap it as a beam.CombineFn
        #      and run it.
        #   b) if it is a TransformStatsGenerator, wrap it as a beam.PTransform
        #      and run it.
        for generator in stats_generators:
            if isinstance(generator, stats_generator.CombinerStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> beam.CombineGlobally(
                                         _CombineFnWrapper(generator)))
            elif isinstance(generator,
                            stats_generator.TransformStatsGenerator):
                result_protos.append(dataset
                                     | generator.name >> generator.ptransform)
            else:
                raise TypeError(
                    'Statistics generator must extend one of '
                    'CombinerStatsGenerator or TransformStatsGenerator, '
                    'found object of type %s' % generator.__class__.__name__)

        # Each stats generator will output a PCollection of DatasetFeatureStatistics
        # protos. We now flatten the list of PCollections into a single PCollection,
        # then merge the DatasetFeatureStatistics protos in the PCollection into a
        # single DatasetFeatureStatisticsList proto.
        return (result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
                | 'MergeDatasetFeatureStatisticsProtos' >>
                beam.CombineGlobally(_merge_dataset_feature_stats_protos)
                | 'MakeDatasetFeatureStatisticsListProto' >>
                beam.Map(_make_dataset_feature_statistics_list_proto))
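
The _CombineFnWrapper used in branch a) above is not defined in this snippet. A minimal sketch, assuming CombinerStatsGenerator exposes the usual four-method combiner contract (create_accumulator, add_input, merge_accumulators, extract_output):

class _CombineFnWrapper(beam.CombineFn):
    # Sketch: delegate each Beam CombineFn hook to the wrapped
    # CombinerStatsGenerator.

    def __init__(self, generator):
        self._generator = generator

    def create_accumulator(self):
        return self._generator.create_accumulator()

    def add_input(self, accumulator, input_batch):
        return self._generator.add_input(accumulator, input_batch)

    def merge_accumulators(self, accumulators):
        return self._generator.merge_accumulators(accumulators)

    def extract_output(self, accumulator):
        return self._generator.extract_output(accumulator)

The merge helpers in the final return statement are likewise undefined here. A sketch of one plausible implementation follows; the per-feature merge is what lets test_generate_stats_impl above expect avg_length and unique inside a single features entry (message names come from tensorflow_metadata's statistics proto):

def _merge_dataset_feature_stats_protos(stats_protos):
    # Different generators emit stats for the same feature in separate
    # protos; merge them per feature name.
    stats_per_feature = {}
    for stats_proto in stats_protos:
        for feature_stats in stats_proto.features:
            if feature_stats.name not in stats_per_feature:
                copied = statistics_pb2.FeatureNameStatistics()
                copied.CopyFrom(feature_stats)
                stats_per_feature[feature_stats.name] = copied
            else:
                stats_per_feature[feature_stats.name].MergeFrom(feature_stats)
    result = statistics_pb2.DatasetFeatureStatistics()
    for feature_stats in stats_per_feature.values():
        result.features.add().CopyFrom(feature_stats)
    return result


def _make_dataset_feature_statistics_list_proto(stats_proto):
    # Wrap the merged DatasetFeatureStatistics in the list proto that
    # callers such as test_generate_stats_impl compare against.
    result = statistics_pb2.DatasetFeatureStatisticsList()
    result.datasets.add().CopyFrom(stats_proto)
    return result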