Ejemplo n.º 1
0
    def expand(self, dataset):
        """Optionally down-sample `dataset`, then run the stats generators.

        If the `sample_count` option is set, a fixed number of input batches
        is sampled globally and flattened back into individual elements.
        Otherwise, if the `sample_rate` option is set, `_sample_at_rate` is
        applied to every element. With neither option set the input is used
        as is.

        Args:
          dataset: The input PCollection.

        Returns:
          The result of applying `stats_impl.GenerateStatisticsImpl`
          (constructed from this transform's options) to the possibly
          sampled input.
        """
        # Sample input data if sample_count option is provided.
        # TODO(b/117229955): Consider providing an option to write the sample
        # to a file.
        if self._options.sample_count is not None:
            # TODO(pachristopher): Consider moving the sampling logic to decoders.
            # beam.combiners.Sample.FixedSizeGlobally returns a
            # PCollection[List[pa.Table]], which we then flatten to get a
            # PCollection[pa.Table].
            batch_size = (self._options.desired_batch_size
                          if self._options.desired_batch_size
                          and self._options.desired_batch_size > 0 else
                          constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
            # Number of batches needed to cover sample_count examples, i.e.
            # ceil(sample_count / batch_size). divmod keeps the computation in
            # integer arithmetic, so large counts are not rounded through a
            # float the way `int(a / b)` would round them.
            # NOTE(review): assumes sample_count is non-negative -- confirm
            # that the options object validates this.
            full_batches, leftover = divmod(self._options.sample_count,
                                            batch_size)
            batch_count = int(full_batches) + (1 if leftover else 0)
            dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
                        beam.combiners.Sample.FixedSizeGlobally(batch_count)
                        | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
        elif self._options.sample_rate is not None:
            dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate
                        >> beam.FlatMap(_sample_at_rate,
                                        sample_rate=self._options.sample_rate))

        return (dataset | 'RunStatsGenerators' >>
                stats_impl.GenerateStatisticsImpl(self._options))
Ejemplo n.º 2
0
  def test_generate_stats_impl(self):
    """Runs GenerateStatisticsImpl with two explicit generators."""
    # The pipeline input is made of two batches: the first one carries two
    # examples, the second one carries a single example.
    first_batch = {'a': np.array([np.array(['xyz']), np.array(['qwe'])])}
    second_batch = {'a': np.array([np.array(['ab'])])}

    generators = [
        string_stats_generator.StringStatsGenerator(),
        uniques_stats_generator.UniquesStatsGenerator(),
    ]

    # Expected output: string stats and the unique count for feature 'a'.
    expected_proto_text = """
        datasets {
          features {
            name: 'a'
            type: STRING
            string_stats {
              avg_length: 2.66666666
              unique: 3
            }

          }
        }
        """
    expected_result = text_format.Parse(
        expected_proto_text, statistics_pb2.DatasetFeatureStatisticsList())

    with beam.Pipeline() as p:
      stats_transform = stats_impl.GenerateStatisticsImpl(
          generators=generators)
      result = p | beam.Create([first_batch, second_batch]) | stats_transform
      matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
          self, expected_result)
      util.assert_that(result, matcher)
Ejemplo n.º 3
0
  def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Applies optional rate-based sampling, then the stats generators.

    Args:
      dataset: The input PCollection.

    Returns:
      The output of `stats_impl.GenerateStatisticsImpl` applied to the input,
      after `_sample_at_rate` has been applied to it when the `sample_rate`
      option is set.
    """
    rate = self._options.sample_rate
    if rate is not None:
      # The step label embeds the configured rate.
      sampler = 'SampleExamplesAtRate(%s)' % rate >> beam.FlatMap(
          _sample_at_rate, sample_rate=rate)
      dataset |= sampler

    run_generators = ('RunStatsGenerators' >>
                      stats_impl.GenerateStatisticsImpl(self._options))
    return dataset | run_generators
Ejemplo n.º 4
0
    def expand(
        self, dataset: beam.PCollection[pa.RecordBatch]
    ) -> beam.PCollection[statistics_pb2.DatasetFeatureStatisticsList]:
        """Computes statistics over a PCollection of record batches.

        When the `sample_rate` option is set, `_sample_at_rate` is first
        applied to every element of the input with that rate.

        Args:
          dataset: PCollection of `pa.RecordBatch` to compute statistics for.

        Returns:
          PCollection of `DatasetFeatureStatisticsList` produced by
          `stats_impl.GenerateStatisticsImpl`.
        """
        options = self._options
        if options.sample_rate is not None:
            step_label = 'SampleExamplesAtRate(%s)' % options.sample_rate
            sample = beam.FlatMap(_sample_at_rate,
                                  sample_rate=options.sample_rate)
            dataset |= step_label >> sample

        generate_stats = stats_impl.GenerateStatisticsImpl(options)
        return dataset | 'RunStatsGenerators' >> generate_stats
Ejemplo n.º 5
0
    def expand(self, dataset):
        """Profiles, batches and optionally filters the input, then runs stats.

        The built-in generators (common, numeric, string, top-k, uniques) are
        configured from this transform's options; any generators supplied via
        the `generators` option are appended after them.

        Args:
          dataset: The input PCollection of examples.

        Returns:
          The output of `stats_impl.GenerateStatisticsImpl` run with the
          assembled list of generators.
        """
        options = self._options
        schema = options.schema
        epsilon = options.epsilon

        # Built-in generators, created in the order in which they are run.
        common_gen = common_stats_generator.CommonStatsGenerator(
            schema=schema,
            num_values_histogram_buckets=options.num_values_histogram_buckets,
            epsilon=epsilon)
        numeric_gen = numeric_stats_generator.NumericStatsGenerator(
            schema=schema,
            num_histogram_buckets=options.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                options.num_quantiles_histogram_buckets),
            epsilon=epsilon)
        string_gen = string_stats_generator.StringStatsGenerator(schema=schema)
        top_k_gen = top_k_stats_generator.TopKStatsGenerator(
            schema=schema,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets)
        uniques_gen = uniques_stats_generator.UniquesStatsGenerator(
            schema=schema)

        stats_generators = [
            common_gen, numeric_gen, string_gen, top_k_gen, uniques_gen
        ]
        # Custom generators, when provided, go after the built-in ones.
        custom_generators = options.generators
        if custom_generators is not None:
            stats_generators += custom_generators

        # Profile first, then batch the input examples.
        examples = dataset | 'Profile' >> profile_util.Profile()
        examples = examples | 'BatchInputs' >> batch_util.BatchExamples()

        # With a non-empty whitelist, keep only the whitelisted features.
        whitelist = options.feature_whitelist
        if whitelist:
            keep_whitelisted = beam.Map(_filter_features,
                                        feature_whitelist=whitelist)
            examples = (
                examples | 'RemoveNonWhitelistedFeatures' >> keep_whitelisted)

        run_generators = stats_impl.GenerateStatisticsImpl(stats_generators)
        return examples | 'RunStatsGenerators' >> run_generators
Ejemplo n.º 6
0
    def expand(self, dataset):
        """Optionally samples the input examples, then runs the generators.

        `sample_count` takes precedence over `sample_rate`: the rate is only
        consulted when no count is configured.

        Args:
          dataset: The input PCollection of examples.

        Returns:
          The output of `stats_impl.GenerateStatisticsImpl` applied to the
          possibly sampled input.
        """
        count = self._options.sample_count
        if count is not None:
            # beam.combiners.Sample.FixedSizeGlobally yields a
            # PCollection[List[types.Example]]; flattening it gives back a
            # PCollection[types.Example].
            take_sample = ('SampleExamples(%s)' % count >>
                           beam.combiners.Sample.FixedSizeGlobally(count))
            flatten = 'FlattenExamples' >> beam.FlatMap(
                lambda sampled: sampled)
            dataset |= take_sample | flatten
        else:
            rate = self._options.sample_rate
            if rate is not None:
                dataset |= ('SampleExamplesAtRate(%s)' % rate >> beam.FlatMap(
                    _sample_at_rate, sample_rate=rate))

        run_generators = stats_impl.GenerateStatisticsImpl(self._options)
        return dataset | 'RunStatsGenerators' >> run_generators
Ejemplo n.º 7
0
 def test_stats_impl(self, examples, options, expected_result_proto_text,
                     schema=None):
     """Checks GenerateStatisticsImpl output against an expected proto.

     Args:
       examples: Elements fed to the pipeline through `beam.Create`.
       options: Options object handed to `GenerateStatisticsImpl`; its
         `schema` attribute is overwritten when `schema` is given.
       expected_result_proto_text: Text-format
         `DatasetFeatureStatisticsList` the pipeline output must match.
       schema: Optional schema to set on `options` before running.
     """
     expected_result = text_format.Parse(
         expected_result_proto_text,
         statistics_pb2.DatasetFeatureStatisticsList())

     # Note that this mutates the caller-provided options object.
     if schema is not None:
         options.schema = schema

     with beam.Pipeline() as p:
         generate_stats = stats_impl.GenerateStatisticsImpl(options)
         result = p | beam.Create(examples) | generate_stats
         matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
             self, expected_result)
         util.assert_that(result, matcher)
Ejemplo n.º 8
0
    def expand(self, dataset):
        """Profiles, samples, batches and filters the input, then runs stats.

        Steps, in order: profile the examples; sample them (by
        `sample_count`, else by `sample_rate`, if either is set); batch them;
        drop non-whitelisted features when a whitelist is configured; run the
        built-in generators followed by any custom ones from the `generators`
        option.

        Args:
          dataset: The input PCollection of examples.

        Returns:
          The output of `stats_impl.GenerateStatisticsImpl` run with the
          assembled list of generators.
        """
        options = self._options
        schema = options.schema
        epsilon = options.epsilon

        # Built-in generators, listed in the order in which they are run.
        stats_generators = [
            common_stats_generator.CommonStatsGenerator(
                schema=schema,
                num_values_histogram_buckets=(
                    options.num_values_histogram_buckets),
                epsilon=epsilon),
            numeric_stats_generator.NumericStatsGenerator(
                schema=schema,
                num_histogram_buckets=options.num_histogram_buckets,
                num_quantiles_histogram_buckets=(
                    options.num_quantiles_histogram_buckets),
                epsilon=epsilon),
            string_stats_generator.StringStatsGenerator(schema=schema),
            top_k_stats_generator.TopKStatsGenerator(
                schema=schema,
                num_top_values=options.num_top_values,
                num_rank_histogram_buckets=(
                    options.num_rank_histogram_buckets)),
            uniques_stats_generator.UniquesStatsGenerator(schema=schema),
        ]
        # Custom generators, when provided, go after the built-in ones.
        if options.generators is not None:
            stats_generators += options.generators

        # Profile the input examples.
        dataset = dataset | 'ProfileExamples' >> profile_util.Profile()

        # Sample the input: a fixed count wins over a sampling rate.
        sample_count = options.sample_count
        if sample_count is not None:
            # beam.combiners.Sample.FixedSizeGlobally yields a
            # PCollection[List[types.Example]]; flattening it gives back a
            # PCollection[types.Example].
            take_sample = (
                'SampleExamples(%s)' % sample_count >>
                beam.combiners.Sample.FixedSizeGlobally(sample_count))
            flatten = 'FlattenExamples' >> beam.FlatMap(
                lambda sampled: sampled)
            dataset |= take_sample | flatten
        elif options.sample_rate is not None:
            sample_rate = options.sample_rate
            dataset |= ('SampleExamplesAtRate(%s)' % sample_rate >>
                        beam.FlatMap(_sample_at_rate, sample_rate=sample_rate))

        # Batch the input examples. The desired batch size is the sample
        # count itself, which is None when no count-based sampling is set.
        batcher = batch_util.BatchExamples(desired_batch_size=sample_count)
        dataset = dataset | 'BatchExamples' >> batcher

        # With a non-empty whitelist, keep only the whitelisted features.
        whitelist = options.feature_whitelist
        if whitelist:
            keep_whitelisted = beam.Map(_filter_features,
                                        feature_whitelist=whitelist)
            dataset |= 'RemoveNonWhitelistedFeatures' >> keep_whitelisted

        run_generators = stats_impl.GenerateStatisticsImpl(stats_generators)
        return dataset | 'RunStatsGenerators' >> run_generators
Ejemplo n.º 9
0
  def test_stats_impl_custom_generators(self):
    """Checks that custom stats from a transform generator reach the output.

    The expected proto pairs the common stats of features 'a' and 'b' with
    the custom stats emitted by the dummy transform defined below.
    """

    # Dummy PTransform that returns two DatasetFeatureStatistics protos.
    class CustomPTransform(beam.PTransform):

      def expand(self, pcoll):

        def _single_feature_stats(feature_name, stat_name, stat_value):
          # Builds a proto with one feature carrying one string custom stat.
          stats_proto = statistics_pb2.DatasetFeatureStatistics()
          feature = stats_proto.features.add()
          feature.name = feature_name
          custom_stat = feature.custom_stats.add()
          custom_stat.name = stat_name
          custom_stat.str = stat_value
          return stats_proto

        return [
            _single_feature_stats('a', 'my_stat_a', 'my_val_a'),
            _single_feature_stats('b', 'my_stat_b', 'my_val_b'),
        ]

    # A single example in which both features hold an empty int32 array.
    empty_values = {
        name: np.array([], dtype=np.int32) for name in ('a', 'b')
    }
    examples = [empty_values]

    expected_proto_text = """
    datasets {
      num_examples: 1
      features {
        name: 'a'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_a'
          str: 'my_val_a'
        }
      }
      features {
        name: 'b'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_b'
          str: 'my_val_b'
        }
      }
    }
    """
    expected_result = text_format.Parse(
        expected_proto_text, statistics_pb2.DatasetFeatureStatisticsList())

    # Wrap the dummy transform in a transform stats generator.
    transform_stats_gen = stats_generator.TransformStatsGenerator(
        name='CustomStatsGenerator',
        ptransform=CustomPTransform())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          generators=[transform_stats_gen], num_values_histogram_buckets=2)
      generate_stats = stats_impl.GenerateStatisticsImpl(options)
      result = p | beam.Create(examples) | generate_stats
      matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
          self, expected_result)
      util.assert_that(result, matcher)