def test_time_stats_generator_match_ratio_with_same_valid_format(self):
     """Tests match ratio where all valid values have the same format."""
     input_batches = [
         pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                   ['2018-11-30', '2018-11-30']]),
         pa.array([['not-valid', 'not-valid', 'not-valid'],
                   ['not-valid', 'not-valid']]),
     ]
     # Try generator with match_ratio 0.51 (should not create stats).
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.51,
                                                         values_threshold=5)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
     # Try generator with match_ratio 0.49 (should create stats).
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.49,
                                                         values_threshold=5)
     self.assertCombinerOutputEqual(
         input_batches, generator,
         statistics_pb2.FeatureNameStatistics(custom_stats=[
             statistics_pb2.CustomStatistic(
                 name='domain_info',
                 str="time_domain {string_format: '%Y-%m-%d'}"),
             statistics_pb2.CustomStatistic(name='time_match_ratio',
                                            num=0.50),
         ]))
    def test_time_stats_generator_match_ratio_with_different_valid_formats(
            self):
        """Tests match ratio where valid values have different formats."""
        input_batches = [
            pa.array([[
                '2018-11-30', '2018/11/30', '20181130', '18-11-30', '18/11/30'
            ], [
                '11-30-2018', '11/30/2018', '11302018', '11/30/18', '11/30/18'
            ]]),
        ]
        # Any single format could satisfy the match_ratio, but this should identify
        # only the most common as the time format.
        generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.05,
                                                            values_threshold=1)
        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info',
                    str="time_domain {string_format: '%m/%d/%y'}"),
                statistics_pb2.CustomStatistic(name='time_match_ratio',
                                               num=0.2),
            ]))

        # No single valid format satisfies the specified match_ratio, so this should
        # not create stats.
        generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.3,
                                                            values_threshold=1)
        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())
    def test_time_stats_generator_values_threshold_check(self):
        """Tests generator values threshold."""
        # Expected to give 6 matches with the same format.
        input_batches = [
            pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                      ['2018-11-30']]),
            pa.array([['2018-11-30', '2018-11-30']]),
            pa.array([None, None]),
        ]
        # Try generator with values_threshold=7 (should not create stats).
        generator = time_stats_generator.TimeStatsGenerator(values_threshold=7)
        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())

        # Try generator with values_threshold=6 (should create stats).
        generator = time_stats_generator.TimeStatsGenerator(values_threshold=6)
        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info',
                    str="time_domain {string_format: '%Y-%m-%d'}"),
                statistics_pb2.CustomStatistic(name='time_match_ratio',
                                               num=1.0),
            ]))
Example #4
0
def get_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
    generators = _get_default_generators(options, in_memory)
    if options.generators:
        # Add custom stats generators.
        generators.extend(options.generators)
    if options.enable_semantic_domain_stats:
        semantic_domain_feature_stats_generators = [
            image_stats_generator.ImageStatsGenerator(),
            natural_language_stats_generator.NLStatsGenerator(),
            time_stats_generator.TimeStatsGenerator(),
        ]
        # Wrap semantic domain feature stats generators as a separate combiner
        # stats generator, so that we can apply sampling only for those and other
        # feature stats generators are not affected by it.
        generators.append(
            CombinerFeatureStatsWrapperGenerator(
                semantic_domain_feature_stats_generators,
                weight_feature=options.weight_feature,
                sample_rate=options.semantic_domain_stats_sample_rate))
    if options.schema is not None and _schema_has_sparse_features(
            options.schema):
        generators.append(
            sparse_feature_stats_generator.SparseFeatureStatsGenerator(
                options.schema))
    # Replace all CombinerFeatureStatsGenerator with a single
    # CombinerFeatureStatsWrapperGenerator.
    feature_generators = [
        x for x in generators
        if isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
    ]
    if feature_generators:
        generators = [
            x for x in generators
            if not isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
        ] + [
            CombinerFeatureStatsWrapperGenerator(
                feature_generators, weight_feature=options.weight_feature)
        ]
    if in_memory:
        for generator in generators:
            if not isinstance(generator,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of '
                    'type %s.' % generator.__class__.__name__)
    return generators
 def test_time_stats_generator_valid_formats(self, input_batch,
                                             expected_matching_formats):
     """Tests that generator's add_input method properly counts valid formats."""
     generator = time_stats_generator.TimeStatsGenerator(values_threshold=1)
     accumulator = generator.add_input(generator.create_accumulator(),
                                       types.FeaturePath(['']), input_batch)
     self.assertDictEqual(expected_matching_formats,
                          accumulator.matching_formats)
    def test_time_stats_generator_invalid_initialization_values(self):
        """Tests bad initialization values."""
        with self.assertRaises(ValueError) as context:
            time_stats_generator.TimeStatsGenerator(values_threshold=0)
            self.assertIn(
                'TimeStatsGenerator expects a values_threshold > 0, got 0.',
                str(context.exception))

            time_stats_generator.TimeStatsGenerator(match_ratio=1.1)
            self.assertIn(
                'TimeStatsGenerator expects a match_ratio in (0, 1].',
                str(context.exception))

            time_stats_generator.TimeStatsGenerator(match_ratio=0)
            self.assertIn(
                'TimeStatsGenerator expects a match_ratio in (0, 1].',
                str(context.exception))
 def test_time_stats_generator_non_time_integers(self):
     """Tests that the generator handles integers that are not times."""
     # None of these numbers are valid times.
     input_batches = [
         pa.array([[1, 2]]),
     ]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
 def test_time_stats_generator_no_valid_formats(self):
     """Tests that the generator handles batches that contain no valid values."""
     # None of these values is a valid format.
     input_batches = [
         pa.array([['', '2018-Nov-30', '20183011']]),
         pa.array([['all/invalid', '2018-11-30invalid']]),
         pa.array([['invalid2018-11-30', 'invalid\n2018-11-30']])
     ]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
 def test_time_stats_generator_inconsistent_type_invalidation_check(self):
     """Tests that generator invalidates stats if inconsistent types are used."""
     # Absent invalidation, this is expected to give 6 matches.
     input_batches = [
         pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                   ['2018-11-30']]),
         pa.array([['2018-11-30', '2018-11-30']]),
         pa.array([[1.0]]),
     ]
     # No domain_info should be generated as the incorrect type of the 1.0 value
     # should invalidate the stats. Absent this type issue, these examples would
     # satisfy the specified match_ratio and values_threshold.
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.5,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
    def test_time_stats_generator_invalidated_exits_add_input_early(
            self, mock_update):
        input_batch = pa.array([['2018-11-30']])
        generator = time_stats_generator.TimeStatsGenerator()
        accumulator = generator.create_accumulator()

        # When an accumulator is invalidated is True, it is not updated when an
        # input batch is added.
        accumulator.invalidated = True
        generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
        self.assertFalse(mock_update.called)

        # When an accumulator is not invalidated, it is updated when an input batch
        # is added.
        accumulator.invalidated = False
        generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
        self.assertTrue(mock_update.called)
 def test_time_stats_generator_utf8_check(self):
     """Tests that generator invalidates stats if there is a non-utf8 string."""
     # Absent invalidation, this is expected to give 6 matches.
     input_batches = [
         pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                   ['2018-11-30']]),
         pa.array([['2018-11-30', '2018-11-30']]),
         # Non utf-8 string that will invalidate the accumulator.
         pa.array([[b'\xF0']]),
     ]
     # No domain_info should be generated as the non-utf8 string should
     # invalidate the stats. Absent this type issue, these examples would
     # satisfy the specified match_ratio and values_threshold.
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.5,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
Example #12
0
def get_generators(options, in_memory=False):
    """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
    generators = _get_default_generators(options, in_memory)
    if options.generators is not None:
        # Add custom stats generators.
        generators.extend(options.generators)
    if options.enable_semantic_domain_stats:
        generators += [
            image_stats_generator.ImageStatsGenerator(),
            natural_language_stats_generator.NLStatsGenerator(),
            time_stats_generator.TimeStatsGenerator(),
        ]
    # Replace all CombinerFeatureStatsGenerator with a single
    # CombinerFeatureStatsWrapperGenerator.
    feature_generators = [
        x for x in generators
        if isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
    ]
    if feature_generators:
        generators = [
            x for x in generators
            if not isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
        ] + [
            CombinerFeatureStatsWrapperGenerator(
                feature_generators, weight_feature=options.weight_feature)
        ]
    if in_memory:
        for generator in generators:
            if not isinstance(generator,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of '
                    'type %s.' % generator.__class__.__name__)
    return generators
 def test_time_stats_generator_combined_formats(self):
     """Tests that the generator handles combined formats."""
     # The combined format is the most common, since the generator should count
     # it only as the combined format and not its component parts.
     input_batches = [[np.array(['2018/11/30 23:59', '2018/12/01 23:59'])],
                      [np.array(['2018/11/30 23:59', '23:59'])],
                      [np.array(['2018/11/30', '2018/11/30'])]]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     self.assertCombinerOutputEqual(
         input_batches, generator,
         statistics_pb2.FeatureNameStatistics(custom_stats=[
             statistics_pb2.CustomStatistic(
                 name='domain_info',
                 str="time_domain {format: '%Y/%m/%d %H:%M'}"),
             statistics_pb2.CustomStatistic(name='time_match_ratio',
                                            num=0.5),
         ]))
    def test_time_stats_generator_no_values_exits_add_input_early(
            self, mock_update):
        generator = time_stats_generator.TimeStatsGenerator()
        accumulator = generator.create_accumulator()

        # The accumulator is not updated when the values list in an input batch is
        # None.
        input_batch = pa.array([None])
        generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
        self.assertFalse(mock_update.called)

        # The accumulator is not updated when the values list in an input batch is
        # empty.
        input_batch = pa.array([])
        generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
        self.assertFalse(mock_update.called)

        # The accumulator is updated when a non-empty input_batch is added.
        input_batch = pa.array([['2018-11-30']])
        generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
        self.assertTrue(mock_update.called)
 def test_time_stats_generator_integer_formats(self):
     """Tests that the generator handles integer formats."""
     # Three of values are within the valid range for Unix seconds, one is within
     # the valid range for Unix milliseconds, and the other two are not within
     # the valid range for any integer time formats.
     input_batches = [
         pa.array([[631152001, 631152002]]),
         pa.array([[631152003, 631152000001]]),
         pa.array([[1, 2]])
     ]
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.1,
                                                         values_threshold=1)
     assert schema_pb2.TimeDomain.UNIX_SECONDS == 1
     self.assertCombinerOutputEqual(
         input_batches, generator,
         statistics_pb2.FeatureNameStatistics(custom_stats=[
             statistics_pb2.CustomStatistic(
                 name='domain_info',
                 str=('time_domain {integer_format: 1}')),
             statistics_pb2.CustomStatistic(name='time_match_ratio',
                                            num=0.5),
         ]))
Example #16
0
def get_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Initializes the list of stats generators, including custom generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
    generators = [NumExamplesStatsGenerator(options.weight_feature)]
    if options.add_default_generators:
        generators.extend(_get_default_generators(options, in_memory))
    if options.generators:
        # Add custom stats generators.
        generators.extend(options.generators)
    if options.enable_semantic_domain_stats:
        semantic_domain_feature_stats_generators = [
            image_stats_generator.ImageStatsGenerator(),
            natural_language_domain_inferring_stats_generator.
            NLDomainInferringStatsGenerator(),
            time_stats_generator.TimeStatsGenerator(),
        ]
        # Wrap semantic domain feature stats generators as a separate combiner
        # stats generator, so that we can apply sampling only for those and other
        # feature stats generators are not affected by it.
        generators.append(
            CombinerFeatureStatsWrapperGenerator(
                semantic_domain_feature_stats_generators,
                sample_rate=options.semantic_domain_stats_sample_rate))
    if options.schema is not None:
        if _schema_has_sparse_features(options.schema):
            generators.append(
                sparse_feature_stats_generator.SparseFeatureStatsGenerator(
                    options.schema))
        if _schema_has_natural_language_domains(options.schema):
            generators.append(
                natural_language_stats_generator.NLStatsGenerator(
                    options.schema, options.vocab_paths,
                    options.num_histogram_buckets,
                    options.num_quantiles_histogram_buckets,
                    options.num_rank_histogram_buckets))
        if options.schema.weighted_feature:
            generators.append(
                weighted_feature_stats_generator.WeightedFeatureStatsGenerator(
                    options.schema))
        if options.label_feature and not in_memory:
            # The LiftStatsGenerator is not a CombinerStatsGenerator and therefore
            # cannot currenty be used for in_memory executions.
            generators.append(
                lift_stats_generator.LiftStatsGenerator(
                    y_path=types.FeaturePath([options.label_feature]),
                    schema=options.schema,
                    example_weight_map=options.example_weight_map,
                    output_custom_stats=True))

    # Replace all CombinerFeatureStatsGenerator with a single
    # CombinerFeatureStatsWrapperGenerator.
    feature_generators = [
        x for x in generators
        if isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
    ]
    if feature_generators:
        generators = [
            x for x in generators
            if not isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
        ] + [CombinerFeatureStatsWrapperGenerator(feature_generators)]
    if in_memory:
        for generator in generators:
            if not isinstance(generator,
                              stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of '
                    'type %s.' % generator.__class__.__name__)
    return generators
 def test_time_stats_generator_empty_input(self):
     """Tests generator on empty input."""
     generator = time_stats_generator.TimeStatsGenerator()
     self.assertCombinerOutputEqual([], generator,
                                    statistics_pb2.FeatureNameStatistics())