Esempio n. 1
0
    def test_nl_generator_match_ratio_check(self):
        """Checks that stats are emitted only when match_ratio is reached.

        The batches hold ten values in total, seven of them 'MATCH', and the
        match rate asserted below is 0.7. A generator created with
        match_ratio=0.71 must produce empty statistics, while one created
        with match_ratio=0.69 must report the natural language domain.
        """
        batches = [
            pa.Column.from_array('feature', pa.array(rows)) for rows in (
                [['MATCH', 'MATCH', 'MATCH'], ['MATCH', 'Nope']],
                [['MATCH', 'MATCH', 'MATCH']],
                [['12345', 'No']],
            )
        ]
        no_stats = statistics_pb2.FeatureNameStatistics()
        nl_stats = statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info', str='natural_language_domain {}'),
            statistics_pb2.CustomStatistic(
                name='natural_language_match_rate', num=0.7),
        ])
        # values_threshold=5 is chosen so that the values threshold always
        # passes; only the match ratio decides whether stats are produced.
        for ratio, expected in ((0.71, no_stats), (0.69, nl_stats)):
            generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                              match_ratio=ratio,
                                              values_threshold=5)
            self.assertCombinerOutputEqual(batches, generator, expected)
    def test_nl_generator_example_threshold_check(self):
        """Checks that stats are emitted only once examples_threshold is met.

        The input is expected to yield six matches, so a threshold of 7 must
        produce empty statistics and a threshold of 6 must produce the
        natural language domain with a match rate of 1.0.
        """
        # Expected to give 6 matches.
        match_batches = [
            [np.array(['MATCH', 'MATCH', 'MATCH']),
             np.array(['MATCH'])],
            [np.array(['MATCH', 'MATCH'])],
        ]
        # Nones should be ignored.
        none_batch = [None, np.array([None] * 10)]
        batches = match_batches + [none_batch]

        # One above the expected number of matches: no stats.
        strict_generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                                 examples_threshold=7)
        self.assertCombinerOutputEqual(batches, strict_generator,
                                       statistics_pb2.FeatureNameStatistics())

        # Exactly the expected number of matches: stats are created.
        exact_generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                                examples_threshold=6)
        expected = statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info', str='natural_language_domain {}'),
            statistics_pb2.CustomStatistic(
                name='natural_language_match_rate', num=1.0),
        ])
        self.assertCombinerOutputEqual(batches, exact_generator, expected)
Esempio n. 3
0
    def test_nl_generator_values_threshold_check(self):
        """Checks that stats are emitted only once values_threshold is met.

        The input is expected to yield six matches, so values_threshold=7
        must produce empty statistics and values_threshold=6 must produce
        the natural language domain with a match rate of 1.0.
        """

        def as_column(rows):
            # Wraps raw rows into the named arrow column used as a batch.
            return pa.Column.from_array('feature', pa.array(rows))

        # Expected to give 6 matches.
        batches = [
            as_column([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
            as_column([['MATCH', 'MATCH']]),
            # Nones should be ignored.
            as_column([None, None]),
        ]

        # One above the expected number of matches: no stats.
        strict_generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                                 values_threshold=7)
        self.assertCombinerOutputEqual(batches, strict_generator,
                                       statistics_pb2.FeatureNameStatistics())

        # Exactly the expected number of matches: stats are created.
        exact_generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                                values_threshold=6)
        expected = statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info', str='natural_language_domain {}'),
            statistics_pb2.CustomStatistic(
                name='natural_language_match_rate', num=1.0),
        ])
        self.assertCombinerOutputEqual(batches, exact_generator, expected)
Esempio n. 4
0
 def test_nl_generator_bad_initialization(self):
   """Tests that invalid constructor arguments are rejected.

   A values_threshold of 0 and a match_ratio of 1.1 must each raise a
   ValueError whose message matches the corresponding pattern.
   """
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported spelling.
   with self.assertRaisesRegex(
       ValueError, 'NLStatsGenerator expects values_threshold > 0.'):
     nlsg.NLStatsGenerator(values_threshold=0)
   with self.assertRaisesRegex(
       ValueError, r'NLStatsGenerator expects a match_ratio in \[0, 1\].'):
     nlsg.NLStatsGenerator(match_ratio=1.1)
Esempio n. 5
0
def get_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Builds the list of stats generators described by `options`.

    The result starts from the default generators, then adds any custom
    generators, the optional semantic domain generators (wrapped separately
    so that they can be sampled on their own) and the sparse feature
    generator when the schema declares sparse features. All
    CombinerFeatureStatsGenerator instances are finally folded into one
    CombinerFeatureStatsWrapperGenerator placed at the end of the list.

    Args:
      options: A StatsOptions object.
      in_memory: Whether the generators will be used to generate statistics
        in memory (True) or using Beam (False).

    Returns:
      A list of stats generator objects.

    Raises:
      TypeError: If `in_memory` is True and some generator does not extend
        CombinerStatsGenerator.
    """
    result = _get_default_generators(options, in_memory)

    # Custom stats generators supplied by the caller.
    if options.generators:
        result.extend(options.generators)

    if options.enable_semantic_domain_stats:
        # The semantic domain generators get a wrapper of their own so that
        # sampling applies to them only and leaves the other feature stats
        # generators unaffected.
        semantic_wrapper = CombinerFeatureStatsWrapperGenerator(
            [
                image_stats_generator.ImageStatsGenerator(),
                natural_language_stats_generator.NLStatsGenerator(),
                time_stats_generator.TimeStatsGenerator(),
            ],
            weight_feature=options.weight_feature,
            sample_rate=options.semantic_domain_stats_sample_rate)
        result.append(semantic_wrapper)

    schema = options.schema
    if schema is not None and _schema_has_sparse_features(schema):
        result.append(
            sparse_feature_stats_generator.SparseFeatureStatsGenerator(
                schema))

    # Split the generators into per-feature combiners and everything else,
    # then replace the former with one wrapper appended after the latter.
    per_feature, remaining = [], []
    for candidate in result:
        if isinstance(candidate,
                      stats_generator.CombinerFeatureStatsGenerator):
            per_feature.append(candidate)
        else:
            remaining.append(candidate)
    if per_feature:
        result = remaining + [
            CombinerFeatureStatsWrapperGenerator(
                per_feature, weight_feature=options.weight_feature)
        ]

    if in_memory:
        # In-memory generation only supports combiner based generators.
        for candidate in result:
            if isinstance(candidate, stats_generator.CombinerStatsGenerator):
                continue
            raise TypeError('Statistics generator used in '
                            'generate_statistics_in_memory must '
                            'extend CombinerStatsGenerator, found object of '
                            'type %s.' % candidate.__class__.__name__)
    return result
Esempio n. 6
0
 def test_nl_generator_token_and_sequence_histograms(self):
     """Checks the token and sequence histograms built from an int feature.

     A six-entry vocabulary is written to a temporary file and two integer
     sequences are fed to a generator configured with two buckets for each
     histogram kind.
     """
     with tempfile.NamedTemporaryFile() as vocab:
         vocab.write(b'Foo\nBar\nBaz\nBazz\nCar\nRazzz\n')
         # Flush so the content is on disk when read back through vocab.name.
         vocab.flush()
         batches = [pa.array([[0, 1, 2, 4, 4], [3, 3, 3, 5]])]
         generator = nlsg.NLStatsGenerator(
             schema=self._schema,
             vocab_paths={'my_vocab': vocab.name},
             num_quantiles_histogram_buckets=2,
             num_rank_histogram_buckets=2,
             num_histogram_buckets=2)
         # The two input sequences mapped through the vocabulary, listed
         # twice.
         reported = [
             ['Foo', 'Bar', 'Baz', 'Car', 'Car'],
             ['Bazz', 'Bazz', 'Bazz', 'Razzz'],
         ] * 2
         expected = self._create_expected_feature_name_statistics(
             feature_coverage=0.8571428571428571,
             avg_token_length=(3 + 3 + 4 + 4 + 4 + 5) / 6,
             min_sequence_length=3,
             max_sequence_length=5,
             token_len_quantiles=[(3, 4, 3), (4, 5, 3)],
             sequence_len_quantiles=[(3, 5, 1), (5, 5, 1)],
             sorted_token_names_and_counts=[('Bazz', 3), ('Car', 2)],
             reported_sequences=reported)
         self.assertCombinerOutputEqual(
             batches, generator, expected,
             self._int_nlp_feature_with_vocab_path)
Esempio n. 7
0
    def test_nl_generator_avg_word_heuristic_match(self):
        """Checks the default heuristic on input that is mostly real text.

        Six values are provided and the asserted match rate is 0.8333333;
        the natural language domain must be reported.
        """
        generator = nlsg.NLStatsGenerator(values_threshold=2)
        batches = [
            pa.Column.from_array('feature', pa.array(rows)) for rows in (
                [['This looks correct.', 'This one too, it should be text.'],
                 ['xosuhddsofuhg123fdgosh']],
                [['This should be text as well', 'Here is another text']],
                [['This should also be considered good.']],
            )
        ]
        expected = statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info', str='natural_language_domain {}'),
            statistics_pb2.CustomStatistic(
                name='natural_language_match_rate', num=0.8333333),
        ])

        self.assertCombinerOutputEqual(batches, generator, expected)
Esempio n. 8
0
 def test_nl_generator_int_feature_no_vocab(self):
     """Checks an int feature whose domain has no vocab: coverage is 0.0."""
     batches = [pa.array([[1], [2], [3]])]
     generator = nlsg.NLStatsGenerator(schema=self._schema)
     expected = self._create_expected_feature_name_statistics(
         feature_coverage=0.0)
     self.assertCombinerOutputEqual(batches, generator, expected,
                                    self._int_nlp_feature_no_vocab_path)
Esempio n. 9
0
 def test_nl_generator_string_feature_no_vocab(self):
     """Checks a string feature whose domain has no vocab.

     The batch holds two single-token examples and one missing example; the
     expected coverage is 0.5 and the expected average token length is 3.0.
     """
     batches = [pa.array([['Foo'], None, ['Baz']])]
     generator = nlsg.NLStatsGenerator(schema=self._schema)
     expected = self._create_expected_feature_name_statistics(
         feature_coverage=0.5, avg_token_length=3.0)
     self.assertCombinerOutputEqual(batches, generator, expected,
                                    self._string_nlp_feature_no_vocab_path)
    def test_nl_generator_avg_word_heuristic_non_match(self):
        """Checks the default heuristic on input that is mostly not text.

        Only one of the four values is commented as valid text, and the
        generator is expected to produce empty statistics.
        """
        generator = nlsg.NLStatsGenerator(values_threshold=2)
        gibberish_batch = pa.array([['abc' * 10, 'xxxxxxxxx'],
                                    ['xosuhddsofuhg123fdgosh']])
        text_batch = pa.array([['Only one valid text?']])
        no_stats = statistics_pb2.FeatureNameStatistics()

        self.assertCombinerOutputEqual([gibberish_batch, text_batch],
                                       generator, no_stats)
Esempio n. 11
0
 def test_nl_generator_invalidation_check_no_nld(self):
     """Checks that a feature without a natural language domain invalidates.

     A fresh accumulator starts valid; adding input under the non-NLP
     feature path must flip its `invalidate` flag.
     """
     generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
     generator.setup()
     acc = generator.create_accumulator()
     self.assertFalse(acc.invalidate)
     string_values = pa.array([['Foo'], ['Bar']])
     acc = generator.add_input(acc, self._non_nlp_feature_path,
                               string_values)
     self.assertTrue(acc.invalidate)
Esempio n. 12
0
 def test_nl_generator_invalidation_check_empty_nld(self):
     """Checks that an empty natural language domain invalidates.

     A fresh accumulator starts valid; adding input under the feature with
     an empty domain must flip its `invalidate` flag.
     """
     generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
     generator.setup()
     acc = generator.create_accumulator()
     self.assertFalse(acc.invalidate)
     int_values = pa.array([[0], [1]])
     acc = generator.add_input(acc, self._int_nlp_feature_empty_domain,
                               int_values)
     self.assertTrue(acc.invalidate)
Esempio n. 13
0
 def test_nl_generator_int_feature_no_vocab(self):
     """Checks an int feature whose domain has no vocab.

     Coverage is expected to be 0.0 and the reported sequences are expected
     to contain the raw integer tokens.
     """
     batches = [pa.array([[1, 2, 3]])]
     generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
     # The single input sequence, listed twice.
     reported = [[1, 2, 3]] * 2
     expected = self._create_expected_feature_name_statistics(
         feature_coverage=0.0, reported_sequences=reported)
     self.assertCombinerOutputEqual(batches, generator, expected,
                                    self._int_nlp_feature_no_vocab_path)
Esempio n. 14
0
 def test_nl_generator_int_feature_vocab(self):
     """Checks generator calculation with an int domain and a vocab.

     The vocabulary file has four entries while the input holds the ids
     0 through 4, one per example.
     """
     with tempfile.NamedTemporaryFile() as vocab:
         vocab.write(b'Foo\nBar\nBaz\nBazz\n')
         # Flush so the content is on disk when read back through vocab.name.
         vocab.flush()
         batches = [pa.array([[0], [1], [2], [3], [4]])]
         generator = nlsg.NLStatsGenerator(
             schema=self._schema, vocab_paths={'my_vocab': vocab.name})
         expected = self._create_expected_feature_name_statistics(
             feature_coverage=float(1) / 3, avg_token_length=4.0)
         self.assertCombinerOutputEqual(
             batches, generator, expected,
             self._int_nlp_feature_with_vocab_path)
Esempio n. 15
0
 def test_nl_generator_invalidation_check_float_input(self):
     """Checks that float input invalidates an otherwise valid accumulator.

     String input under the string NLP feature path keeps the accumulator
     valid; float input added afterwards under the same path must set its
     `invalidate` flag.
     """
     generator = nlsg.NLStatsGenerator(self._schema, None, 0, 0, 0)
     generator.setup()
     feature_path = self._string_nlp_feature_no_vocab_path

     acc = generator.create_accumulator()
     self.assertFalse(acc.invalidate)

     string_values = pa.array([['Foo'], ['Bar']])
     acc = generator.add_input(acc, feature_path, string_values)
     self.assertFalse(acc.invalidate)

     float_values = pa.array([[1.0], [2.0], [3.0]])
     acc = generator.add_input(acc, feature_path, float_values)
     self.assertTrue(acc.invalidate)
 def test_nl_generator_utf8_check(self):
     """Checks that a non utf-8 value leads to empty statistics."""
     # Expected to give 6 matches.
     match_batches = [
         pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
         pa.array([['MATCH', 'MATCH']]),
     ]
     # Non utf-8 string invalidates accumulator.
     bad_batch = pa.array([[b'\xF0']])
     # values_threshold=1 would have been enough to generate stats had the
     # non utf-8 value not been present.
     generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
     no_stats = statistics_pb2.FeatureNameStatistics()
     self.assertCombinerOutputEqual(match_batches + [bad_batch], generator,
                                    no_stats)
 def test_nl_generator_invalidation_check(self):
     """Checks that a value of the wrong type leads to empty statistics."""
     # Expected to give 6 matches.
     match_batches = [
         pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
         pa.array([['MATCH', 'MATCH']]),
     ]
     # Incorrect type invalidates accumulator.
     bad_batch = pa.array([[42]])
     # No domain_info is expected because the integer value 42 invalidated
     # the stats, even though values_threshold=1 is otherwise satisfied.
     generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=1)
     no_stats = statistics_pb2.FeatureNameStatistics()
     self.assertCombinerOutputEqual(match_batches + [bad_batch], generator,
                                    no_stats)
Esempio n. 18
0
    def test_nl_generator_string_feature_vocab(self):
        """Checks a string feature whose domain has a vocab.

        Both provided tokens appear in the vocabulary file; the expected
        coverage is 1.0 and the expected average token length is 4.0.
        """
        with tempfile.NamedTemporaryFile() as vocab:
            vocab.write(b'Foo\nBar\nBazz\n')
            # Flush so the content is on disk when read back through
            # vocab.name.
            vocab.flush()

            batches = [pa.array([['Bar'], None, ['Bazz']])]
            generator = nlsg.NLStatsGenerator(
                schema=self._schema, vocab_paths={'my_vocab': vocab.name})
            expected = self._create_expected_feature_name_statistics(
                feature_coverage=1.0, avg_token_length=4.0)
            self.assertCombinerOutputEqual(
                batches, generator, expected,
                self._string_nlp_feature_with_vocab_path)
Esempio n. 19
0
def get_generators(options, in_memory=False):
    """Builds the list of stats generators described by `options`.

    The result starts from the default generators, then adds any custom
    generators and the optional semantic domain generators. All
    CombinerFeatureStatsGenerator instances are finally folded into one
    CombinerFeatureStatsWrapperGenerator placed at the end of the list.

    Args:
      options: A StatsOptions object.
      in_memory: Whether the generators will be used to generate statistics
        in memory (True) or using Beam (False).

    Returns:
      A list of stats generator objects.

    Raises:
      TypeError: If `in_memory` is True and some generator does not extend
        CombinerStatsGenerator.
    """
    result = _get_default_generators(options, in_memory)

    # Custom stats generators supplied by the caller.
    if options.generators is not None:
        result.extend(options.generators)

    if options.enable_semantic_domain_stats:
        result.extend([
            image_stats_generator.ImageStatsGenerator(),
            natural_language_stats_generator.NLStatsGenerator(),
            time_stats_generator.TimeStatsGenerator(),
        ])

    # Split the generators into per-feature combiners and everything else,
    # then replace the former with one wrapper appended after the latter.
    per_feature, remaining = [], []
    for candidate in result:
        if isinstance(candidate,
                      stats_generator.CombinerFeatureStatsGenerator):
            per_feature.append(candidate)
        else:
            remaining.append(candidate)
    if per_feature:
        result = remaining + [
            CombinerFeatureStatsWrapperGenerator(
                per_feature, weight_feature=options.weight_feature)
        ]

    if in_memory:
        # In-memory generation only supports combiner based generators.
        for candidate in result:
            if isinstance(candidate, stats_generator.CombinerStatsGenerator):
                continue
            raise TypeError('Statistics generator used in '
                            'generate_statistics_in_memory must '
                            'extend CombinerStatsGenerator, found object of '
                            'type %s.' % candidate.__class__.__name__)
    return result
Esempio n. 20
0
 def test_nl_generator_token_stats(self):
     """Checks the per-token statistics computed by the generator.

     A two-entry vocabulary is written to a temporary file and two integer
     sequences of length three are fed to a generator configured with three
     histogram buckets. Token statistics are expected for the keys 1 and
     'Foo', each including a position histogram.
     """
     with tempfile.NamedTemporaryFile() as vocab:
         vocab.write(b'Foo\nBar\n')
         # Flush so the content is on disk when read back through vocab.name.
         vocab.flush()
         batches = [pa.array([[0, 1, 0], [1, 0, 0]])]
         generator = nlsg.NLStatsGenerator(
             schema=self._schema,
             vocab_paths={'my_vocab': vocab.name},
             num_quantiles_histogram_buckets=0,
             num_rank_histogram_buckets=0,
             num_histogram_buckets=3)
         # The two input sequences mapped through the vocabulary, listed
         # twice.
         reported = [['Foo', 'Bar', 'Foo'], ['Bar', 'Foo', 'Foo']] * 2

         # Expected position histogram for the token keyed by 1.
         histogram_for_1 = statistics_pb2.Histogram()
         for low, high, count in (
                 (0, float(1) / 3, 1),
                 (float(1) / 3, float(2) / 3, 1),
         ):
             histogram_for_1.buckets.add(low_value=low,
                                         high_value=high,
                                         sample_count=count)

         # Expected position histogram for the token 'Foo'.
         histogram_for_foo = statistics_pb2.Histogram()
         for low, high, count in (
                 (0, float(1) / 3, 1),
                 (float(1) / 3, float(2) / 3, 1),
                 (float(2) / 3, 1, 2),
         ):
             histogram_for_foo.buckets.add(low_value=low,
                                           high_value=high,
                                           sample_count=count)

         token_stats = {
             1: (2, 1.0, 1, 1, 1, histogram_for_1),
             'Foo': (4, 1.0, 2, 2, 2, histogram_for_foo),
         }
         expected = self._create_expected_feature_name_statistics(
             feature_coverage=1.0,
             avg_token_length=3,
             min_sequence_length=3,
             max_sequence_length=3,
             reported_sequences=reported,
             token_statistics=token_stats)
         self.assertCombinerOutputEqual(
             batches, generator, expected,
             self._int_nlp_feature_with_vocab_and_token_constraints_path)
Esempio n. 21
0
 def test_nl_generator_int_feature_vocab(self):
     """Checks generator calculation with an int domain and a vocab.

     The vocabulary file has four entries while the single input sequence
     holds the ids 0 through 4; the expected reported sequence keeps the
     last id as the raw integer 4.
     """
     with tempfile.NamedTemporaryFile() as vocab:
         vocab.write(b'Foo\nBar\nBaz\nBazz\n')
         # Flush so the content is on disk when read back through vocab.name.
         vocab.flush()
         batches = [pa.array([[0, 1, 2, 3, 4]])]
         vocab_paths = {'my_vocab': vocab.name}
         generator = nlsg.NLStatsGenerator(self._schema, vocab_paths, 0, 0,
                                           0)
         # The input sequence mapped through the vocabulary, listed twice.
         reported = [['Foo', 'Bar', 'Baz', 'Bazz', 4]] * 2
         expected = self._create_expected_feature_name_statistics(
             feature_coverage=float(1) / 3,
             avg_token_length=4,
             reported_sequences=reported)
         self.assertCombinerOutputEqual(
             batches, generator, expected,
             self._int_nlp_feature_with_vocab_path)
 def test_nl_generator_invalidation_check(self):
     """Checks that a value of the wrong type leads to empty statistics."""
     # Expected to give 6 matches.
     match_batches = [
         [np.array(['MATCH', 'MATCH', 'MATCH']),
          np.array(['MATCH'])],
         [np.array(['MATCH', 'MATCH'])],
     ]
     # Incorrect type, this would invalidate the stats.
     bad_batch = [np.array([42])]
     # No domain_info is expected because the integer value 42 invalidated
     # the stats, even though examples_threshold=1 is otherwise satisfied.
     generator = nlsg.NLStatsGenerator(_FakeHeuristic(),
                                       examples_threshold=1)
     no_stats = statistics_pb2.FeatureNameStatistics()
     self.assertCombinerOutputEqual(match_batches + [bad_batch], generator,
                                    no_stats)
Esempio n. 23
0
 def test_nl_generator_empty_input(self):
     """Checks that no input batches yield empty statistics."""
     no_batches = []
     no_stats = statistics_pb2.FeatureNameStatistics()
     generator = nlsg.NLStatsGenerator(_FakeHeuristic())
     self.assertCombinerOutputEqual(no_batches, generator, no_stats)
Esempio n. 24
0
def get_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Builds the list of stats generators described by `options`.

    The result always starts with a NumExamplesStatsGenerator. Default,
    custom, semantic domain and schema-driven generators are then added
    according to `options`. All CombinerFeatureStatsGenerator instances are
    finally folded into one CombinerFeatureStatsWrapperGenerator placed at
    the end of the list.

    Args:
      options: A StatsOptions object.
      in_memory: Whether the generators will be used to generate statistics
        in memory (True) or using Beam (False).

    Returns:
      A list of stats generator objects.

    Raises:
      TypeError: If `in_memory` is True and some generator does not extend
        CombinerStatsGenerator.
    """
    result = [NumExamplesStatsGenerator(options.weight_feature)]
    if options.add_default_generators:
        result.extend(_get_default_generators(options, in_memory))

    # Custom stats generators supplied by the caller.
    if options.generators:
        result.extend(options.generators)

    if options.enable_semantic_domain_stats:
        # The semantic domain generators get a wrapper of their own so that
        # sampling applies to them only and leaves the other feature stats
        # generators unaffected.
        nl_inferring = natural_language_domain_inferring_stats_generator
        semantic_wrapper = CombinerFeatureStatsWrapperGenerator(
            [
                image_stats_generator.ImageStatsGenerator(),
                nl_inferring.NLDomainInferringStatsGenerator(),
                time_stats_generator.TimeStatsGenerator(),
            ],
            sample_rate=options.semantic_domain_stats_sample_rate)
        result.append(semantic_wrapper)

    schema = options.schema
    if schema is not None:
        # Generators that only make sense when a schema is available.
        if _schema_has_sparse_features(schema):
            result.append(
                sparse_feature_stats_generator.SparseFeatureStatsGenerator(
                    schema))
        if _schema_has_natural_language_domains(schema):
            result.append(
                natural_language_stats_generator.NLStatsGenerator(
                    schema, options.vocab_paths,
                    options.num_histogram_buckets,
                    options.num_quantiles_histogram_buckets,
                    options.num_rank_histogram_buckets))
        if schema.weighted_feature:
            result.append(
                weighted_feature_stats_generator.WeightedFeatureStatsGenerator(
                    schema))
        if options.label_feature and not in_memory:
            # The LiftStatsGenerator is not a CombinerStatsGenerator and
            # therefore cannot currently be used for in_memory executions.
            label_path = types.FeaturePath([options.label_feature])
            result.append(
                lift_stats_generator.LiftStatsGenerator(
                    y_path=label_path,
                    schema=schema,
                    example_weight_map=options.example_weight_map,
                    output_custom_stats=True))

    # Split the generators into per-feature combiners and everything else,
    # then replace the former with one wrapper appended after the latter.
    per_feature, remaining = [], []
    for candidate in result:
        if isinstance(candidate,
                      stats_generator.CombinerFeatureStatsGenerator):
            per_feature.append(candidate)
        else:
            remaining.append(candidate)
    if per_feature:
        result = remaining + [
            CombinerFeatureStatsWrapperGenerator(per_feature)
        ]

    if in_memory:
        # In-memory generation only supports combiner based generators.
        for candidate in result:
            if isinstance(candidate, stats_generator.CombinerStatsGenerator):
                continue
            raise TypeError('Statistics generator used in '
                            'generate_statistics_in_memory must '
                            'extend CombinerStatsGenerator, found object of '
                            'type %s.' % candidate.__class__.__name__)
    return result
Esempio n. 25
0
 def test_nl_generator_empty_input(self):
     """Checks the output for no input batches with a schema-less generator."""
     no_batches = []
     generator = nlsg.NLStatsGenerator(None, None, 0, 0, 0)
     expected = self._create_expected_feature_name_statistics()
     self.assertCombinerOutputEqual(no_batches, generator, expected)