def test_topk_empty(self):
  """Checks that an empty input produces an empty list of statistics."""
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  no_examples = []
  no_stats = []
  self.assertSlicingAwareTransformOutputEqual(
      no_examples, top_k_generator, no_stats)
def test_single_string_feature_manual(self):
  """Checks top-k stats for one string feature spread over several batches.

  The input consists of two batches carrying the 'fa' feature followed by an
  empty batch. The generator is asked for 4 top values and 3 rank histogram
  buckets.
  """
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  # `dtype=object` is used instead of the `np.object` alias, which was
  # deprecated in NumPy 1.20 and removed in NumPy 1.24.
  batches = [{
      'fa': np.array([
          np.array(['a', 'b', 'c', 'e']),
          np.array(['a', 'c', 'd', 'a'])
      ], dtype=object)
  }, {
      'fa': np.array([np.array(['a', 'b', 'c', 'd'])], dtype=object)
  }, {}]
  expected_result = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 4 }
        top_values { value: 'c' frequency: 3 }
        top_values { value: 'd' frequency: 2 }
        top_values { value: 'b' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
          buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertTransformOutputEqual(batches, generator, [expected_result])
def test_with_categorical_feature(self):
  """Checks top-k stats for an INT feature marked categorical in the schema.

  The schema declares 'fa' as an INT feature whose int_domain has
  `is_categorical: true`; the expected statistics report the integer values
  as strings inside `string_stats`.
  """
  # fa: 4 '12', 2 '23', 2 '34', 2 '45'
  batches = [{
      # The inner arrays have different lengths (4 and 2). NumPy >= 1.24
      # refuses to build such a ragged array unless dtype=object is given
      # explicitly; older versions produced the same object array implicitly.
      'fa': np.array(
          [np.array([12, 23, 34, 12]), np.array([45, 23])], dtype=object)
  }, {
      'fa': np.array([np.array([12, 12, 34, 45])])
  }]
  expected_result_fa = text_format.Parse(
      """
    features {
      name: 'fa'
      type: INT
      string_stats {
        top_values { value: '12' frequency: 4 }
        top_values { value: '45' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "12" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "45" sample_count: 2.0 }
          buckets { low_rank: 2 high_rank: 2 label: "34" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  schema = text_format.Parse(
      """
      feature {
        name: "fa"
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  generator = top_k_stats_generator.TopKStatsGenerator(
      schema=schema, num_top_values=2, num_rank_histogram_buckets=3)
  self.assertTransformOutputEqual(batches, generator, [expected_result_fa])
def test_topk_with_numeric_feature(self):
  """Checks top-k output when a float feature accompanies a string feature.

  The examples carry a string feature 'fa' and a float feature 'fb'; the
  expected output contains statistics for 'fa' only.
  """
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  examples = [
      {'fa': np.array(['a', 'b', 'c', 'e']),
       'fb': np.array([1.0, 2.0, 3.0])},
      {'fa': None,
       'fb': np.array([4.0, 5.0])},
      {'fa': np.array(['a', 'c', 'd']),
       'fb': None},
      {'fa': np.array(['a', 'a', 'b', 'c', 'd']),
       'fb': None},
  ]
  fa_stats = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 4 }
        top_values { value: 'c' frequency: 3 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
          buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=2, num_rank_histogram_buckets=3)
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      top_k_generator,
      [fa_stats],
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_topk_with_single_string_feature(self):
  """Checks top-k and rank histogram stats for a single string feature."""
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  fa_values_per_example = (
      ['a', 'b', 'c', 'e'],
      ['a', 'c', 'd', 'a'],
      ['a', 'b', 'c', 'd'],
  )
  examples = [{'fa': np.array(values)} for values in fa_values_per_example]
  # Note that if two feature values have the same frequency, the one with the
  # lexicographically larger feature value will be higher in the order.
  fa_stats = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 4 }
        top_values { value: 'c' frequency: 3 }
        top_values { value: 'd' frequency: 2 }
        top_values { value: 'b' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
          buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertTransformOutputEqual(examples, top_k_generator, [fa_stats])
def test_topk_with_empty_dict(self):
  """Checks that a single example with no features yields no statistics."""
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  featureless_examples = [{}]
  no_stats = []
  self.assertSlicingAwareTransformOutputEqual(
      featureless_examples,
      top_k_generator,
      no_stats,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def expand(self, dataset):
  """Applies the statistics generators to `dataset`.

  The input is profiled and batched, optionally restricted to the
  whitelisted features, and then handed to
  `stats_impl.GenerateStatisticsImpl` together with the built-in generators
  plus any custom generators supplied through the options.

  Args:
    dataset: The input collection that the transforms are applied to.

  Returns:
    The result of applying `stats_impl.GenerateStatisticsImpl`.
  """
  options = self._options

  # Built-in generators: common, numeric, string, top-k and uniques.
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
      top_k_stats_generator.TopKStatsGenerator(
          schema=options.schema,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets),
      uniques_stats_generator.UniquesStatsGenerator(schema=options.schema),
  ]
  # Custom generators, when given, run after the built-in ones.
  if options.generators is not None:
    stats_generators.extend(options.generators)

  # Profile the raw examples, then group them into batches.
  examples = dataset | 'Profile' >> profile_util.Profile()
  examples = examples | 'BatchInputs' >> batch_util.BatchExamples()

  # Drop every feature that is not whitelisted, if a whitelist was provided.
  if options.feature_whitelist:
    examples = examples | 'RemoveNonWhitelistedFeatures' >> beam.Map(
        _filter_features, feature_whitelist=options.feature_whitelist)

  return examples | 'RunStatsGenerators' >> stats_impl.GenerateStatisticsImpl(
      stats_generators)
def _get_default_generators(options, in_memory=False):
  """Builds the default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: True when the generators will compute statistics in memory,
      False when they will be run using Beam.

  Returns:
    A list of stats generator objects.
  """
  # Generators that are used regardless of the execution mode.
  base_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
  ]

  # Both top-k flavours are configured with the same keyword arguments.
  top_k_kwargs = dict(
      schema=options.schema,
      weight_feature=options.weight_feature,
      num_top_values=options.num_top_values,
      num_rank_histogram_buckets=options.num_rank_histogram_buckets)

  if in_memory:
    # In memory, a single combiner generator covers top-k and uniques.
    return base_generators + [
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(**top_k_kwargs)
    ]

  # Otherwise separate top-k and uniques generators are used.
  return base_generators + [
      top_k_stats_generator.TopKStatsGenerator(**top_k_kwargs),
      uniques_stats_generator.UniquesStatsGenerator(schema=options.schema),
  ]
def test_topk_with_invalid_utf8_value(self):
  """Checks how a byte string that is not valid UTF-8 is reported.

  The expected statistics list the invalid byte string under the placeholder
  value '__BYTES_VALUE__' in both the top values and the rank histogram.
  """
  # fa: 3 'a', 2 b'\x80abc' (not valid UTF-8)
  # `dtype=object` is used instead of the `np.object` alias, which was
  # deprecated in NumPy 1.20 and removed in NumPy 1.24.
  examples = [{
      'fa': np.array(['a', b'\x80abc', 'a', b'\x80abc', 'a'], dtype=object)
  }]
  expected_result = [
      text_format.Parse(
          """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 3 }
        top_values { value: '__BYTES_VALUE__' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
          buckets {
            low_rank: 1
            high_rank: 1
            label: "__BYTES_VALUE__"
            sample_count: 2.0
          }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  ]
  generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_with_empty_list(self):
  """Checks that an empty list of batches yields an empty list of stats."""
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  no_batches = []
  no_stats = []
  self.assertTransformOutputEqual(no_batches, top_k_generator, no_stats)
def expand(self, dataset):
  """Applies the statistics generators to `dataset`, with optional sampling.

  The input is profiled, optionally sampled (by fixed count or by rate),
  batched, optionally restricted to the whitelisted features, and then
  handed to `stats_impl.GenerateStatisticsImpl`.

  Args:
    dataset: The input collection that the transforms are applied to.

  Returns:
    The result of applying `stats_impl.GenerateStatisticsImpl`.
  """
  options = self._options

  # Built-in generators: common, numeric, string, top-k and uniques.
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
      top_k_stats_generator.TopKStatsGenerator(
          schema=options.schema,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets),
      uniques_stats_generator.UniquesStatsGenerator(schema=options.schema),
  ]
  # Custom generators, when given, run after the built-in ones.
  if options.generators is not None:
    stats_generators.extend(options.generators)

  # Profile the input examples.
  examples = dataset | 'ProfileExamples' >> profile_util.Profile()

  # Sample the input if requested; a fixed sample count takes precedence
  # over a sample rate.
  sample_count = options.sample_count
  if sample_count is not None:
    # beam.combiners.Sample.FixedSizeGlobally returns a
    # PCollection[List[types.Example]], which is flattened back into a
    # PCollection[types.Example].
    examples = examples | (
        'SampleExamples(%s)' % sample_count
        >> beam.combiners.Sample.FixedSizeGlobally(sample_count)
        | 'FlattenExamples' >> beam.FlatMap(lambda sampled: sampled))
  elif options.sample_rate is not None:
    examples = examples | (
        'SampleExamplesAtRate(%s)' % options.sample_rate
        >> beam.FlatMap(_sample_at_rate, sample_rate=options.sample_rate))

  # Batch the input examples. The sample count, which may be None, is passed
  # through as the desired batch size.
  examples = examples | 'BatchExamples' >> batch_util.BatchExamples(
      desired_batch_size=sample_count)

  # Drop every feature that is not whitelisted, if a whitelist was provided.
  if options.feature_whitelist:
    examples = examples | 'RemoveNonWhitelistedFeatures' >> beam.Map(
        _filter_features, feature_whitelist=options.feature_whitelist)

  return examples | 'RunStatsGenerators' >> stats_impl.GenerateStatisticsImpl(
      stats_generators)
def test_topk_with_weights(self):
  """Checks unweighted and weighted top-k stats when a weight feature is set.

  The generator is configured with weight feature 'w'. Two results are
  expected for 'fa': one with the unweighted statistics and one whose
  statistics are nested under `weighted_string_stats`.
  """
  # non-weighted ordering
  # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  # `dtype=object` is used instead of the `np.object` alias, which was
  # deprecated in NumPy 1.20 and removed in NumPy 1.24.
  batches = [{
      'fa': np.array([
          np.array(['a', 'b', 'c', 'e']),
          np.array(['a', 'c', 'd', 'a'])
      ], dtype=object),
      'w': np.array([np.array([5.0]), np.array([5.0])])
  }, {
      'fa': np.array([np.array(['d', 'e'])], dtype=object),
      'w': np.array([np.array([15.0])])
  }]
  expected_result = [
      text_format.Parse(
          """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 3.0 }
        top_values { value: 'e' frequency: 2.0 }
        top_values { value: 'd' frequency: 2.0 }
        top_values { value: 'c' frequency: 2.0 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
          buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 }
          buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics()),
      text_format.Parse(
          """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        weighted_string_stats {
          top_values { value: 'e' frequency: 20.0 }
          top_values { value: 'd' frequency: 20.0 }
          top_values { value: 'a' frequency: 15.0 }
          top_values { value: 'c' frequency: 10.0 }
          rank_histogram {
            buckets {
              low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0
            }
            buckets {
              low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0
            }
            buckets {
              low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0
            }
          }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  ]
  generator = top_k_stats_generator.TopKStatsGenerator(
      weight_feature='w', num_top_values=4, num_rank_histogram_buckets=3)
  self.assertTransformOutputEqual(batches, generator, expected_result)
def test_topk_with_slicing(self):
  """Checks that top-k stats are reported separately for each slice key.

  Every input element is a (slice key, example) pair and every expected
  element is a (slice key, statistics proto) pair, with one proto per
  feature and slice.
  """
  examples = [
      ('slice1', {
          'fa': np.array(['a', 'b', 'c', 'e']),
          'fb': np.array(['1', '1', '0']),
      }),
      ('slice2', {
          'fa': np.array(['b', 'a', 'e', 'c']),
          'fb': np.array(['0', '0', '1']),
      }),
      ('slice1', {
          'fa': np.array(['a', 'c', 'd', 'a']),
          'fb': None,
      }),
      ('slice2', {
          'fa': np.array(['b', 'e', 'd', 'b']),
          'fb': None,
      }),
  ]

  # Note that if two feature values have the same frequency, the one with the
  # lexicographically larger feature value will be higher in the order.
  slice1_fa_stats = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 3 }
        top_values { value: 'c' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 2.0 }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatistics())
  slice1_fb_stats = text_format.Parse(
      """
    features {
      name: 'fb'
      type: STRING
      string_stats {
        top_values { value: '1' frequency: 2 }
        top_values { value: '0' frequency: 1 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "1" sample_count: 2.0 }
          buckets { low_rank: 1 high_rank: 1 label: "0" sample_count: 1.0 }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatistics())
  slice2_fa_stats = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'b' frequency: 3 }
        top_values { value: 'e' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "b" sample_count: 3.0 }
          buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatistics())
  slice2_fb_stats = text_format.Parse(
      """
    features {
      name: 'fb'
      type: STRING
      string_stats {
        top_values { value: '0' frequency: 2 }
        top_values { value: '1' frequency: 1 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "0" sample_count: 2.0 }
          buckets { low_rank: 1 high_rank: 1 label: "1" sample_count: 1.0 }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatistics())

  expected_result = [
      ('slice1', slice1_fa_stats),
      ('slice1', slice1_fb_stats),
      ('slice2', slice2_fa_stats),
      ('slice2', slice2_fb_stats),
  ]
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=2, num_rank_histogram_buckets=2)
  self.assertSlicingAwareTransformOutputEqual(
      examples, top_k_generator, expected_result)
def expand(self, dataset):
  """Runs every stats generator over `dataset` and merges their outputs.

  The input is batched and optionally restricted to the whitelisted
  features. Each generator is then applied according to its kind, and the
  per-generator outputs are flattened, merged and wrapped by
  `_make_dataset_feature_statistics_list_proto`.

  Args:
    dataset: The input collection that the transforms are applied to.

  Returns:
    The output of the final 'MakeDatasetFeatureStatisticsListProto' step.

  Raises:
    TypeError: If a generator is neither a CombinerStatsGenerator nor a
      TransformStatsGenerator.
  """
  options = self._options

  # Built-in generators: common, numeric, string, top-k and uniques.
  stats_generators = [
      common_stats_generator.CommonStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_values_histogram_buckets=options.num_values_histogram_buckets,
          epsilon=options.epsilon),
      numeric_stats_generator.NumericStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_histogram_buckets=options.num_histogram_buckets,
          num_quantiles_histogram_buckets=(
              options.num_quantiles_histogram_buckets),
          epsilon=options.epsilon),
      string_stats_generator.StringStatsGenerator(schema=options.schema),
      top_k_stats_generator.TopKStatsGenerator(
          schema=options.schema,
          weight_feature=options.weight_feature,
          num_top_values=options.num_top_values,
          num_rank_histogram_buckets=options.num_rank_histogram_buckets),
      uniques_stats_generator.UniquesStatsGenerator(schema=options.schema),
  ]
  # Custom generators, when given, run after the built-in ones.
  if options.generators is not None:
    stats_generators.extend(options.generators)

  # Batch the input examples. The sample count, which may be None, is passed
  # through as the desired batch size.
  batched = dataset | 'BatchExamples' >> batch_util.BatchExamples(
      desired_batch_size=options.sample_count)

  # Drop every feature that is not whitelisted, if a whitelist was provided.
  if options.feature_whitelist:
    batched = batched | 'RemoveNonWhitelistedFeatures' >> beam.Map(
        _filter_features, feature_whitelist=options.feature_whitelist)

  # Pick the transform that runs each generator:
  #   a) a CombinerStatsGenerator is wrapped as a beam.CombineFn and combined
  #      globally;
  #   b) a TransformStatsGenerator contributes its own ptransform.
  result_protos = []
  for generator in stats_generators:
    if isinstance(generator, stats_generator.CombinerStatsGenerator):
      transform = beam.CombineGlobally(_CombineFnWrapper(generator))
    elif isinstance(generator, stats_generator.TransformStatsGenerator):
      transform = generator.ptransform
    else:
      raise TypeError(
          'Statistics generator must extend one of '
          'CombinerStatsGenerator or TransformStatsGenerator, '
          'found object of type %s' % generator.__class__.__name__)
    result_protos.append(batched | generator.name >> transform)

  # Each stats generator outputs a PCollection of DatasetFeatureStatistics
  # protos. Flatten the list of PCollections into a single PCollection, merge
  # the protos it contains, and wrap the merged result into a
  # DatasetFeatureStatisticsList proto.
  flattened = result_protos | 'FlattenFeatureStatistics' >> beam.Flatten()
  merged = flattened | 'MergeDatasetFeatureStatisticsProtos' >> (
      beam.CombineGlobally(_merge_dataset_feature_stats_protos))
  return merged | 'MakeDatasetFeatureStatisticsListProto' >> beam.Map(
      _make_dataset_feature_statistics_list_proto)
def test_topk_with_missing_feature(self):
  """Checks top-k stats when features are None or absent in some examples.

  Some examples set a feature to None and others omit the 'fb' key
  altogether; statistics are still expected for both 'fa' and 'fb'.
  """
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  # fb: 1 'a', 1 'b', 2 'c'
  examples = [
      {'fa': np.array(['a', 'b', 'c', 'e']),
       'fb': np.array(['a', 'c', 'c'])},
      {'fa': None,
       'fb': np.array(['b'])},
      {'fa': np.array(['a', 'c', 'd']),
       'fb': None},
      {'fa': np.array(['a', 'a', 'b', 'c', 'd'])},
      {'fa': None},
  ]
  fa_stats = text_format.Parse(
      """
    features {
      name: 'fa'
      type: STRING
      string_stats {
        top_values { value: 'a' frequency: 4 }
        top_values { value: 'c' frequency: 3 }
        top_values { value: 'd' frequency: 2 }
        top_values { value: 'b' frequency: 2 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0 }
          buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0 }
          buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  fb_stats = text_format.Parse(
      """
    features {
      name: 'fb'
      type: STRING
      string_stats {
        top_values { value: 'c' frequency: 2 }
        top_values { value: 'b' frequency: 1 }
        top_values { value: 'a' frequency: 1 }
        rank_histogram {
          buckets { low_rank: 0 high_rank: 0 label: "c" sample_count: 2.0 }
          buckets { low_rank: 1 high_rank: 1 label: "b" sample_count: 1.0 }
          buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 1.0 }
        }
      }
    }""", statistics_pb2.DatasetFeatureStatistics())
  top_k_generator = top_k_stats_generator.TopKStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertTransformOutputEqual(
      examples, top_k_generator, [fa_stats, fb_stats])