def test_topk_uniques_sketch_with_single_unicode_feature(self):
  """Top-k/uniques over a single unicode string feature."""
  # fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  input_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']])], ['fa']),
      pa.RecordBatch.from_arrays([pa.array([['a', 'b', 'c', 'd']])], ['fa']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              type: STRING
              string_stats {
                unique: 5
                top_values { value: 'a' frequency: 4 }
                top_values { value: 'c' frequency: 3 }
                top_values { value: 'd' frequency: 2 }
                top_values { value: 'b' frequency: 2 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_large_bytes_values(self):
  """Values longer than the size cutoff collapse into __LARGE_BYTES__."""
  # 4 'a', 3 large blob strings, 1 'b', 1 'c'
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([[b'a', b'b', b'f' * 1025, b'a'],
                    [b'a', b'f' * 1025, b'f' * 1026, b'a']]),
      ], ['fa']),
      pa.RecordBatch.from_arrays([
          pa.array([['c']]),
      ], ['fa']),
  ]
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 5
                top_values { value: "a" frequency: 4.0 }
                top_values { value: "__LARGE_BYTES__" frequency: 3.0 }
                top_values { value: "c" frequency: 1.0 }
                top_values { value: "b" frequency: 1.0 }
                rank_histogram {
                  buckets { label: "a" sample_count: 4.0 }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "__LARGE_BYTES__"
                    sample_count: 3.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "c" sample_count: 1.0
                  }
                }
              }
              """, statistics_pb2.FeatureNameStatistics())
  }
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_zero_row(self):
  """A batch with zero rows yields no statistics."""
  empty_column = pa.array([], type=pa.list_(pa.binary()))
  input_batches = [pa.RecordBatch.from_arrays([empty_column], ['f1'])]
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, {})
def test_schema_claimed_bytes(self):
  """Features the schema marks as bytes are skipped entirely."""
  schema = text_format.Parse(
      """
      feature {
        name: "a"
        type: BYTES # this makes the feature a bytes feature.
        image_domain { }
      }""", schema_pb2.Schema())
  input_batches = [pa.RecordBatch.from_arrays([pa.array([[b'aaa']])], ['a'])]
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=schema, num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected_feature_stats={})
def test_schema_claims_categorical_but_actually_float(self):
  """A schema-categorical INT feature that arrives as floats is skipped."""
  schema = text_format.Parse(
      """
      feature {
        name: "a"
        type: INT
        int_domain { is_categorical: true }
      }""", schema_pb2.Schema())
  float_column = pa.array([], type=pa.list_(pa.float32()))
  input_batches = [pa.RecordBatch.from_arrays([float_column], ['a'])]
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=schema, num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected_feature_stats={})
def _get_default_generators(
    options: stats_options.StatsOptions, in_memory: bool = False
) -> List[stats_generator.StatsGenerator]:
  """Builds the default list of stats generators.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.
  """
  basic = basic_stats_generator.BasicStatsGenerator(
      schema=options.schema,
      example_weight_map=options.example_weight_map,
      num_values_histogram_buckets=options.num_values_histogram_buckets,
      num_histogram_buckets=options.num_histogram_buckets,
      num_quantiles_histogram_buckets=options.num_quantiles_histogram_buckets,
      epsilon=options.epsilon)
  # The sketch-based top-k/uniques generator is used when explicitly enabled
  # via options, and always for in-memory computation.
  if options.experimental_use_sketch_based_topk_uniques or in_memory:
    topk_uniques = (
        top_k_uniques_sketch_stats_generator.TopKUniquesSketchStatsGenerator(
            schema=options.schema,
            example_weight_map=options.example_weight_map,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets,
            frequency_threshold=options.frequency_threshold,
            weighted_frequency_threshold=options.weighted_frequency_threshold,
            num_misragries_buckets=_DEFAULT_MG_SKETCH_SIZE,
            num_kmv_buckets=_DEFAULT_KMV_SKETCH_SIZE))
  else:
    topk_uniques = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        schema=options.schema,
        example_weight_map=options.example_weight_map,
        num_top_values=options.num_top_values,
        frequency_threshold=options.frequency_threshold,
        weighted_frequency_threshold=options.weighted_frequency_threshold,
        num_rank_histogram_buckets=options.num_rank_histogram_buckets)
  return [basic, topk_uniques]
def test_topk_struct_leaves(self):
  """Top-k/uniques are computed for each leaf under a STRUCT feature."""
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([[1.0], [2.0]]),
          pa.array([[{
              'f1': ['a', 'b'],
              'f2': [1, 2]
          }, {
              'f1': ['b'],
          }], [{
              'f1': ['c', 'd'],
              'f2': [2, 3]
          }, {
              'f2': [3]
          }]]),
      ], ['w', 'c']),
      pa.RecordBatch.from_arrays([
          pa.array([[3.0]]),
          pa.array([[{
              'f1': ['d'],
              'f2': [4]
          }]]),
      ], ['w', 'c']),
  ]
  # f2 is declared categorical inside the struct domain so its ints are
  # treated as string-valued categories.
  schema = text_format.Parse(
      """
      feature {
        name: "c"
        type: STRUCT
        struct_domain {
          feature {
            name: "f2"
            type: INT
            int_domain { is_categorical: true }
          }
        }
      }
      """, schema_pb2.Schema())
  expected = {
      types.FeaturePath(['c', 'f1']):
          text_format.Parse(
              """
              string_stats {
                unique: 4
                top_values { value: "d" frequency: 2.0 }
                top_values { value: "b" frequency: 2.0 }
                top_values { value: "c" frequency: 1.0 }
                rank_histogram {
                  buckets { label: "d" sample_count: 2.0 }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "c" sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values { value: "d" frequency: 5.0 }
                  top_values { value: "c" frequency: 2.0 }
                  top_values { value: "b" frequency: 2.0 }
                  rank_histogram {
                    buckets { label: "d" sample_count: 5.0 }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "c" sample_count: 2.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "b" sample_count: 2.0
                    }
                  }
                }
              }
              path { step: "c" step: "f1" }""",
              statistics_pb2.FeatureNameStatistics()),
      types.FeaturePath(['c', 'f2']):
          text_format.Parse(
              """
              string_stats {
                unique: 4
                top_values { value: "3" frequency: 2.0 }
                top_values { value: "2" frequency: 2.0 }
                top_values { value: "4" frequency: 1.0 }
                rank_histogram {
                  buckets { label: "3" sample_count: 2.0 }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "2" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "4" sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values { value: "3" frequency: 4.0 }
                  top_values { value: "4" frequency: 3.0 }
                  top_values { value: "2" frequency: 3.0 }
                  rank_histogram {
                    buckets { label: "3" sample_count: 4.0 }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "4" sample_count: 3.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "2" sample_count: 3.0
                    }
                  }
                }
              }
              path { step: "c" step: "f2" }""",
              statistics_pb2.FeatureNameStatistics()),
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=schema,
      example_weight_map=ExampleWeightMap(weight_feature='w'),
      num_top_values=3,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_with_frequency_threshold(self):
  """Values below the (weighted) frequency threshold are dropped."""
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'b', 'y', 'b']]),
          pa.array([[5.0]]),
      ], ['fa', 'w']),
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'x', 'a', 'z']]),
          pa.array([[15.0]]),
      ], ['fa', 'w'])
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 5
                top_values { value: 'a' frequency: 3 }
                top_values { value: 'b' frequency: 2 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0
                  }
                }
                weighted_string_stats {
                  top_values { value: 'a' frequency: 35.0 }
                  top_values { value: 'z' frequency: 15.0 }
                  top_values { value: 'x' frequency: 15.0 }
                  rank_histogram {
                    buckets {
                      low_rank: 0 high_rank: 0 label: "a" sample_count: 35.0
                    }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "z" sample_count: 15.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "x" sample_count: 15.0
                    }
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=ExampleWeightMap(weight_feature='w'),
      num_top_values=5,
      frequency_threshold=2,
      weighted_frequency_threshold=15,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_with_categorical_numeric_feature(
    self, is_float):
  """Categorical INT/FLOAT features get string-valued top-k stats."""
  # fa: 4 12, 2 23, 2 34, 2 45

  def _map_nested_list(fn, val):
    # Recursively apply fn to every scalar in a nested list.
    if isinstance(val, list):
      return [_map_nested_list(fn, v) for v in val]
    return fn(val)

  data = [[[12, 23, 34, 12], [45, 23]], [[12, 12, 34, 45]]]
  if is_float == 'float':
    data = _map_nested_list(float, data)
    type_enum, domain = 'FLOAT', 'float_domain'
  else:
    type_enum, domain = 'INT', 'int_domain'
  input_batches = [
      pa.RecordBatch.from_arrays([pa.array(data[0])], ['fa']),
      pa.RecordBatch.from_arrays([pa.array(data[1])], ['fa']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 4
                top_values { value: '12' frequency: 4 }
                top_values { value: '45' frequency: 2 }
                top_values { value: '34' frequency: 2 }
                top_values { value: '23' frequency: 2 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "12" sample_count: 4.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "45" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "34" sample_count: 2.0
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  schema = text_format.Parse(
      """
      feature {
        name: "fa"
        type: %s
        %s { is_categorical: true }
      }
      """ % (type_enum, domain), schema_pb2.Schema())
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=schema, num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_empty_record_batch(self):
  """A record batch with no columns yields no statistics."""
  input_batches = [pa.RecordBatch.from_arrays([], [])]
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, {})
def test_topk_uniques_sketch_with_single_bytes_feature(self):
  """Top-k/uniques over a single binary (bytes) feature."""
  # 'fa': 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
                   type=pa.list_(pa.binary()))
      ], ['fa']),
      pa.RecordBatch.from_arrays(
          [pa.array([['a', 'b', 'c', 'd']], type=pa.list_(pa.binary()))],
          ['fa'])
  ]
  # Note that if two feature values have the same frequency, the one with the
  # lexicographically larger feature value will be higher in the order.
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 5
                top_values { value: 'a' frequency: 4 }
                top_values { value: 'c' frequency: 3 }
                top_values { value: 'd' frequency: 2 }
                top_values { value: 'b' frequency: 2 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_with_weights_custom_stats(self):
  """With store_output_in_custom_stats, results land in custom_stats."""
  # non-weighted ordering
  # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
          pa.array([[5.0], [5.0]]),
      ], ['fa', 'w']),
      pa.RecordBatch.from_arrays([
          pa.array([['d', 'e']]),
          pa.array([[15.0]]),
      ], ['fa', 'w']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              custom_stats {
                name: 'topk_sketch_rank_histogram'
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                  }
                }
              }
              custom_stats {
                name: 'weighted_topk_sketch_rank_histogram'
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0
                  }
                }
              }
              custom_stats {
                name: 'uniques_sketch_num_uniques'
                num: 5
              }""", statistics_pb2.FeatureNameStatistics())
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=ExampleWeightMap(weight_feature='w'),
      num_top_values=4,
      num_rank_histogram_buckets=3,
      store_output_in_custom_stats=True)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_with_int_weights(self):
  """Integer-typed weight columns are supported."""
  # non-weighted ordering
  # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
                   type=pa.list_(pa.binary())),
          pa.array([[5], [5]], type=pa.list_(pa.int32())),
      ], ['fa', 'w']),
      pa.RecordBatch.from_arrays([
          pa.array([['d', 'e']], type=pa.list_(pa.binary())),
          pa.array([[15]], type=pa.list_(pa.int32())),
      ], ['fa', 'w']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 5
                top_values { value: 'a' frequency: 3.0 }
                top_values { value: 'e' frequency: 2.0 }
                top_values { value: 'd' frequency: 2.0 }
                top_values { value: 'c' frequency: 2.0 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                  }
                }
                weighted_string_stats {
                  top_values { value: 'e' frequency: 20.0 }
                  top_values { value: 'd' frequency: 20.0 }
                  top_values { value: 'a' frequency: 15.0 }
                  top_values { value: 'c' frequency: 10.0 }
                  rank_histogram {
                    buckets {
                      low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0
                    }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0
                    }
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=ExampleWeightMap(weight_feature='w'),
      num_top_values=4,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_combiner_with_weights(self):
  """Per-feature weight overrides select different weight columns."""
  # non-weighted ordering
  # fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # fb: 1 'v', 1 'w', 1 'x', 1 'y', 1 'z'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  # fb: 6 'z', 4 'x', 4 'y', 4 'w', 2 'v'
  input_batches = [
      pa.RecordBatch.from_arrays([
          pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
          pa.array([['v'], ['w', 'x', 'y']]),
          pa.array([[5.0], [5.0]]),
          pa.array([[2.0], [4.0]]),
      ], ['fa', 'fb', 'w', 'w_b']),
      pa.RecordBatch.from_arrays([
          pa.array([['d', 'e']]),
          pa.array([['z']]),
          pa.array([[15.0]]),
          pa.array([[6.0]]),
      ], ['fa', 'fb', 'w', 'w_b']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              string_stats {
                unique: 5
                top_values { value: 'a' frequency: 3.0 }
                top_values { value: 'e' frequency: 2.0 }
                top_values { value: 'd' frequency: 2.0 }
                top_values { value: 'c' frequency: 2.0 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                  }
                }
                weighted_string_stats {
                  top_values { value: 'e' frequency: 20.0 }
                  top_values { value: 'd' frequency: 20.0 }
                  top_values { value: 'a' frequency: 15.0 }
                  top_values { value: 'c' frequency: 10.0 }
                  rank_histogram {
                    buckets {
                      low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0
                    }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0
                    }
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics()),
      types.FeaturePath(['fb']):
          text_format.Parse(
              """
              string_stats {
                unique: 5
                top_values { value: "z" frequency: 1.0 }
                top_values { value: "y" frequency: 1.0 }
                top_values { value: "x" frequency: 1.0 }
                top_values { value: "w" frequency: 1.0 }
                rank_histogram {
                  buckets { label: "z" sample_count: 1.0 }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "y" sample_count: 1.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "x" sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values { value: "z" frequency: 6.0 }
                  top_values { value: "y" frequency: 4.0 }
                  top_values { value: "x" frequency: 4.0 }
                  top_values { value: "w" frequency: 4.0 }
                  rank_histogram {
                    buckets { label: "z" sample_count: 6.0 }
                    buckets {
                      low_rank: 1 high_rank: 1 label: "y" sample_count: 4.0
                    }
                    buckets {
                      low_rank: 2 high_rank: 2 label: "x" sample_count: 4.0
                    }
                  }
                }
              }
              path { step: "fb" }""", statistics_pb2.FeatureNameStatistics()),
  }
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=ExampleWeightMap(
          weight_feature='w',
          per_feature_override={types.FeaturePath(['fb']): 'w_b'}),
      num_top_values=4,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)
def test_topk_uniques_sketch_with_categorical_feature(self):
  """A schema-categorical INT feature is reported with string stats."""
  # fa: 4 12, 2 23, 2 34, 2 45
  input_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([[12, 23, 34, 12], [45, 23]])], ['fa']),
      pa.RecordBatch.from_arrays([pa.array([[12, 12, 34, 45]])], ['fa']),
  ]
  expected = {
      types.FeaturePath(['fa']):
          text_format.Parse(
              """
              path { step: 'fa' }
              type: INT
              string_stats {
                unique: 4
                top_values { value: '12' frequency: 4 }
                top_values { value: '45' frequency: 2 }
                top_values { value: '34' frequency: 2 }
                top_values { value: '23' frequency: 2 }
                rank_histogram {
                  buckets {
                    low_rank: 0 high_rank: 0 label: "12" sample_count: 4.0
                  }
                  buckets {
                    low_rank: 1 high_rank: 1 label: "45" sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2 high_rank: 2 label: "34" sample_count: 2.0
                  }
                }
              }""", statistics_pb2.FeatureNameStatistics())
  }
  schema = text_format.Parse(
      """
      feature {
        name: "fa"
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  gen = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=schema, num_top_values=4, num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(input_batches, gen, expected)