コード例 #1
0
    def test_topk_uniques_with_invalid_utf8_value(self):
        """Checks top-k/uniques output for a value that is not valid UTF-8.

        Feature 'fa' holds three b'a' entries and two copies of a byte string
        that starts with 0x80. The expected protos list the latter under the
        '__BYTES_VALUE__' label.
        """
        fa_column = pa.array([[b'a', b'\x80abc', b'a', b'\x80abc', b'a']])
        input_tables = [pa.Table.from_arrays([fa_column], ['fa'])]

        # Expected top-k values and rank histogram for 'fa'.
        top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: '__BYTES_VALUE__'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "__BYTES_VALUE__"
              sample_count: 2.0
            }
          }
        }
    }"""
        # Expected unique count for 'fa'.
        uniques_pbtxt = """
    features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 2
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (top_k_pbtxt, uniques_pbtxt)
        ]

        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                num_rank_histogram_buckets=3, num_top_values=4))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #2
0
 def test_topk_uniques_with_empty_input(self):
     """Feeds no inputs at all and expects no statistics in return."""
     no_inputs, no_stats = [], []
     topk_generator = (
         top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
             num_rank_histogram_buckets=3, num_top_values=4))
     self.assertSlicingAwareTransformOutputEqual(
         no_inputs, topk_generator, no_stats)
コード例 #3
0
def _get_default_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Assembles the stats generators that are used by default.

    The result always starts with a BasicStatsGenerator and is followed by
    exactly one top-k/uniques generator, chosen from `options` and `in_memory`.

    Args:
      options: StatsOptions supplying the schema, example weight map, bucket
        counts and thresholds that are forwarded to the generators.
      in_memory: When the sketch-based option is off, True selects the
        combiner-based top-k/uniques generator and False the regular one.
        As documented by the caller contract: True means statistics are
        generated in memory, False means they are generated with Beam.

    Returns:
      A two-element list: the basic generator, then the selected
      top-k/uniques generator.
    """
    basic_generator = basic_stats_generator.BasicStatsGenerator(
        schema=options.schema,
        example_weight_map=options.example_weight_map,
        num_values_histogram_buckets=options.num_values_histogram_buckets,
        num_histogram_buckets=options.num_histogram_buckets,
        num_quantiles_histogram_buckets=(
            options.num_quantiles_histogram_buckets),
        epsilon=options.epsilon)

    # Choose the implementation first; only the sketch-based variant receives
    # the two sketch-size arguments.
    sketch_only_kwargs = {}
    if options.experimental_use_sketch_based_topk_uniques:
        topk_cls = (top_k_uniques_sketch_stats_generator.
                    TopKUniquesSketchStatsGenerator)
        sketch_only_kwargs = {
            'num_misragries_buckets': _DEFAULT_MG_SKETCH_SIZE,
            'num_kmv_buckets': _DEFAULT_KMV_SKETCH_SIZE,
        }
    elif in_memory:
        topk_cls = (top_k_uniques_combiner_stats_generator.
                    TopKUniquesCombinerStatsGenerator)
    else:
        topk_cls = top_k_uniques_stats_generator.TopKUniquesStatsGenerator

    # Every variant takes the same six option-derived arguments.
    topk_generator = topk_cls(
        schema=options.schema,
        example_weight_map=options.example_weight_map,
        num_top_values=options.num_top_values,
        num_rank_histogram_buckets=options.num_rank_histogram_buckets,
        frequency_threshold=options.frequency_threshold,
        weighted_frequency_threshold=options.weighted_frequency_threshold,
        **sketch_only_kwargs)
    return [basic_generator, topk_generator]
コード例 #4
0
 def test_topk_uniques_with_empty_table(self):
     """A single table without any columns is expected to yield no stats."""
     columnless_table = pa.Table.from_arrays([], [])
     input_tables = [columnless_table]
     no_stats = []
     topk_generator = (
         top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
             num_rank_histogram_buckets=3, num_top_values=4))
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         topk_generator,
         no_stats,
         add_default_slice_key_to_output=True,
         add_default_slice_key_to_input=True)
コード例 #5
0
ファイル: stats_impl.py プロジェクト: wsuchy/data-validation
def _get_default_generators(
        options: stats_options.StatsOptions,
        in_memory: bool = False) -> List[stats_generator.StatsGenerator]:
    """Assembles the stats generators that are used by default.

    The result holds a BasicStatsGenerator, a NumExamplesStatsGenerator and
    one top-k/uniques generator, in that order.

    Args:
      options: StatsOptions supplying the schema, weight feature, bucket
        counts and thresholds that are forwarded to the generators.
      in_memory: True selects the combiner-based top-k/uniques generator,
        False the regular one. As documented by the caller contract: True
        means statistics are generated in memory, False means with Beam.

    Returns:
      A list of three stats generator objects.
    """
    default_generators = [
        basic_stats_generator.BasicStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_values_histogram_buckets=options.num_values_histogram_buckets,
            num_histogram_buckets=options.num_histogram_buckets,
            num_quantiles_histogram_buckets=(
                options.num_quantiles_histogram_buckets),
            epsilon=options.epsilon),
        NumExamplesStatsGenerator(options.weight_feature),
    ]

    # Both top-k/uniques variants are constructed with identical arguments,
    # so only the class differs between the two modes.
    if in_memory:
        topk_cls = (top_k_uniques_combiner_stats_generator.
                    TopKUniquesCombinerStatsGenerator)
    else:
        topk_cls = top_k_uniques_stats_generator.TopKUniquesStatsGenerator

    default_generators.append(
        topk_cls(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            frequency_threshold=options.frequency_threshold,
            weighted_frequency_threshold=options.weighted_frequency_threshold,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets))
    return default_generators
 def test_schema_claims_categorical_but_actually_float(self):
   """Schema says 'a' is a categorical INT, but the column is list<float32>.

   The only input batch has zero rows and no statistics are expected.
   """
   categorical_int_schema = text_format.Parse("""
   feature {
     name: "a"
     type: INT
     int_domain { is_categorical: true }
   }""", schema_pb2.Schema())
   empty_float_column = pa.array([], type=pa.list_(pa.float32()))
   record_batches = [
       pa.RecordBatch.from_arrays([empty_float_column], ['a'])
   ]
   topk_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
       num_rank_histogram_buckets=3,
       num_top_values=3,
       schema=categorical_int_schema)
   self.assertSlicingAwareTransformOutputEqual(
       record_batches,
       topk_generator,
       expected_results=[],
       add_default_slice_key_to_output=True,
       add_default_slice_key_to_input=True)
コード例 #7
0
    def test_topk_uniques_with_categorical_feature(self):
        """Checks top-k/uniques for an INT feature marked categorical.

        Across the first table 'fa' contains 12 four times and 23, 34 and 45
        twice each; the second table contributes only nulls. The generator
        is asked for two top values and three rank-histogram buckets.
        """
        int_table = pa.Table.from_arrays(
            [pa.array([[12, 23, 34, 12], [45, 23], [12, 12, 34, 45]])],
            ['fa'])
        all_null_table = pa.Table.from_arrays(
            [pa.array([None, None], type=pa.null())], ['fa'])
        input_tables = [int_table, all_null_table]

        # Expected top-k values and rank histogram for 'fa'.
        top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          top_values {
            value: '12'
            frequency: 4
          }
          top_values {
            value: '45'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "12"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "45"
              sample_count: 2.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "34"
              sample_count: 2.0
            }
          }
        }
    }"""
        # Expected unique count for 'fa'.
        uniques_pbtxt = """
    features {
        path {
          step: 'fa'
        }
        type: INT
        string_stats {
          unique: 4
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (top_k_pbtxt, uniques_pbtxt)
        ]

        schema_pbtxt = """
        feature {
          name: "fa"
          type: INT
          int_domain {
            is_categorical: true
          }
        }
        """
        categorical_schema = text_format.Parse(schema_pbtxt,
                                               schema_pb2.Schema())
        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                schema=categorical_schema,
                num_rank_histogram_buckets=3,
                num_top_values=2))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #8
0
    def test_topk_uniques_with_bytes_feature(self):
        """Checks output when the schema tags one feature as image BYTES.

        Value counts in the input:
          fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
          fb: 1 'a', 2 'b', 3 'c'
        The schema declares 'fb' as BYTES with an image_domain, and the
        expected result contains statistics for 'fa' only.
        """
        fa_column = pa.array([['a', 'b', 'c', 'e'], None, ['a', 'c', 'd'],
                              ['a', 'a', 'b', 'c', 'd'], None])
        fb_column = pa.array([['a', 'c', 'c'], ['b'], None, None, ['b', 'c']])
        input_tables = [
            pa.Table.from_arrays([fa_column, fb_column], ['fa', 'fb'])
        ]

        # Expected top-k values and rank histogram for 'fa'.
        top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }"""
        # Expected unique count for 'fa'.
        uniques_pbtxt = """
    features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (top_k_pbtxt, uniques_pbtxt)
        ]

        schema_pbtxt = """
        feature {
          name: "fb"
          type: BYTES
          image_domain { }
        }
        """
        image_schema = text_format.Parse(schema_pbtxt, schema_pb2.Schema())
        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                schema=image_schema,
                num_rank_histogram_buckets=3,
                num_top_values=4))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #9
0
    def test_topk_uniques_with_single_unicode_feature(self):
        """Checks top-k/uniques for one feature given as unicode literals.

        Value counts in the input:
          fa: 4 'a', 2 'b', 3 'c', 2 'd', 1 'e'
        """
        fa_column = pa.array([
            [u'a', u'b', u'c', u'e'],
            [u'a', u'c', u'd', u'a'],
            [u'a', u'b', u'c', u'd'],
        ])
        input_tables = [pa.Table.from_arrays([fa_column], ['fa'])]

        # Expected top-k values and rank histogram for 'fa'.
        top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
    }"""
        # Expected unique count for 'fa'.
        uniques_pbtxt = """
    features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (top_k_pbtxt, uniques_pbtxt)
        ]

        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                num_rank_histogram_buckets=3, num_top_values=4))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #10
0
    def test_topk_uniques_with_weights(self):
        """Checks unweighted and weighted top-k plus uniques with weights.

        Column 'w' carries one weight per example (5, 5 and 15).
        Unweighted ordering of 'fa':
          3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
        Weighted ordering of 'fa':
          20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
        """
        fa_column = pa.array([
            ['a', 'b', 'c', 'e'],
            ['a', 'c', 'd', 'a'],
            ['d', 'e'],
        ])
        weight_column = pa.array([[5.0], [5.0], [15.0]])
        input_tables = [
            pa.Table.from_arrays([fa_column, weight_column], ['fa', 'w'])
        ]

        # Expected unweighted top-k values and rank histogram.
        unweighted_top_k_pbtxt = """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                top_values {
                  value: 'a'
                  frequency: 3.0
                }
                top_values {
                  value: 'e'
                  frequency: 2.0
                }
                top_values {
                  value: 'd'
                  frequency: 2.0
                }
                top_values {
                  value: 'c'
                  frequency: 2.0
                }
                rank_histogram {
                  buckets {
                    low_rank: 0
                    high_rank: 0
                    label: "a"
                    sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "e"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "d"
                    sample_count: 2.0
                  }
                }
              }
            }"""
        # Expected weighted top-k values and rank histogram.
        weighted_top_k_pbtxt = """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: 'e'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'd'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'a'
                    frequency: 15.0
                  }
                  top_values {
                    value: 'c'
                    frequency: 10.0
                  }
                  rank_histogram {
                    buckets {
                      low_rank: 0
                      high_rank: 0
                      label: "e"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "d"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "a"
                      sample_count: 15.0
                    }
                  }
                }
              }
        }"""
        # Expected unique count.
        uniques_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (unweighted_top_k_pbtxt, weighted_top_k_pbtxt,
                          uniques_pbtxt)
        ]

        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                weight_feature='w',
                num_rank_histogram_buckets=3,
                num_top_values=4))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #11
0
 def test_topk_uniques_with_struct_leaves(self):
     """Checks top-k/uniques for leaf features nested inside a struct.

     Column 'c' is a list of structs with leaves 'f1' (strings) and 'f2'
     (ints); column 'w' is the per-example weight. The schema marks 'c.f2'
     as a categorical INT. Expected output covers, for both leaves, the
     unweighted top-k, the unique count and the weighted top-k.
     """
     first_table = pa.Table.from_arrays([
         pa.array([[1.0], [2.0]]),
         pa.array([[{
             'f1': ['a', 'b'],
             'f2': [1, 2]
         }, {
             'f1': ['b'],
         }], [{
             'f1': ['c', 'd'],
             'f2': [2, 3]
         }, {
             'f2': [3]
         }]]),
     ], ['w', 'c'])
     second_table = pa.Table.from_arrays([
         pa.array([[3.0]]),
         pa.array([[{
             'f1': ['d'],
             'f2': [4]
         }]]),
     ], ['w', 'c'])
     input_tables = [first_table, second_table]

     # Unweighted top-k for c.f1.
     f1_top_k_pbtxt = """
         features{
           type: STRING
           string_stats {
             top_values {
               value: "d"
               frequency: 2.0
             }
             top_values {
               value: "b"
               frequency: 2.0
             }
             top_values {
               value: "c"
               frequency: 1.0
             }
             rank_histogram {
               buckets {
                 label: "d"
                 sample_count: 2.0
               }
               buckets {
                 low_rank: 1
                 high_rank: 1
                 label: "b"
                 sample_count: 2.0
               }
               buckets {
                 low_rank: 2
                 high_rank: 2
                 label: "c"
                 sample_count: 1.0
               }
             }
           }
           path {
             step: "c"
             step: "f1"
           }
         }"""
     # Unweighted top-k for c.f2.
     f2_top_k_pbtxt = """
         features {
           string_stats {
             top_values {
               value: "3"
               frequency: 2.0
             }
             top_values {
               value: "2"
               frequency: 2.0
             }
             top_values {
               value: "4"
               frequency: 1.0
             }
             rank_histogram {
               buckets {
                 label: "3"
                 sample_count: 2.0
               }
               buckets {
                 low_rank: 1
                 high_rank: 1
                 label: "2"
                 sample_count: 2.0
               }
               buckets {
                 low_rank: 2
                 high_rank: 2
                 label: "4"
                 sample_count: 1.0
               }
             }
           }
           path {
             step: "c"
             step: "f2"
           }
         }"""
     # Unique count for c.f1.
     f1_uniques_pbtxt = """
         features {
           type: STRING
           string_stats {
             unique: 4
           }
           path {
             step: "c"
             step: "f1"
           }
         }"""
     # Unique count for c.f2.
     f2_uniques_pbtxt = """
         features {
           type: INT
           string_stats {
             unique: 4
           }
           path {
             step: "c"
             step: "f2"
           }
         }"""
     # Weighted top-k for c.f1.
     f1_weighted_top_k_pbtxt = """
         features {
           type: STRING
           string_stats {
             weighted_string_stats {
               top_values {
                 value: "d"
                 frequency: 5.0
               }
               top_values {
                 value: "c"
                 frequency: 2.0
               }
               top_values {
                 value: "b"
                 frequency: 2.0
               }
               rank_histogram {
                 buckets {
                   label: "d"
                   sample_count: 5.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "c"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "b"
                   sample_count: 2.0
                 }
               }
             }
           }
           path {
             step: "c"
             step: "f1"
           }
         }"""
     # Weighted top-k for c.f2.
     f2_weighted_top_k_pbtxt = """
         features {
           string_stats {
             weighted_string_stats {
               top_values {
                 value: "3"
                 frequency: 4.0
               }
               top_values {
                 value: "4"
                 frequency: 3.0
               }
               top_values {
                 value: "2"
                 frequency: 3.0
               }
               rank_histogram {
                 buckets {
                   label: "3"
                   sample_count: 4.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "4"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "2"
                   sample_count: 3.0
                 }
               }
             }
           }
           path {
             step: "c"
             step: "f2"
           }
         }"""
     expected_stats = [
         text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
         for pbtxt in (
             f1_top_k_pbtxt,
             f2_top_k_pbtxt,
             f1_uniques_pbtxt,
             f2_uniques_pbtxt,
             f1_weighted_top_k_pbtxt,
             f2_weighted_top_k_pbtxt,
         )
     ]

     schema_pbtxt = """
     feature {
       name: "c"
       type: STRUCT
       struct_domain {
         feature {
           name: "f2"
           type: INT
           int_domain {
             is_categorical: true
           }
         }
       }
     }
     """
     struct_schema = text_format.Parse(schema_pbtxt, schema_pb2.Schema())
     topk_generator = (
         top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
             schema=struct_schema,
             weight_feature='w',
             num_rank_histogram_buckets=3,
             num_top_values=3))
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         topk_generator,
         expected_stats,
         add_default_slice_key_to_output=True,
         add_default_slice_key_to_input=True)
コード例 #12
0
    def test_topk_uniques_with_slicing(self):
        """Checks that top-k/uniques are reported separately per slice key.

        Inputs are (slice_key, table) pairs for 'slice1' and 'slice2'; each
        slice receives one table with 'fa' and 'fb' and one with 'fa' only.
        Expected outputs are (slice_key, stats proto) pairs.
        """
        slice1_both = pa.Table.from_arrays([
            pa.array([['a', 'b', 'c', 'e']]),
            pa.array([['1', '1', '0']])
        ], ['fa', 'fb'])
        slice2_both = pa.Table.from_arrays([
            pa.array([['b', 'a', 'e', 'c']]),
            pa.array([['0', '0', '1']])
        ], ['fa', 'fb'])
        slice1_fa_only = pa.Table.from_arrays(
            [pa.array([['a', 'c', 'd', 'a']])], ['fa'])
        slice2_fa_only = pa.Table.from_arrays(
            [pa.array([['b', 'e', 'd', 'b']])], ['fa'])
        sliced_tables = [
            ('slice1', slice1_both),
            ('slice2', slice2_both),
            ('slice1', slice1_fa_only),
            ('slice2', slice2_fa_only),
        ]

        # Tie-breaking rule as noted by the test author: among values with
        # equal frequency, the lexicographically larger value ranks higher.
        slice1_fa_top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: 'c'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 2.0
            }
          }
        }
      }
    """
        slice1_fb_top_k_pbtxt = """
      features {
        path {
          step: 'fb'
        }
        type: STRING
        string_stats {
          top_values {
            value: '1'
            frequency: 2
          }
          top_values {
            value: '0'
            frequency: 1
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "1"
              sample_count: 2.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "0"
              sample_count: 1.0
            }
          }
        }
      }
    """
        slice1_fa_uniques_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
    }"""
        slice1_fb_uniques_pbtxt = """
      features {
        path {
          step: 'fb'
        }
        type: STRING
        string_stats {
          unique: 2
        }
    }"""
        slice2_fa_top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'b'
            frequency: 3
          }
          top_values {
            value: 'e'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "b"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "e"
              sample_count: 2.0
            }
          }
        }
      }
    """
        slice2_fb_top_k_pbtxt = """
      features {
        path {
          step: 'fb'
        }
        type: STRING
        string_stats {
          top_values {
            value: '0'
            frequency: 2
          }
          top_values {
            value: '1'
            frequency: 1
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "0"
              sample_count: 2.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "1"
              sample_count: 1.0
            }
          }
        }
      }
    """
        slice2_fa_uniques_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
    }"""
        slice2_fb_uniques_pbtxt = """
      features {
        path {
          step: 'fb'
        }
        type: STRING
        string_stats {
          unique: 2
        }
    }"""
        expected_pbtxt_by_slice = [
            ('slice1', slice1_fa_top_k_pbtxt),
            ('slice1', slice1_fb_top_k_pbtxt),
            ('slice1', slice1_fa_uniques_pbtxt),
            ('slice1', slice1_fb_uniques_pbtxt),
            ('slice2', slice2_fa_top_k_pbtxt),
            ('slice2', slice2_fb_top_k_pbtxt),
            ('slice2', slice2_fa_uniques_pbtxt),
            ('slice2', slice2_fb_uniques_pbtxt),
        ]
        expected_stats = [
            (slice_key,
             text_format.Parse(pbtxt,
                               statistics_pb2.DatasetFeatureStatistics()))
            for slice_key, pbtxt in expected_pbtxt_by_slice
        ]

        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                num_rank_histogram_buckets=2, num_top_values=2))
        self.assertSlicingAwareTransformOutputEqual(
            sliced_tables, topk_generator, expected_stats)
コード例 #13
0
    def test_topk_uniques_with_frequency_threshold(self):
        """Checks top-k output when frequency thresholds are configured.

        The generator is built with frequency_threshold=2 and
        weighted_frequency_threshold=15. Example weights in 'w' are 5 and 15.
        The expected unweighted top values are 'a' (3) and 'b' (2); the
        expected weighted ones are 'a' (35), 'z' (15) and 'x' (15).
        """
        fa_column = pa.array([['a', 'b', 'y', 'b'], ['a', 'x', 'a', 'z']])
        weight_column = pa.array([[5.0], [15.0]])
        input_tables = [
            pa.Table.from_arrays([fa_column, weight_column], ['fa', 'w'])
        ]

        # Expected unweighted top-k values and rank histogram.
        unweighted_top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "b"
              sample_count: 2.0
            }
          }
        }
    }"""
        # Expected weighted top-k values and rank histogram.
        weighted_top_k_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          weighted_string_stats {
            top_values {
              value: 'a'
              frequency: 35.0
            }
            top_values {
              value: 'z'
              frequency: 15.0
            }
            top_values {
              value: 'x'
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                low_rank: 0
                high_rank: 0
                label: "a"
                sample_count: 35.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "z"
                sample_count: 15.0
              }
              buckets {
                low_rank: 2
                high_rank: 2
                label: "x"
                sample_count: 15.0
              }
            }
          }
        }
    }"""
        # Expected unique count.
        uniques_pbtxt = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }"""
        expected_stats = [
            text_format.Parse(pbtxt, statistics_pb2.DatasetFeatureStatistics())
            for pbtxt in (unweighted_top_k_pbtxt, weighted_top_k_pbtxt,
                          uniques_pbtxt)
        ]

        topk_generator = (
            top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
                weight_feature='w',
                num_top_values=5,
                num_rank_histogram_buckets=3,
                frequency_threshold=2,
                weighted_frequency_threshold=15))
        self.assertSlicingAwareTransformOutputEqual(
            input_tables,
            topk_generator,
            expected_stats,
            add_default_slice_key_to_output=True,
            add_default_slice_key_to_input=True)
コード例 #14
0
  def test_topk_uniques_with_single_string_feature(self):
    # Value counts for 'fa': 4 'a', 3 'c', 2 'd', 2 'b', 1 'e'.
    fa_values = pa.array([
        ['a', 'b', 'c', 'e'],
        ['a', 'c', 'd', 'a'],
        ['a', 'b', 'c', 'd'],
    ])
    input_batches = [pa.RecordBatch.from_arrays([fa_values], ['fa'])]

    def parse_stats(text_proto):
      return text_format.Parse(text_proto,
                               statistics_pb2.DatasetFeatureStatistics())

    # When two values share a frequency, the lexicographically larger value
    # is expected first, hence 'd' ahead of 'b'.
    expected_top_k = parse_stats(
        """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          top_values {
            value: 'd'
            frequency: 2
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
    }""")
    expected_uniques = parse_stats(
        """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }""")
    expected_stats = [expected_top_k, expected_uniques]

    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        num_top_values=4,
        num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        input_batches,
        stats_generator,
        expected_stats,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
コード例 #15
0
  def test_topk_uniques_with_weights(self):
    # 'fa' is weighted by 'w' (5.0, 5.0, 15.0 per row); 'fb' is weighted by
    # the per-feature override 'w_b' (2, 4, 6 per row).
    #
    # Unweighted ordering:
    #   fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
    #   fb: 1 'z', 1 'y', 1 'x', 1 'w', 1 'v'
    # Weighted ordering:
    #   fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
    #   fb: 6 'z', 4 'y', 4 'x', 4 'w', 2 'v'
    fa_values = pa.array([
        ['a', 'b', 'c', 'e'],
        ['a', 'c', 'd', 'a'],
        ['d', 'e'],
    ])
    default_weights = pa.array([[5.0], [5.0], [15.0]])
    fb_values = pa.array([['v'], ['w', 'x', 'y'], ['z']])
    fb_weights = pa.array([[2], [4], [6]])
    input_batches = [
        pa.RecordBatch.from_arrays(
            [fa_values, default_weights, fb_values, fb_weights],
            ['fa', 'w', 'fb', 'w_b'])
    ]

    def parse_stats(text_proto):
      return text_format.Parse(text_proto,
                               statistics_pb2.DatasetFeatureStatistics())

    # Unweighted top-k expectations.
    fa_top_k = parse_stats(
        """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                top_values {
                  value: 'a'
                  frequency: 3.0
                }
                top_values {
                  value: 'e'
                  frequency: 2.0
                }
                top_values {
                  value: 'd'
                  frequency: 2.0
                }
                top_values {
                  value: 'c'
                  frequency: 2.0
                }
                rank_histogram {
                  buckets {
                    low_rank: 0
                    high_rank: 0
                    label: "a"
                    sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "e"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "d"
                    sample_count: 2.0
                  }
                }
              }
            }""")
    fb_top_k = parse_stats(
        """
            features {
              type: STRING
              string_stats {
                top_values {
                  value: "z"
                  frequency: 1.0
                }
                top_values {
                  value: "y"
                  frequency: 1.0
                }
                top_values {
                  value: "x"
                  frequency: 1.0
                }
                top_values {
                  value: "w"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "z"
                    sample_count: 1.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "y"
                    sample_count: 1.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "x"
                    sample_count: 1.0
                  }
                }
              }
              path {
                step: "fb"
              }
            }""")

    # Weighted top-k expectations.
    fa_weighted_top_k = parse_stats(
        """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: 'e'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'd'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'a'
                    frequency: 15.0
                  }
                  top_values {
                    value: 'c'
                    frequency: 10.0
                  }
                  rank_histogram {
                    buckets {
                      low_rank: 0
                      high_rank: 0
                      label: "e"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "d"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "a"
                      sample_count: 15.0
                    }
                  }
                }
              }
            }""")
    fb_weighted_top_k = parse_stats(
        """
            features {
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: "z"
                    frequency: 6.0
                  }
                  top_values {
                    value: "y"
                    frequency: 4.0
                  }
                  top_values {
                    value: "x"
                    frequency: 4.0
                  }
                  top_values {
                    value: "w"
                    frequency: 4.0
                  }
                  rank_histogram {
                    buckets {
                      label: "z"
                      sample_count: 6.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "y"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "x"
                      sample_count: 4.0
                    }
                  }
                }
              }
              path {
                step: "fb"
              }
            }""")

    # Unique-count expectations.
    fa_uniques = parse_stats(
        """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                unique: 5
              }
            }""")
    fb_uniques = parse_stats(
        """
            features {
              type: STRING
              string_stats {
                unique: 5
              }
              path {
                step: "fb"
              }
            }""")

    expected_stats = [
        fa_top_k,
        fb_top_k,
        fa_weighted_top_k,
        fb_weighted_top_k,
        fa_uniques,
        fb_uniques,
    ]

    weight_map = ExampleWeightMap(
        weight_feature='w',
        per_feature_override={types.FeaturePath(['fb']): 'w_b'})
    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        example_weight_map=weight_map,
        num_top_values=4,
        num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        input_batches,
        stats_generator,
        expected_stats,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
コード例 #16
0
    def test_topk_uniques_with_numeric_feature(self):
        # Value counts for 'fa': 4 'a', 3 'c', 2 'd', 2 'b', 1 'e'.
        # 'fb' carries float values; the expected output below lists
        # statistics for 'fa' only.
        input_examples = [
            {
                'fa': np.array(['a', 'b', 'c', 'e']),
                'fb': np.array([1.0, 2.0, 3.0]),
            },
            {'fa': None, 'fb': np.array([4.0, 5.0])},
            {'fa': np.array(['a', 'c', 'd']), 'fb': None},
            {'fa': np.array(['a', 'a', 'b', 'c', 'd']), 'fb': None},
        ]

        # num_top_values=2 caps the expected top_values at two entries, while
        # the rank histogram is still expected to have three buckets.
        expected_top_k = text_format.Parse(
            """
      features {
        name: 'fa'
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 4
          }
          top_values {
            value: 'c'
            frequency: 3
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
    }""", statistics_pb2.DatasetFeatureStatistics())
        expected_uniques = text_format.Parse(
            """
    features {
        name: 'fa'
        type: STRING
        string_stats {
          unique: 5
        }
      }""", statistics_pb2.DatasetFeatureStatistics())
        expected_stats = [expected_top_k, expected_uniques]

        stats_gen = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
            num_top_values=2,
            num_rank_histogram_buckets=3)
        self.assertSlicingAwareTransformOutputEqual(
            input_examples,
            stats_gen,
            expected_stats,
            add_default_slice_key_to_input=True,
            add_default_slice_key_to_output=True)