def test_validate_stats_invalid_environment(self):
  """Validating with an environment not declared in the schema raises.

  The schema declares only TRAINING and SERVING environments, so asking
  validate_statistics to evaluate the 'INVALID' environment must fail with
  a ValueError.
  """
  statistics = statistics_pb2.DatasetFeatureStatisticsList()
  statistics.datasets.extend([statistics_pb2.DatasetFeatureStatistics()])
  schema = text_format.Parse(
      """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  # NOTE: assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
  # assertRaisesRegex is the supported spelling with identical behavior.
  with self.assertRaisesRegex(ValueError,
                              'Environment.*not found in the schema.*'):
    _ = validation_api.validate_statistics(
        statistics, schema, environment='INVALID')
def test_validate_stats_invalid_schema_input(self):
  """Passing a non-Schema object (a dict) as the schema raises TypeError."""
  statistics = statistics_pb2.DatasetFeatureStatisticsList()
  statistics.datasets.extend([statistics_pb2.DatasetFeatureStatistics()])
  # NOTE: assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
  # assertRaisesRegex is the supported spelling with identical behavior.
  with self.assertRaisesRegex(TypeError, '.*should be a Schema proto.*'):
    _ = validation_api.validate_statistics(statistics, {})
def test_validate_stats_invalid_statistics_input(self):
  """Passing a non-statistics object (a dict) as statistics raises TypeError."""
  schema = schema_pb2.Schema()
  # NOTE: assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
  # assertRaisesRegex is the supported spelling with identical behavior.
  with self.assertRaisesRegex(TypeError, 'statistics is of type.*'):
    _ = validation_api.validate_statistics({}, schema)
def test_validate_stats_with_previous_and_serving_stats(self):
  """Drift and skew are both reported when prior/serving stats are given.

  'bar' has a skew_comparator, so its distribution is checked against the
  serving stats; 'annotated_enum' has a drift_comparator, so it is checked
  against the previous (training) stats. Both comparisons exceed their
  thresholds and must surface the anomalies captured by the shared fixture
  strings on the test class.
  """
  # Current training-data statistics for 'bar' and 'annotated_enum'.
  current_stats = text_format.Parse(
      """
      datasets {
        num_examples: 10
        features {
          name: 'bar'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 1 }
              buckets { label: "b" sample_count: 2 }
              buckets { label: "c" sample_count: 7 }
            }
          }
        }
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 1 }
              buckets { label: "b" sample_count: 1 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  # Statistics from the previous training run (drift baseline).
  prior_stats = text_format.Parse(
      """
      datasets {
        num_examples: 10
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 10
              num_missing: 0
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
            }
          }
        }
        features {
          name: 'bar'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
              buckets { label: "c" sample_count: 6 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  # Statistics observed at serving time (skew baseline).
  serving_stats = text_format.Parse(
      """
      datasets {
        num_examples: 10
        features {
          name: 'bar'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
              buckets { label: "c" sample_count: 6 }
            }
          }
        }
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 10
              num_missing: 0
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  # Schema wires 'bar' to a skew check and 'annotated_enum' to a drift check.
  schema = text_format.Parse(
      """
      feature {
        name: 'bar'
        type: BYTES
        skew_comparator { infinity_norm { threshold: 0.1 } }
      }
      feature {
        name: "annotated_enum"
        type: BYTES
        domain: "annotated_enum"
        drift_comparator { infinity_norm { threshold: 0.01 } }
      }
      string_domain { name: "annotated_enum" value: "a" }
      """, schema_pb2.Schema())
  expected_anomalies = {
      'bar':
          text_format.Parse(self._bar_anomaly_info,
                            anomalies_pb2.AnomalyInfo()),
      'annotated_enum':
          text_format.Parse(self._annotated_enum_anomaly_info,
                            anomalies_pb2.AnomalyInfo()),
  }
  # Run validation with both baselines supplied and compare the result.
  observed_anomalies = validation_api.validate_statistics(
      current_stats,
      schema,
      previous_statistics=prior_stats,
      serving_statistics=serving_stats)
  self._assert_equal_anomalies(observed_anomalies, expected_anomalies)
def test_validate_stats_with_environment(self):
  """The 'label' feature is required in TRAINING but exempt in SERVING.

  The stats contain only 'feature', never 'label'. Because the schema marks
  'label' as not_in_environment: "SERVING", validating under TRAINING must
  flag the missing column while validating under SERVING must be clean.
  """
  stats = text_format.Parse(
      """
      datasets {
        num_examples: 1000
        features {
          name: 'feature'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 1000
              min_num_values: 1
              max_num_values: 1
            }
            unique: 3
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  schema = text_format.Parse(
      """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      feature {
        name: "feature"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_training_anomalies = {
      'label':
          text_format.Parse(
              """
              description: "Column is completely missing"
              severity: ERROR
              short_description: "Column dropped"
              reason {
                type: SCHEMA_MISSING_COLUMN
                short_description: "Column dropped"
                description: "Column is completely missing"
              }
              """, anomalies_pb2.AnomalyInfo()),
  }
  # TRAINING environment: the absent 'label' column is an anomaly.
  training_result = validation_api.validate_statistics(
      stats, schema, environment='TRAINING')
  self._assert_equal_anomalies(training_result, expected_training_anomalies)
  # SERVING environment: 'label' is exempt, so no anomalies are expected.
  serving_result = validation_api.validate_statistics(
      stats, schema, environment='SERVING')
  self._assert_equal_anomalies(serving_result, {})
def test_validate_stats(self):
  """A string value outside the schema's enum domain is flagged.

  The stats report the value "D" for 'annotated_enum', which is missing from
  the MyAloneEnum string domain (A, B, C); validation must report an
  ENUM_TYPE_UNEXPECTED_STRING_VALUES anomaly. The deprecated 'ignore_this'
  feature must not produce any anomaly.
  """
  stats = text_format.Parse(
      """
      datasets{
        num_examples: 10
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 3
              num_non_missing: 7
              min_num_values: 1
              max_num_values: 1
            }
            unique: 3
            rank_histogram {
              buckets { label: "D" sample_count: 1 }
            }
          }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 1 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D (?). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D (?). "
              }
              """, anomalies_pb2.AnomalyInfo()),
  }
  # Run validation and compare against the expected single anomaly.
  observed_anomalies = validation_api.validate_statistics(stats, schema)
  self._assert_equal_anomalies(observed_anomalies, expected_anomalies)