def test_validate_instance_global_only_anomaly_type(self):
  """Checks that dataset-level anomaly types are dropped per example.

  This schema has a presence.min_count > 1, which will generate an anomaly
  of type FEATURE_TYPE_LOW_NUMBER_PRESENT when any single example is
  validated using this schema. This test checks that this anomaly type
  (which is not meaningful in per-example validation) is not included in the
  Anomalies proto that validate_instance returns.
  """
  instance = {'annotated_enum': np.array(['D'])}
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 5 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  # Only the ENUM anomaly is expected; FEATURE_TYPE_LOW_NUMBER_PRESENT
  # must have been filtered out of the per-example result.
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D "
                "(~100%). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D "
                  "(~100%). "
              }
              """, anomalies_pb2.AnomalyInfo())
  }
  options = stats_options.StatsOptions(schema=schema)
  anomalies = validation_api.validate_instance(instance, options)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def test_validate_stats_with_previous_stats(self):
  """Drift between current and previous stats should raise an anomaly."""
  # Current dataset: labels "a" and "b" appear once each over 2 examples.
  statistics = text_format.Parse(
      """
      datasets {
        num_examples: 2
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats { num_non_missing: 2 num_missing: 0 max_num_values: 1 }
            rank_histogram {
              buckets { label: "a" sample_count: 1 }
              buckets { label: "b" sample_count: 1 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  # Previous dataset: a noticeably different label distribution (3:1),
  # exceeding the tight infinity_norm threshold below.
  previous_statistics = text_format.Parse(
      """
      datasets {
        num_examples: 4
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats { num_non_missing: 4 num_missing: 0 max_num_values: 1 }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  schema = text_format.Parse(
      """
      feature {
        name: "annotated_enum"
        type: BYTES
        domain: "annotated_enum"
        drift_comparator { infinity_norm { threshold: 0.01 } }
      }
      string_domain { name: "annotated_enum" value: "a" }
      """, schema_pb2.Schema())
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(self._annotated_enum_anomaly_info,
                            anomalies_pb2.AnomalyInfo())
  }
  # Validate the stats against the previous stats.
  anomalies = validation_api.validate_statistics(
      statistics, schema, previous_statistics=previous_statistics)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def test_validate_instance(self):
  """A single example with an out-of-domain enum value is flagged."""
  instance = {'annotated_enum': np.array(['D'])}
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 1 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  # "D" is not in MyAloneEnum, so the single example (~100%) is anomalous.
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D "
                "(~100%). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D "
                  "(~100%). "
              }
              """, anomalies_pb2.AnomalyInfo())
  }
  options = stats_options.StatsOptions(schema=schema)
  anomalies = validation_api.validate_instance(instance, options)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def test_validate_instance_environment(self):
  """Environment selection controls which features must be present.

  The 'label' feature is excluded from the SERVING environment, so its
  absence is an anomaly only when validating in TRAINING.
  """
  instance = {'feature': np.array(['A'])}
  schema = text_format.Parse(
      """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      feature {
        name: "feature"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  options = stats_options.StatsOptions(schema=schema)

  # Validate the instance in TRAINING environment: 'label' is required
  # there but missing from the instance.
  expected_anomalies_training = {
      'label':
          text_format.Parse(
              """
              description: "Column is completely missing"
              severity: ERROR
              short_description: "Column dropped"
              reason {
                type: SCHEMA_MISSING_COLUMN
                short_description: "Column dropped"
                description: "Column is completely missing"
              }
              """, anomalies_pb2.AnomalyInfo())
  }
  anomalies_training = validation_api.validate_instance(
      instance, options, environment='TRAINING')
  self._assert_equal_anomalies(anomalies_training,
                               expected_anomalies_training)

  # Validate the instance in SERVING environment: no anomalies expected.
  anomalies_serving = validation_api.validate_instance(
      instance, options, environment='SERVING')
  self._assert_equal_anomalies(anomalies_serving, {})
def test_validate_stats_with_environment(self):
  """Environment selection applies to statistics validation as well.

  'label' is absent from the stats; that is an anomaly in TRAINING (where
  it is required) but not in SERVING (where it is excluded via
  not_in_environment).
  """
  statistics = text_format.Parse(
      """
      datasets {
        num_examples: 1000
        features {
          name: 'feature'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 1000
              min_num_values: 1
              max_num_values: 1
            }
            unique: 3
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  schema = text_format.Parse(
      """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      feature {
        name: "feature"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_anomalies_training = {
      'label':
          text_format.Parse(
              """
              description: "Column is completely missing"
              severity: ERROR
              short_description: "Column dropped"
              reason {
                type: SCHEMA_MISSING_COLUMN
                short_description: "Column dropped"
                description: "Column is completely missing"
              }
              """, anomalies_pb2.AnomalyInfo())
  }

  # Validate the stats in TRAINING environment.
  anomalies_training = validation_api.validate_statistics(
      statistics, schema, environment='TRAINING')
  self._assert_equal_anomalies(anomalies_training,
                               expected_anomalies_training)

  # Validate the stats in SERVING environment.
  anomalies_serving = validation_api.validate_statistics(
      statistics, schema, environment='SERVING')
  self._assert_equal_anomalies(anomalies_serving, {})
def test_validate_stats_with_serving_stats(self):
  """Training/serving skew beyond the threshold should raise an anomaly."""
  # Training distribution for 'bar': a=1, b=2, c=7 over 10 examples.
  statistics = text_format.Parse(
      """
      datasets {
        num_examples: 10
        features {
          name: 'bar'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 1 }
              buckets { label: "b" sample_count: 2 }
              buckets { label: "c" sample_count: 7 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  # Serving distribution differs (a=3, b=1, c=6), exceeding the 0.1
  # infinity-norm skew threshold configured in the schema below.
  serving_statistics = text_format.Parse(
      """
      datasets {
        num_examples: 10
        features {
          name: 'bar'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 0
              num_non_missing: 10
              max_num_values: 1
            }
            rank_histogram {
              buckets { label: "a" sample_count: 3 }
              buckets { label: "b" sample_count: 1 }
              buckets { label: "c" sample_count: 6 }
            }
          }
        }
      }""", statistics_pb2.DatasetFeatureStatisticsList())
  schema = text_format.Parse(
      """
      feature {
        name: 'bar'
        type: BYTES
        skew_comparator { infinity_norm { threshold: 0.1} }
      }""", schema_pb2.Schema())
  expected_anomalies = {
      'bar':
          text_format.Parse(self._bar_anomaly_info,
                            anomalies_pb2.AnomalyInfo())
  }

  # Validate the stats against the serving stats.
  anomalies = validation_api.validate_statistics(
      statistics, schema, serving_statistics=serving_statistics)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def test_validate_stats(self):
  """Stats containing an out-of-domain enum value produce an anomaly."""
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 1 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  # The rank histogram reports label "D", which is not in MyAloneEnum.
  statistics = text_format.Parse(
      """
      datasets{
        num_examples: 10
        features {
          name: 'annotated_enum'
          type: STRING
          string_stats {
            common_stats {
              num_missing: 3
              num_non_missing: 7
              min_num_values: 1
              max_num_values: 1
            }
            unique: 3
            rank_histogram {
              buckets { label: "D" sample_count: 1 }
            }
          }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D (?). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D (?). "
              }
              """, anomalies_pb2.AnomalyInfo())
  }

  # Validate the stats.
  anomalies = validation_api.validate_statistics(statistics, schema)
  self._assert_equal_anomalies(anomalies, expected_anomalies)