def test_validate_stats_invalid_environment(self):
    """validate_statistics raises ValueError for an undeclared environment."""
    statistics = statistics_pb2.DatasetFeatureStatisticsList()
    statistics.datasets.extend([statistics_pb2.DatasetFeatureStatistics()])
    # Schema only declares TRAINING and SERVING environments.
    schema = text_format.Parse(
        """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
    # assertRaisesRegexp was deprecated in Python 3.2 and removed in 3.12;
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(ValueError,
                                'Environment.*not found in the schema.*'):
        _ = validation_api.validate_statistics(statistics,
                                               schema,
                                               environment='INVALID')
 def test_validate_stats_invalid_schema_input(self):
     statistics = statistics_pb2.DatasetFeatureStatisticsList()
     statistics.datasets.extend([statistics_pb2.DatasetFeatureStatistics()])
     with self.assertRaisesRegexp(TypeError,
                                  '.*should be a Schema proto.*'):
         _ = validation_api.validate_statistics(statistics, {})
 def test_validate_stats_invalid_statistics_input(self):
     schema = schema_pb2.Schema()
     with self.assertRaisesRegexp(TypeError, 'statistics is of type.*'):
         _ = validation_api.validate_statistics({}, schema)
    def test_validate_stats_with_previous_and_serving_stats(self):
        """Skew and drift comparators fire when baseline stats are supplied."""
        # 'bar' carries a skew comparator (checked against serving stats) and
        # 'annotated_enum' a drift comparator (checked against previous stats).
        schema = text_format.Parse(
            """
        feature {
          name: 'bar'
          type: BYTES
          skew_comparator { infinity_norm { threshold: 0.1 } }
        }
        feature {
          name: "annotated_enum"
          type: BYTES
          domain: "annotated_enum"
          drift_comparator { infinity_norm { threshold: 0.01 } }
        }
        string_domain { name: "annotated_enum" value: "a" }
        """, schema_pb2.Schema())

        stats = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 1 }
                buckets { label: "b" sample_count: 2 }
                buckets { label: "c" sample_count: 7 }
              }
            }
          }
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 1 }
                buckets { label: "b" sample_count: 1 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        prior_stats = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 10
                num_missing: 0
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
              }
            }
          }
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
                buckets { label: "c" sample_count: 6 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        serving_stats = text_format.Parse(
            """
        datasets {
          num_examples: 10
          features {
            name: 'bar'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 0
                num_non_missing: 10
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
                buckets { label: "c" sample_count: 6 }
              }
            }
          }
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 10
                num_missing: 0
                max_num_values: 1
              }
              rank_histogram {
                buckets { label: "a" sample_count: 3 }
                buckets { label: "b" sample_count: 1 }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        # Expected anomaly payloads come from fixtures shared by the class.
        want = {
            'bar':
            text_format.Parse(self._bar_anomaly_info,
                              anomalies_pb2.AnomalyInfo()),
            'annotated_enum':
            text_format.Parse(self._annotated_enum_anomaly_info,
                              anomalies_pb2.AnomalyInfo()),
        }

        # Validate against both baselines in one call.
        got = validation_api.validate_statistics(
            stats,
            schema,
            previous_statistics=prior_stats,
            serving_statistics=serving_stats)
        self._assert_equal_anomalies(got, want)
    def test_validate_stats_with_environment(self):
        """Anomalies depend on which schema environment is validated against."""
        # Schema: 'label' is excluded from SERVING via not_in_environment, but
        # required (presence.min_count: 1) everywhere else.
        schema = text_format.Parse(
            """
        default_environment: "TRAINING"
        default_environment: "SERVING"
        feature {
          name: "label"
          not_in_environment: "SERVING"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        feature {
          name: "feature"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        """, schema_pb2.Schema())

        # Stats contain only 'feature'; 'label' is absent.
        stats = text_format.Parse(
            """
        datasets {
          num_examples: 1000
          features {
            name: 'feature'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1000
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

        # In TRAINING the missing 'label' column is an anomaly.
        want_training = {
            'label':
            text_format.Parse(
                """
            description: "Column is completely missing"
            severity: ERROR
            short_description: "Column dropped"
            reason {
              type: SCHEMA_MISSING_COLUMN
              short_description: "Column dropped"
              description: "Column is completely missing"
            }
            """, anomalies_pb2.AnomalyInfo())
        }
        got_training = validation_api.validate_statistics(
            stats, schema, environment='TRAINING')
        self._assert_equal_anomalies(got_training, want_training)

        # In SERVING 'label' is not expected, so no anomalies are reported.
        got_serving = validation_api.validate_statistics(
            stats, schema, environment='SERVING')
        self._assert_equal_anomalies(got_serving, {})
    def test_validate_stats(self):
        """An out-of-domain string value is reported as an enum anomaly."""
        # Stats contain the value "D", which is absent from MyAloneEnum; the
        # DEPRECATED feature 'ignore_this' must not contribute anomalies.
        stats = text_format.Parse(
            """
        datasets{
          num_examples: 10
          features {
            name: 'annotated_enum'
            type: STRING
            string_stats {
              common_stats {
                num_missing: 3
                num_non_missing: 7
                min_num_values: 1
                max_num_values: 1
              }
              unique: 3
              rank_histogram {
                buckets {
                  label: "D"
                  sample_count: 1
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        schema = text_format.Parse(
            """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count {
            min:1
            max:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
          domain: "MyAloneEnum"
        }
        feature {
          name: "ignore_this"
          lifecycle_stage: DEPRECATED
          value_count {
            min:1
          }
          presence {
            min_count: 1
          }
          type: BYTES
        }
        """, schema_pb2.Schema())

        want = {
            'annotated_enum':
            text_format.Parse(
                """
      description: "Examples contain values missing from the schema: D (?). "
      severity: ERROR
      short_description: "Unexpected string values"
      reason {
        type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
        short_description: "Unexpected string values"
        description: "Examples contain values missing from the schema: D (?). "
      }
            """, anomalies_pb2.AnomalyInfo())
        }

        got = validation_api.validate_statistics(stats, schema)
        self._assert_equal_anomalies(got, want)