def test_e2e(self, stats_options, expected_stats_pbtxt,
                 expected_schema_pbtxt):
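        # Read TFRecords of serialized tf.SequenceExamples through TFXIO;
        # the list argument supplies telemetry descriptors.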
        tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
            self._input_file, ['tfdv', 'test'])
        stats_file = os.path.join(self._output_dir, 'stats')
        with beam.Pipeline() as p:
            _ = (p
                 | 'TFXIORead' >> tfxio.BeamSource()
                 | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
                 | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))

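        # Read the written statistics back and compare them to the expected proto.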
        actual_stats = tfdv.load_statistics(stats_file)
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self,
            text_format.Parse(
                expected_stats_pbtxt,
                statistics_pb2.DatasetFeatureStatisticsList()))([actual_stats])
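        # A schema inferred from those statistics should match the expected schema.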
        actual_schema = tfdv.infer_schema(actual_stats,
                                          infer_feature_shape=True)

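        # Clear generate_legacy_feature_spec when present so the comparison
        # does not depend on the schema proto version.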
        if hasattr(actual_schema, 'generate_legacy_feature_spec'):
            actual_schema.ClearField('generate_legacy_feature_spec')
        self._assert_schema_equal(
            actual_schema,
            text_format.Parse(expected_schema_pbtxt, schema_pb2.Schema()))
    def test_stats_pipeline_with_sample_count(self):
        # input with three examples.
        examples = [{
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }, {
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }, {
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }]

        with beam.Pipeline() as p:
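            # Compute statistics over a one-example sample; the small bucket
            # counts keep the expected proto compact.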
            options = stats_options.StatsOptions(
                sample_count=1,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
  def test_validate_examples_in_csv_with_examples(self):
    data_location, _, options, expected_result = (
        self._get_anomalous_csv_test(
            delimiter=',',
            output_column_names=False,
            generate_single_file=True,
            has_schema=True))

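    # Validate against the schema, sampling up to 99 examples per detected anomaly.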
    result, sampled_examples = validation_lib.validate_examples_in_csv(
        data_location=data_location,
        stats_options=options,
        column_names=None,
        delimiter=',',
        num_sampled_examples=99)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
    self.assertCountEqual([
        'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES',
    ], sampled_examples.keys())
    got_df = sampled_examples[
        'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES']
    expected_df = pd.DataFrame.from_records(
        [['D', 1]], columns=['annotated_enum', 'other_feature'])
    expected_df['annotated_enum'] = expected_df['annotated_enum'].astype(bytes)
    # We can't be too picky about dtypes; try to coerce to expected types.
    for col in got_df.columns:
      if col in expected_df.columns:
        got_df[col] = got_df[col].astype(expected_df[col].dtype)
    self.assertTrue(expected_df.equals(got_df))
    def test_stats_pipeline_with_sample_count(self):
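        # Three single-row record batches; each row carries 3000 values of feature 'c'.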
        record_batches = [
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
        ]

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                sample_count=3000,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001,
                desired_batch_size=3000)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
  def test_generate_stats_impl(self):
    # input with two batches: first batch has two examples and second batch
    # has a single example.
    batches = [{'a': np.array([np.array(['xyz']), np.array(['qwe'])])},
               {'a': np.array([np.array(['ab'])])}]

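    # Run a custom string-stats generator alongside a uniques generator;
    # their per-feature outputs are merged into a single statistics proto.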
    generator1 = string_stats_generator.StringStatsGenerator()
    generator2 = uniques_stats_generator.UniquesStatsGenerator()

    expected_result = text_format.Parse(
        """
        datasets {
          features {
            name: 'a'
            type: STRING
            string_stats {
              avg_length: 2.66666666
              unique: 3
            }

          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

    with beam.Pipeline() as p:
      result = (p | beam.Create(batches) |
                stats_impl.GenerateStatisticsImpl(
                    generators=[generator1, generator2]))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
  def test_stats_gen_with_csv_tab_delimiter_no_header_in_file(self):
    records, header, expected_result = self._get_csv_test(delimiter='\t',
                                                          with_header=False)
    input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                                 'input_data.tsv')

    result = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=header,
        delimiter='\t',
        stats_options=self._default_stats_options)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
    def test_merges_two_shards(self):
        stats1 = text_format.Parse(
            """
      datasets {
        name: 'x'
        num_examples: 100
        features: {
           path: {
              step: "f1"
           }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
        stats2 = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        stats_combined = text_format.Parse(
            """
        datasets {
          name: 'x'
          num_examples: 100
          features: {
             path: {
                step: "f1"
             }
          }
          features: {
             path: {
                step: "f2"
             }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
        with beam.Pipeline() as p:
            result = (p | beam.Create([stats1, stats2])
                      | stats_api.MergeDatasetFeatureStatisticsList())
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, stats_combined))
  def test_empty_input(self):
    examples = []
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      result = p | beam.Create(examples) | stats_api.GenerateStatistics(
          stats_options.StatsOptions())
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
    def test_validate_examples_in_csv_multiple_files(self):
        data_location, column_names, options, expected_result = (
            self._get_anomalous_csv_test(delimiter=',',
                                         output_column_names=True,
                                         generate_single_file=False,
                                         has_schema=True))

        result = validation_lib.validate_examples_in_csv(
            data_location=data_location,
            stats_options=options,
            column_names=column_names,
            delimiter=',')
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])
  def test_stats_pipeline_with_zero_examples(self):
    expected_result = statistics_pb2.DatasetFeatureStatisticsList()
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (p | beam.Create([])
                | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
  def test_stats_impl(self,
                      examples,
                      options,
                      expected_result_proto_text,
                      schema=None):
    expected_result = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    if schema is not None:
      options.schema = schema
    with beam.Pipeline() as p:
      result = (p | beam.Create(examples)
                | stats_impl.GenerateStatisticsImpl(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
    def test_stats_gen_with_csv_missing_column(self):
        records = [',', ',']
        input_data_path = self._write_records_to_csv(records,
                                                     self._get_temp_dir(),
                                                     'input_data.csv')
        expected_result = text_format.Parse(
            """
        datasets {
          num_examples: 2
          features {
            path {
              step: "feature1"
            }
            type: STRING
            string_stats {
              common_stats {
                num_missing: 2
              }
            }
          }
          features {
            path {
              step: "feature2"
            }
            type: STRING
            string_stats {
              common_stats {
                num_missing: 2
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

        result = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_data_path,
            column_names=['feature1', 'feature2'],
            delimiter=',',
            stats_options=self._default_stats_options)
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])
    def test_stats_gen_with_csv_header_in_multiple_files(self):
        records, _, expected_result = self._get_csv_test(delimiter=',',
                                                         with_header=True)
        header = records.pop(0)
        # Split the records into two subsets and write to separate files.
        records1 = [header] + records[0:3]
        records2 = [header] + records[3:]
        tmp_dir = self._get_temp_dir()
        self._write_records_to_csv(records1, tmp_dir, 'input_data1.csv')
        self._write_records_to_csv(records2, tmp_dir, 'input_data2.csv')
        input_data_path = os.path.join(tmp_dir, 'input_data*')

        result = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_data_path,
            column_names=None,
            delimiter=',',
            stats_options=self._default_stats_options)
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])
  def test_stats_gen_with_csv_no_header_in_file(self, compression_type):
    records, header, expected_result = self._get_csv_test(delimiter=',',
                                                          with_header=False)
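    # Map the Beam compression type to the value expected by the CSV writer helper.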
    compression_type_lookup = {
        CompressionTypes.AUTO: '',
        CompressionTypes.GZIP: 'gzip'
    }
    input_data_path = self._write_records_to_csv(
        records, self._get_temp_dir(), 'input_data.csv',
        compression_type=compression_type_lookup[compression_type])

    result = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=header,
        delimiter=',',
        stats_options=self._default_stats_options,
        compression_type=compression_type)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
    def test_stats_gen_with_csv_with_schema(self):
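        # The first record is the header row; the single data row holds the value '1'.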
        records = ['feature1', '1']
        input_data_path = self._write_records_to_csv(records,
                                                     self._get_temp_dir(),
                                                     'input_data.csv')
        schema = text_format.Parse(
            """
        feature { name: "feature1" type: BYTES }
        """, schema_pb2.Schema())

        expected_result = text_format.Parse(
            """
    datasets {
  num_examples: 1
  features {
    path {
      step: "feature1"
    }
    type: STRING
    string_stats {
      common_stats {
        num_non_missing: 1
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 0.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 0.5
          }
          type: QUANTILES
        }
        tot_num_values: 1
      }
      unique: 1
      top_values {
        value: "1"
        frequency: 1.0
      }
      avg_length: 1.0
      rank_histogram {
        buckets {
          label: "1"
          sample_count: 1.0
        }
      }
    }
  }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        self._default_stats_options.schema = schema
        self._default_stats_options.infer_type_from_schema = True
        result = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_data_path,
            delimiter=',',
            stats_options=self._default_stats_options)
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])
    def test_stats_gen_with_tfrecords_of_tfexamples(self, compression_type):
        examples = [
            self._make_example({
                'a': ('float', [1.0, 2.0]),
                'b': ('bytes', [b'a', b'b', b'c', b'e'])
            }),
            self._make_example({
                'a': ('float', [3.0, 4.0, float('nan'), 5.0]),
                'b': ('bytes', [b'a', b'c', b'd', b'a'])
            }),
            self._make_example({
                'a': ('float', [1.0]),
                'b': ('bytes', [b'a', b'b', b'c', b'd'])
            })
        ]
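        # Map the Beam compression type to the matching TFRecord writer
        # compression option.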
        tf_compression_lookup = {
            CompressionTypes.AUTO:
            tf.compat.v1.python_io.TFRecordCompressionType.NONE,
            CompressionTypes.GZIP:
            tf.compat.v1.python_io.TFRecordCompressionType.GZIP
        }
        input_data_path = self._write_tfexamples_to_tfrecords(
            examples, tf_compression_lookup[compression_type])

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 3
      features {
        path {
          step: "a"
        }
        type: FLOAT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 1
            max_num_values: 4
            avg_num_values: 2.33333333
            tot_num_values: 7
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 4.0
                sample_count: 1.5
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.5
              }
              type: QUANTILES
            }
          }
          mean: 2.66666666
          std_dev: 1.49071198
          num_zeros: 0
          min: 1.0
          max: 5.0
          median: 3.0
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 3.0
              sample_count: 3.0
            }
            buckets {
              low_value: 3.0
              high_value: 5.0
              sample_count: 3.0
            }
            type: STANDARD
          }
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 3.0
              sample_count: 3.0
            }
            buckets {
              low_value: 3.0
              high_value: 5.0
              sample_count: 3.0
            }
            type: QUANTILES
          }
        }
      }
      features {
        path {
          step: "b"
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            min_num_values: 4
            max_num_values: 4
            avg_num_values: 4.0
            tot_num_values: 12
            num_values_histogram {
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.5
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.5
              }
              type: QUANTILES
            }
          }
          unique: 5
          top_values {
            value: "a"
            frequency: 4.0
          }
          top_values {
            value: "c"
            frequency: 3.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        result = stats_gen_lib.generate_statistics_from_tfrecord(
            data_location=input_data_path,
            stats_options=self._default_stats_options,
            compression_type=compression_type)
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])
  def test_stats_pipeline_with_examples_with_no_values(self):
    examples = [{'a': np.array([], dtype=np.float64),
                 'b': np.array([], dtype=object),
                 'c': np.array([], dtype=np.int32),
                 'w': np.array([2])},
                {'a': np.array([], dtype=np.float64),
                 'b': np.array([], dtype=object),
                 'c': np.array([], dtype=np.int32),
                 'w': np.array([2])},
                {'a': np.array([], dtype=np.float64),
                 'b': np.array([], dtype=object),
                 'c': np.array([], dtype=np.int32),
                 'w': np.array([2])}]
   expected_result = text_format.Parse(
       """
     datasets{
       num_examples: 3
       features {
         name: 'a'
         type: FLOAT
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
       features {
         name: 'b'
         type: STRING
         string_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
       features {
         name: 'c'
         type: INT
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
     }
   """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          weight_feature='w',
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (
          p | beam.Create(examples) | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))
    def test_stats_pipeline_with_examples_with_no_values(self):
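        # Every batch has empty value lists for 'a', 'b', 'c' and weight 2 in 'w'.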
        record_batches = [
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w'])
        ]

        expected_result = text_format.Parse(
            """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'w'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 0
              min_num_values: 1
              max_num_values: 1
              avg_num_values: 1.0
              tot_num_values: 3
              num_values_histogram {
                buckets {
                  low_value: 1.0
                  high_value: 1.0
                  sample_count: 1.5
                }
                buckets {
                  low_value: 1.0
                  high_value: 1.0
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6.0
                avg_num_values: 1.0
                tot_num_values: 6.0
              }
            }
            mean: 2.0
            std_dev: 0.0
            min: 2.0
            max: 2.0
            median: 2.0
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 3.0
              }
              type: STANDARD
            }
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 3.0
              }
              type: QUANTILES
            }
            weighted_numeric_stats {
              mean: 2.0
              median: 2.0
              histograms {
                buckets {
                  low_value: 2.0
                  high_value: 2.0
                  sample_count: 6.0
                }
                type: STANDARD
              }
              histograms {
                buckets {
                  low_value: 2.0
                  high_value: 2.0
                  sample_count: 6.0
                }
                type: QUANTILES
              }
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=1,
                num_rank_histogram_buckets=1,
                num_values_histogram_buckets=2,
                num_histogram_buckets=1,
                num_quantiles_histogram_buckets=1,
                epsilon=0.001)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_stats_pipeline(self):
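        # Feature 'a' contains a NaN, which shows up as num_nan in the
        # expected histograms.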
        record_batches = [
            pa.RecordBatch.from_arrays([
                pa.array([[1.0, 2.0]]),
                pa.array([['a', 'b', 'c', 'e']]),
                pa.array([np.linspace(1, 500, 500, dtype=np.int32)]),
            ], ['a', 'b', 'c']),
            pa.RecordBatch.from_arrays([
                pa.array([[3.0, 4.0, np.nan, 5.0]]),
                pa.array([['a', 'c', 'd', 'a']]),
                pa.array([np.linspace(501, 1250, 750, dtype=np.int32)]),
            ], ['a', 'b', 'c']),
            pa.RecordBatch.from_arrays([
                pa.array([[1.0]]),
                pa.array([['a', 'b', 'c', 'd']]),
                pa.array([np.linspace(1251, 3000, 1750, dtype=np.int32)]),
            ], ['a', 'b', 'c'])
        ]

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 3
      features {
        path {
          step: 'a'
        }
        type: FLOAT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 1
            max_num_values: 4
            avg_num_values: 2.33333333
            tot_num_values: 7
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.0
              }
              buckets {
                low_value: 1.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          mean: 2.66666666
          std_dev: 1.49071198
          num_zeros: 0
          min: 1.0
          max: 5.0
          median: 3.0
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 2.3333333
              sample_count: 2.9866667
            }
            buckets {
              low_value: 2.3333333
              high_value: 3.6666667
              sample_count: 1.0066667
            }
            buckets {
              low_value: 3.6666667
              high_value: 5.0
              sample_count: 2.0066667
            }
            type: STANDARD
          }
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 1.0
              sample_count: 1.5
            }
            buckets {
              low_value: 1.0
              high_value: 3.0
              sample_count: 1.5
            }
            buckets {
              low_value: 3.0
              high_value: 4.0
              sample_count: 1.5
            }
            buckets {
              low_value: 4.0
              high_value: 5.0
              sample_count: 1.5
            }
            type: QUANTILES
          }
        }
      }
      features {
        path {
          step: 'c'
        }
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 500
            max_num_values: 1750
            avg_num_values: 1000.0
            tot_num_values: 3000
            num_values_histogram {
              buckets {
                low_value: 500.0
                high_value: 500.0
                sample_count: 1.0
              }
              buckets {
                low_value: 500.0
                high_value: 1750.0
                sample_count: 1.0
              }
              buckets {
                low_value: 1750.0
                high_value: 1750.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          mean: 1500.5
          std_dev: 866.025355672
          min: 1.0
          max: 3000.0
          median: 1501.0
          histograms {
            buckets {
              low_value: 1.0
              high_value: 1000.66666667
              sample_count: 999.666666667
            }
            buckets {
              low_value: 1000.66666667
              high_value: 2000.33333333
              sample_count: 999.666666667
            }
            buckets {
              low_value: 2000.33333333
              high_value: 3000.0
              sample_count: 1000.66666667
            }
            type: STANDARD
          }
          histograms {
            buckets {
              low_value: 1.0
              high_value: 751.0
              sample_count: 750.0
            }
            buckets {
              low_value: 751.0
              high_value: 1501.0
              sample_count: 750.0
            }
            buckets {
              low_value: 1501.0
              high_value: 2250.0
              sample_count: 750.0
            }
            buckets {
              low_value: 2250.0
              high_value: 3000.0
              sample_count: 750.0
            }
            type: QUANTILES
          }
        }
      }
      features {
        path {
          step: 'b'
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            min_num_values: 4
            max_num_values: 4
            avg_num_values: 4.0
            tot_num_values: 12
            num_values_histogram {
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          unique: 5
          top_values {
            value: "a"
            frequency: 4.0
          }
          top_values {
            value: "c"
            frequency: 3.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4,
                epsilon=0.001)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
  def test_validate_examples_in_tfrecord(self, num_sampled_examples):
    input_examples = [
        # This example is anomalous because its feature contains a value that is
        # not in the string_domain specified in the schema.
        """
          features {
              feature {
                key: 'annotated_enum'
                value { bytes_list { value: [ 'D' ] } }
              }
          }
        """,
        # This example is anomalous because it contains a feature that is not
        # in the schema.
        """
          features {
              feature {
                key: 'annotated_enum'
                value { bytes_list { value: [ 'A' ] } }
              }
              feature {
                key: 'unknown_feature'
                value { bytes_list { value: [ 'A' ] } }
              }
          }
        """,
    ]
    schema = text_format.Parse(
        """
              string_domain {
                name: "MyAloneEnum"
                value: "A"
                value: "B"
                value: "C"
              }
              feature {
                name: "annotated_enum"
                value_count {
                  min:1
                  max:1
                }
                presence {
                  min_count: 1
                }
                type: BYTES
                domain: "MyAloneEnum"
              }
              """, schema_pb2.Schema())
    options = stats_options.StatsOptions(
        schema=schema,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2)

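    # Write the anomalous examples to a TFRecord file for the validator to read.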
    temp_dir_path = self.create_tempdir().full_path
    input_data_path = os.path.join(temp_dir_path, 'input_data.tfrecord')
    with tf.io.TFRecordWriter(input_data_path) as writer:
      for example in input_examples:
        example = text_format.Parse(example, tf.train.Example())
        writer.write(example.SerializeToString())

    expected_result = text_format.Parse(
        """
    datasets {
      name: 'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES'
      num_examples: 1
      features {
        path: {
          step: 'annotated_enum'
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            min_num_values: 1
            max_num_values: 1
            avg_num_values: 1.0
            tot_num_values: 1
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
          unique: 1
          top_values {
            value: "D"
            frequency: 1.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              label: "D"
              sample_count: 1.0
            }
          }
        }
      }
    }
    datasets {
      name: 'unknown_feature_SCHEMA_NEW_COLUMN'
      num_examples: 1
      features {
        path: {
          step: 'unknown_feature'
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            min_num_values: 1
            max_num_values: 1
            avg_num_values: 1.0
            tot_num_values: 1
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
          unique: 1
          top_values {
            value: "A"
            frequency: 1.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              label: "A"
              sample_count: 1.0
            }
          }
        }
      }
      features {
        path: {
          step: 'annotated_enum'
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            min_num_values: 1
            max_num_values: 1
            avg_num_values: 1.0
            tot_num_values: 1
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
          unique: 1
          top_values {
            value: "A"
            frequency: 1.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              label: "A"
              sample_count: 1.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

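    # With num_sampled_examples set, validation returns a
    # (statistics, sampled examples) pair.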
    actual_result = validation_lib.validate_examples_in_tfrecord(
        data_location=input_data_path,
        stats_options=options,
        num_sampled_examples=num_sampled_examples)
    if num_sampled_examples:
      actual_result, sampled_examples = actual_result
      self.assertCountEqual(
          [('annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES',
            [text_format.Parse(input_examples[0], tf.train.Example())]),
           ('unknown_feature_SCHEMA_NEW_COLUMN',
            [text_format.Parse(input_examples[1], tf.train.Example())])],
          sampled_examples.items())
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([actual_result])
    def test_stats_pipeline_with_examples_with_no_values(self):
        tables = [
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w'])
        ]

        expected_result = text_format.Parse(
            """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatisticsList())
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=1,
                num_rank_histogram_buckets=1,
                num_values_histogram_buckets=2,
                num_histogram_buckets=1,
                num_quantiles_histogram_buckets=1,
                epsilon=0.001)
            result = (p | beam.Create(tables)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_stats_pipeline_with_feature_whitelist(self):
        # input with three examples.
        examples = [{
            'a': np.array([1.0, 2.0]),
            'b': np.array(['a', 'b', 'c', 'e']),
            'c': np.linspace(1, 500, 500, dtype=np.int32)
        }, {
            'a': np.array([3.0, 4.0, np.nan, 5.0]),
            'b': np.array(['a', 'c', 'd', 'a']),
            'c': np.linspace(501, 1250, 750, dtype=np.int32)
        }, {
            'a': np.array([1.0]),
            'b': np.array(['a', 'b', 'c', 'd']),
            'c': np.linspace(1251, 3000, 1750, dtype=np.int32)
        }]

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 3
      features {
        name: "b"
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            min_num_values: 4
            max_num_values: 4
            avg_num_values: 4.0
            tot_num_values: 12
            num_values_histogram {
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          unique: 5
          top_values {
            value: "a"
            frequency: 4.0
          }
          top_values {
            value: "c"
            frequency: 3.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                feature_whitelist=['b'],
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_custom_generators(self):

        # Dummy PTransform that returns two DatasetFeatureStatistics protos.
        class CustomPTransform(beam.PTransform):
            def expand(self, pcoll):
                stats_proto1 = statistics_pb2.DatasetFeatureStatistics()
                proto1_feat = stats_proto1.features.add()
                proto1_feat.name = 'a'
                custom_stat1 = proto1_feat.custom_stats.add()
                custom_stat1.name = 'my_stat_a'
                custom_stat1.str = 'my_val_a'

                stats_proto2 = statistics_pb2.DatasetFeatureStatistics()
                proto2_feat = stats_proto2.features.add()
                proto2_feat.name = 'b'
                custom_stat2 = proto2_feat.custom_stats.add()
                custom_stat2.name = 'my_stat_b'
                custom_stat2.str = 'my_val_b'
                return [stats_proto1, stats_proto2]

        examples = [{
            'a': np.array([], dtype=np.int32),
            'b': np.array([], dtype=np.int32)
        }]
        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 1
      features {
        name: 'a'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_a'
          str: 'my_val_a'
        }
      }
      features {
        name: 'b'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_b'
          str: 'my_val_b'
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        # Create a transform stats generator.
        transform_stats_gen = stats_generator.TransformStatsGenerator(
            name='CustomStatsGenerator', ptransform=CustomPTransform())
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                generators=[transform_stats_gen],
                num_values_histogram_buckets=2)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_stats_pipeline_with_weight_feature(self):
        # input with four examples.
        examples = [{
            'a': np.array([1.0, 2.0]),
            'b': np.array(['a', 'b', 'c', 'e']),
            'w': np.array([1.0])
        }, {
            'a': np.array([3.0, 4.0, 5.0]),
            'b': None,
            'w': np.array([2.0])
        }, {
            'a': np.array([
                1.0,
            ]),
            'b': np.array(['d', 'e']),
            'w': np.array([
                3.0,
            ])
        }, {
            'a': None,
            'b': np.array(['a', 'c', 'd', 'a']),
            'w': np.array([1.0])
        }]

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 4
      features {
        name: 'a'
        type: FLOAT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 1
            min_num_values: 1
            max_num_values: 3
            avg_num_values: 2.0
            tot_num_values: 6
            num_values_histogram {
              buckets { low_value: 1.0 high_value: 2.0 sample_count: 1 }
              buckets { low_value: 2.0 high_value: 3.0 sample_count: 1 }
              buckets { low_value: 3.0 high_value: 3.0 sample_count: 1 }
              type: QUANTILES
            }
            weighted_common_stats {
              num_non_missing: 6.0
              num_missing: 1.0
              avg_num_values: 1.83333333
              tot_num_values: 11.0
            }
          }
          mean: 2.66666666
          std_dev: 1.49071198
          num_zeros: 0
          min: 1.0
          max: 5.0
          median: 3.0
          histograms {
            buckets {
              low_value: 1.0
              high_value: 2.3333333
              sample_count: 2.9866667
            }
            buckets {
              low_value: 2.3333333
              high_value: 3.6666667
              sample_count: 1.0066667
            }
            buckets {
              low_value: 3.6666667
              high_value: 5.0
              sample_count: 2.0066667
            }
            type: STANDARD
          }
          histograms {
            buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
            buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.5 }
            buckets { low_value: 3.0 high_value: 4.0 sample_count: 1.5 }
            buckets { low_value: 4.0 high_value: 5.0 sample_count: 1.5 }
            type: QUANTILES
          }
          weighted_numeric_stats {
            mean: 2.7272727
            std_dev: 1.5427784
            median: 3.0
            histograms {
              buckets {
                low_value: 1.0
                high_value: 2.3333333
                sample_count: 4.9988889
              }
              buckets {
                low_value: 2.3333333
                high_value: 3.6666667
                sample_count: 1.9922222
              }
              buckets {
                low_value: 3.6666667
                high_value: 5.0
                sample_count: 4.0088889
              }
            }
            histograms {
              buckets { low_value: 1.0 high_value: 1.0 sample_count: 2.75 }
              buckets { low_value: 1.0 high_value: 3.0 sample_count: 2.75 }
              buckets { low_value: 3.0 high_value: 4.0 sample_count: 2.75 }
              buckets { low_value: 4.0 high_value: 5.0 sample_count: 2.75 }
              type: QUANTILES
            }
          }
        }
      }
      features {
        name: 'b'
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 1
            min_num_values: 2
            max_num_values: 4
            avg_num_values: 3.33333301544
            num_values_histogram {
              buckets { low_value: 2.0 high_value: 4.0 sample_count: 1.0 }
              buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
              buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
              type: QUANTILES
            }
            weighted_common_stats {
              num_non_missing: 5.0
              num_missing: 2.0
              avg_num_values: 2.8
              tot_num_values: 14.0
            }
            tot_num_values: 10
          }
          avg_length: 1.0
          unique: 5
          top_values { value: 'a' frequency: 3.0 }
          top_values { value: 'e' frequency: 2.0 }
          rank_histogram {
            buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
            buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 }
            buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
          }
          weighted_string_stats {
            top_values { value: 'e' frequency: 4.0 }
            top_values { value: 'd' frequency: 4.0 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 4.0 }
              buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 4.0 }
              buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 3.0 }
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
  def test_stats_pipeline_with_schema(self):
    # input with three examples.
    examples = [{
        'a': np.array([1, 3, 5, 7])
    }, {
        'a': np.array([2, 4, 6, 8])
    }, {
        'a': np.array([0, 3, 6, 9])
    }]
    schema = text_format.Parse(
        """
     feature {
       name: "a"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
    expected_result = text_format.Parse(
        """
 datasets {
   num_examples: 3
   features {
     name: "a"
     type: INT
     string_stats {
       common_stats {
         num_non_missing: 3
         min_num_values: 4
         max_num_values: 4
         avg_num_values: 4.0
         tot_num_values: 12
         num_values_histogram {
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           type: QUANTILES
         }
       }
       unique: 10
       top_values {
         value: "6"
         frequency: 2.0
       }
       top_values {
         value: "3"
         frequency: 2.0
       }
       avg_length: 1.0
       rank_histogram {
         buckets {
           low_rank: 0
           high_rank: 0
           label: "6"
           sample_count: 2.0
         }
         buckets {
           low_rank: 1
           high_rank: 1
           label: "3"
           sample_count: 2.0
         }
         buckets {
           low_rank: 2
           high_rank: 2
           label: "9"
           sample_count: 1.0
         }
       }
     }
   }
 }
 """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          schema=schema,
          num_top_values=2,
          num_rank_histogram_buckets=3,
          num_values_histogram_buckets=3)
      result = (p | beam.Create(examples)
                | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result))