def test_e2e(self, stats_options, expected_stats_pbtxt, expected_schema_pbtxt):
    """End-to-end check: read SequenceExamples via TFXIO, generate statistics,
    write/reload them, then infer a schema and compare both against expected
    pbtxt payloads supplied by the parameterized test.

    Args:
      stats_options: `tfdv.StatsOptions` used for statistics generation.
      expected_stats_pbtxt: text-format `DatasetFeatureStatisticsList`.
      expected_schema_pbtxt: text-format `Schema`.
    """
    tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
        self._input_file, ['tfdv', 'test'])
    stats_file = os.path.join(self._output_dir, 'stats')
    # Generate stats and persist them to TFRecord; the pipeline runs to
    # completion when the `with` block exits.
    with beam.Pipeline() as p:
        _ = (p
             | 'TFXIORead' >> tfxio.BeamSource()
             | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
             | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))
    actual_stats = tfdv.load_statistics(stats_file)
    test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self,
        text_format.Parse(
            expected_stats_pbtxt,
            statistics_pb2.DatasetFeatureStatisticsList()))([actual_stats])
    actual_schema = tfdv.infer_schema(actual_stats, infer_feature_shape=True)
    # Clear the legacy field (when present in this schema_pb2 version) so the
    # comparison is insensitive to it.
    if hasattr(actual_schema, 'generate_legacy_feature_spec'):
        actual_schema.ClearField('generate_legacy_feature_spec')
    self._assert_schema_equal(
        actual_schema,
        text_format.Parse(expected_schema_pbtxt, schema_pb2.Schema()))
def test_stats_pipeline_with_sample_count(self):
    """Statistics generation with sampling limited to a single example."""
    # Three identical input examples; sample_count=1 keeps only one of them.
    examples = [
        {'c': np.linspace(1, 3000, 3000, dtype=np.int32)} for _ in range(3)
    ]
    opts = stats_options.StatsOptions(
        sample_count=1,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create(examples)
            | stats_api.GenerateStatistics(opts))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, self._sampling_test_expected_result))
def test_validate_examples_in_csv_with_examples(self):
    """Validates anomalous CSV input and checks the sampled anomalous rows."""
    data_location, _, options, expected_result = (
        self._get_anomalous_csv_test(
            delimiter=',',
            output_column_names=False,
            generate_single_file=True,
            has_schema=True))
    # Request sampled examples alongside the validation statistics.
    result, sampled_examples = validation_lib.validate_examples_in_csv(
        data_location=data_location,
        stats_options=options,
        column_names=None,
        delimiter=',',
        num_sampled_examples=99)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
    # Only the string-domain anomaly should have sampled examples.
    self.assertCountEqual([
        'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES',
    ], sampled_examples.keys())
    got_df = sampled_examples[
        'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES']
    expected_df = pd.DataFrame.from_records(
        [['D', 1]], columns=['annotated_enum', 'other_feature'])
    expected_df['annotated_enum'] = expected_df['annotated_enum'].astype(bytes)
    # We can't be too picky about dtypes; try to coerce to expected types.
    for col in got_df.columns:
        if col in expected_df.columns:
            got_df[col] = got_df[col].astype(expected_df[col].dtype)
    self.assertTrue(expected_df.equals(got_df))
def test_stats_pipeline_with_sample_count(self):
    """Sampling pipeline over Arrow record batches (sample_count == 3000)."""

    def _batch():
        # One single-row batch whose 'c' column holds 3000 int32 values.
        return pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c'])

    record_batches = [_batch(), _batch(), _batch()]
    opts = stats_options.StatsOptions(
        sample_count=3000,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001,
        desired_batch_size=3000)
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create(record_batches)
            | stats_api.GenerateStatistics(opts))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, self._sampling_test_expected_result))
def test_generate_stats_impl(self):
    """GenerateStatisticsImpl with explicit string/uniques generators."""
    # Two input batches: the first carries two examples, the second one.
    first_batch = {'a': np.array([np.array(['xyz']), np.array(['qwe'])])}
    second_batch = {'a': np.array([np.array(['ab'])])}
    expected_result = text_format.Parse(
        """
        datasets {
          features {
            name: 'a'
            type: STRING
            string_stats {
              avg_length: 2.66666666
              unique: 3
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create([first_batch, second_batch])
            | stats_impl.GenerateStatisticsImpl(generators=[
                string_stats_generator.StringStatsGenerator(),
                uniques_stats_generator.UniquesStatsGenerator(),
            ]))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_gen_with_csv_tab_delimiter_no_header_in_file(self):
    """Headerless TSV input with column names passed explicitly."""
    records, header, expected_result = self._get_csv_test(
        delimiter='\t', with_header=False)
    input_data_path = self._write_records_to_csv(
        records, self._get_temp_dir(), 'input_data.tsv')
    actual = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=header,
        delimiter='\t',
        stats_options=self._default_stats_options)
    assert_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    assert_fn([actual])
def test_merges_two_shards(self):
    """Two shards of the same dataset merge into a single feature list."""

    def _parse_stats(pbtxt):
        # Small local helper: text format -> DatasetFeatureStatisticsList.
        return text_format.Parse(
            pbtxt, statistics_pb2.DatasetFeatureStatisticsList())

    shard_one = _parse_stats("""
        datasets {
          name: 'x'
          num_examples: 100
          features: { path: { step: "f1" } }
        }
        """)
    shard_two = _parse_stats("""
        datasets {
          name: 'x'
          num_examples: 100
          features: { path: { step: "f2" } }
        }
        """)
    expected_merged = _parse_stats("""
        datasets {
          name: 'x'
          num_examples: 100
          features: { path: { step: "f1" } }
          features: { path: { step: "f2" } }
        }
        """)
    with beam.Pipeline() as pipeline:
        merged = (
            pipeline
            | beam.Create([shard_one, shard_two])
            | stats_api.MergeDatasetFeatureStatisticsList())
        util.assert_that(
            merged,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_merged))
def test_empty_input(self):
    """An empty input collection still yields a zero-example dataset proto."""
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create([])
            | stats_api.GenerateStatistics(stats_options.StatsOptions()))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_validate_examples_in_csv_multiple_files(self):
    """Validation works when the anomalous CSV input spans several files."""
    data_location, column_names, options, expected_result = (
        self._get_anomalous_csv_test(
            delimiter=',',
            output_column_names=True,
            generate_single_file=False,
            has_schema=True))
    actual = validation_lib.validate_examples_in_csv(
        data_location=data_location,
        stats_options=options,
        column_names=column_names,
        delimiter=',')
    assert_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    assert_fn([actual])
def test_stats_pipeline_with_zero_examples(self):
    """Zero input records produce an empty DatasetFeatureStatisticsList."""
    expected_result = statistics_pb2.DatasetFeatureStatisticsList()
    opts = stats_options.StatsOptions(
        num_top_values=1,
        num_rank_histogram_buckets=1,
        num_values_histogram_buckets=2,
        num_histogram_buckets=1,
        num_quantiles_histogram_buckets=1,
        epsilon=0.001)
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create([])
            | stats_api.GenerateStatistics(opts))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_impl(self, examples, options, expected_result_proto_text,
                    schema=None):
    """Parameterized check of GenerateStatisticsImpl against expected stats.

    Args:
      examples: input examples fed to the pipeline.
      options: `StatsOptions` under test.
      expected_result_proto_text: expected stats in text format.
      schema: optional schema; attached to `options` when provided.
    """
    expected_result = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    if schema is not None:
        options.schema = schema
    with beam.Pipeline() as pipeline:
        stats = (
            pipeline
            | beam.Create(examples)
            | stats_impl.GenerateStatisticsImpl(options))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_gen_with_csv_missing_column(self):
    """CSV rows with only empty fields are counted as missing per column."""
    # Two rows, each with two empty comma-separated fields.
    records = [',', ',']
    input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                                 'input_data.csv')
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 2
          features {
            path { step: "feature1" }
            type: STRING
            string_stats { common_stats { num_missing: 2 } }
          }
          features {
            path { step: "feature2" }
            type: STRING
            string_stats { common_stats { num_missing: 2 } }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    result = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=['feature1', 'feature2'],
        delimiter=',',
        stats_options=self._default_stats_options)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
def test_stats_gen_with_csv_header_in_multiple_files(self):
    """Column names are read from the header repeated in every CSV shard."""
    records, _, expected_result = self._get_csv_test(
        delimiter=',', with_header=True)
    header = records.pop(0)
    # Split the records into two subsets and write to separate files, each
    # shard starting with its own copy of the header line.
    tmp_dir = self._get_temp_dir()
    self._write_records_to_csv([header] + records[0:3], tmp_dir,
                               'input_data1.csv')
    self._write_records_to_csv([header] + records[3:], tmp_dir,
                               'input_data2.csv')
    input_data_path = os.path.join(tmp_dir, 'input_data*')
    actual = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=None,
        delimiter=',',
        stats_options=self._default_stats_options)
    assert_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    assert_fn([actual])
def test_stats_gen_with_csv_no_header_in_file(self, compression_type):
    """Headerless CSV stats generation under each supported compression."""
    records, header, expected_result = self._get_csv_test(
        delimiter=',', with_header=False)
    # Map the Beam compression enum onto the CSV writer's compression arg.
    writer_compression = {
        CompressionTypes.AUTO: '',
        CompressionTypes.GZIP: 'gzip',
    }[compression_type]
    input_data_path = self._write_records_to_csv(
        records,
        self._get_temp_dir(),
        'input_data.csv',
        compression_type=writer_compression)
    actual = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        column_names=header,
        delimiter=',',
        stats_options=self._default_stats_options,
        compression_type=compression_type)
    assert_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    assert_fn([actual])
def test_stats_gen_with_csv_with_schema(self):
    """A BYTES schema makes the numeric-looking column be treated as string."""
    # Single data row '1' under header 'feature1'.
    records = ['feature1', '1']
    input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                                 'input_data.csv')
    schema = text_format.Parse(
        """
        feature { name: "feature1" type: BYTES }
        """, schema_pb2.Schema())
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 1
          features {
            path { step: "feature1" }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  type: QUANTILES
                }
                tot_num_values: 1
              }
              unique: 1
              top_values { value: "1" frequency: 1.0 }
              avg_length: 1.0
              rank_histogram {
                buckets { label: "1" sample_count: 1.0 }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    # Attach the schema and ask stats generation to honor its types.
    self._default_stats_options.schema = schema
    self._default_stats_options.infer_type_from_schema = True
    result = stats_gen_lib.generate_statistics_from_csv(
        data_location=input_data_path,
        delimiter=',',
        stats_options=self._default_stats_options)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
def test_stats_gen_with_tfrecords_of_tfexamples(self, compression_type):
    """Generates statistics from a TFRecord file of tf.Examples, covering
    NONE and GZIP compression (parameterized via `compression_type`)."""
    examples = [
        self._make_example({
            'a': ('float', [1.0, 2.0]),
            'b': ('bytes', [b'a', b'b', b'c', b'e'])
        }),
        self._make_example({
            # NaN in 'a' should be excluded from the numeric histograms and
            # surface as `num_nan` in the expected proto below.
            'a': ('float', [3.0, 4.0, float('nan'), 5.0]),
            'b': ('bytes', [b'a', b'c', b'd', b'a'])
        }),
        self._make_example({
            'a': ('float', [1.0]),
            'b': ('bytes', [b'a', b'b', b'c', b'd'])
        })
    ]
    # Map the Beam compression enum onto TF's TFRecord writer options.
    tf_compression_lookup = {
        CompressionTypes.AUTO:
            tf.compat.v1.python_io.TFRecordCompressionType.NONE,
        CompressionTypes.GZIP:
            tf.compat.v1.python_io.TFRecordCompressionType.GZIP
    }
    input_data_path = self._write_tfexamples_to_tfrecords(
        examples, tf_compression_lookup[compression_type])
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            path { step: "a" }
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_missing: 0
                min_num_values: 1
                max_num_values: 4
                avg_num_values: 2.33333333
                tot_num_values: 7
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 4.0 sample_count: 1.5 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.5 }
                  type: QUANTILES
                }
              }
              mean: 2.66666666
              std_dev: 1.49071198
              num_zeros: 0
              min: 1.0
              max: 5.0
              median: 3.0
              histograms {
                num_nan: 1
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 3.0 }
                buckets { low_value: 3.0 high_value: 5.0 sample_count: 3.0 }
                type: STANDARD
              }
              histograms {
                num_nan: 1
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 3.0 }
                buckets { low_value: 3.0 high_value: 5.0 sample_count: 3.0 }
                type: QUANTILES
              }
            }
          }
          features {
            path { step: "b" }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                min_num_values: 4
                max_num_values: 4
                avg_num_values: 4.0
                tot_num_values: 12
                num_values_histogram {
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.5 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.5 }
                  type: QUANTILES
                }
              }
              unique: 5
              top_values { value: "a" frequency: 4.0 }
              top_values { value: "c" frequency: 3.0 }
              avg_length: 1.0
              rank_histogram {
                buckets {
                  low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0
                }
                buckets {
                  low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    result = stats_gen_lib.generate_statistics_from_tfrecord(
        data_location=input_data_path,
        stats_options=self._default_stats_options,
        compression_type=compression_type)
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([result])
def test_stats_pipeline_with_examples_with_no_values(self):
    """Features with empty value lists still get common stats (weighted by w).

    FIX: the original used `np.object`, removed in NumPy 1.24, and the
    abstract `np.floating` as a concrete dtype (deprecated in 1.20, later
    removed; it resolved to float64). Replaced with `object` / `np.float64`,
    which are behaviorally identical on all supported NumPy versions.
    """

    def _make_example():
        # One example whose features a/b/c are empty; w is the weight.
        return {
            'a': np.array([], dtype=np.float64),
            'b': np.array([], dtype=object),
            'c': np.array([], dtype=np.int32),
            'w': np.array([2]),
        }

    # Input with three identical examples.
    examples = [_make_example() for _ in range(3)]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            name: 'a'
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            name: 'b'
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            name: 'c'
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            weight_feature='w',
            num_top_values=1,
            num_rank_histogram_buckets=1,
            num_values_histogram_buckets=2,
            num_histogram_buckets=1,
            num_quantiles_histogram_buckets=1,
            epsilon=0.001)
        result = (
            p | beam.Create(examples) | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_pipeline_with_examples_with_no_values(self):
    """RecordBatch variant: empty-valued features a/b/c plus weight column w.

    Unlike the a/b/c columns, w is non-empty, so its expected stats include
    full numeric and weighted statistics.
    """
    record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.RecordBatch.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w'])
    ]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            path { step: 'a' }
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'b' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'c' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'w' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_missing: 0
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                tot_num_values: 3
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats {
                  num_non_missing: 6.0
                  avg_num_values: 1.0
                  tot_num_values: 6.0
                }
              }
              mean: 2.0
              std_dev: 0.0
              min: 2.0
              max: 2.0
              median: 2.0
              histograms {
                buckets { low_value: 2.0 high_value: 2.0 sample_count: 3.0 }
                type: STANDARD
              }
              histograms {
                buckets { low_value: 2.0 high_value: 2.0 sample_count: 3.0 }
                type: QUANTILES
              }
              weighted_numeric_stats {
                mean: 2.0
                median: 2.0
                histograms {
                  buckets { low_value: 2.0 high_value: 2.0 sample_count: 6.0 }
                  type: STANDARD
                }
                histograms {
                  buckets { low_value: 2.0 high_value: 2.0 sample_count: 6.0 }
                  type: QUANTILES
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            weight_feature='w',
            num_top_values=1,
            num_rank_histogram_buckets=1,
            num_values_histogram_buckets=2,
            num_histogram_buckets=1,
            num_quantiles_histogram_buckets=1,
            epsilon=0.001)
        result = (p
                  | beam.Create(record_batches)
                  | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_pipeline(self):
    """Full statistics pipeline over three Arrow record batches.

    FIX: the original used `np.NaN`, an alias removed in NumPy 2.0; `np.nan`
    is the canonical, version-stable spelling and is the identical float.
    """
    record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[1.0, 2.0]]),
            pa.array([['a', 'b', 'c', 'e']]),
            pa.array([np.linspace(1, 500, 500, dtype=np.int32)]),
        ], ['a', 'b', 'c']),
        pa.RecordBatch.from_arrays([
            # The NaN surfaces as `num_nan` in the expected histograms.
            pa.array([[3.0, 4.0, np.nan, 5.0]]),
            pa.array([['a', 'c', 'd', 'a']]),
            pa.array([np.linspace(501, 1250, 750, dtype=np.int32)]),
        ], ['a', 'b', 'c']),
        pa.RecordBatch.from_arrays([
            pa.array([[1.0]]),
            pa.array([['a', 'b', 'c', 'd']]),
            pa.array([np.linspace(1251, 3000, 1750, dtype=np.int32)]),
        ], ['a', 'b', 'c'])
    ]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            path { step: 'a' }
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_missing: 0
                min_num_values: 1
                max_num_values: 4
                avg_num_values: 2.33333333
                tot_num_values: 7
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.0 }
                  buckets { low_value: 1.0 high_value: 4.0 sample_count: 1.0 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  type: QUANTILES
                }
              }
              mean: 2.66666666
              std_dev: 1.49071198
              num_zeros: 0
              min: 1.0
              max: 5.0
              median: 3.0
              histograms {
                num_nan: 1
                buckets {
                  low_value: 1.0 high_value: 2.3333333 sample_count: 2.9866667
                }
                buckets {
                  low_value: 2.3333333 high_value: 3.6666667
                  sample_count: 1.0066667
                }
                buckets {
                  low_value: 3.6666667 high_value: 5.0 sample_count: 2.0066667
                }
                type: STANDARD
              }
              histograms {
                num_nan: 1
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.5 }
                buckets { low_value: 3.0 high_value: 4.0 sample_count: 1.5 }
                buckets { low_value: 4.0 high_value: 5.0 sample_count: 1.5 }
                type: QUANTILES
              }
            }
          }
          features {
            path { step: 'c' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_missing: 0
                min_num_values: 500
                max_num_values: 1750
                avg_num_values: 1000.0
                tot_num_values: 3000
                num_values_histogram {
                  buckets {
                    low_value: 500.0 high_value: 500.0 sample_count: 1.0
                  }
                  buckets {
                    low_value: 500.0 high_value: 1750.0 sample_count: 1.0
                  }
                  buckets {
                    low_value: 1750.0 high_value: 1750.0 sample_count: 1.0
                  }
                  type: QUANTILES
                }
              }
              mean: 1500.5
              std_dev: 866.025355672
              min: 1.0
              max: 3000.0
              median: 1501.0
              histograms {
                buckets {
                  low_value: 1.0 high_value: 1000.66666667
                  sample_count: 999.666666667
                }
                buckets {
                  low_value: 1000.66666667 high_value: 2000.33333333
                  sample_count: 999.666666667
                }
                buckets {
                  low_value: 2000.33333333 high_value: 3000.0
                  sample_count: 1000.66666667
                }
                type: STANDARD
              }
              histograms {
                buckets {
                  low_value: 1.0 high_value: 751.0 sample_count: 750.0
                }
                buckets {
                  low_value: 751.0 high_value: 1501.0 sample_count: 750.0
                }
                buckets {
                  low_value: 1501.0 high_value: 2250.0 sample_count: 750.0
                }
                buckets {
                  low_value: 2250.0 high_value: 3000.0 sample_count: 750.0
                }
                type: QUANTILES
              }
            }
          }
          features {
            path { step: 'b' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                min_num_values: 4
                max_num_values: 4
                avg_num_values: 4.0
                tot_num_values: 12
                num_values_histogram {
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  type: QUANTILES
                }
              }
              unique: 5
              top_values { value: "a" frequency: 4.0 }
              top_values { value: "c" frequency: 3.0 }
              avg_length: 1.0
              rank_histogram {
                buckets {
                  low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0
                }
                buckets {
                  low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0
                }
                buckets {
                  low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            num_top_values=2,
            num_rank_histogram_buckets=3,
            num_values_histogram_buckets=3,
            num_histogram_buckets=3,
            num_quantiles_histogram_buckets=4,
            epsilon=0.001)
        result = (p
                  | beam.Create(record_batches)
                  | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_validate_examples_in_tfrecord(self, num_sampled_examples):
    """Validates tf.Examples in a TFRecord file and, when requested, checks
    the anomalous examples sampled per anomaly type."""
    input_examples = [
        # This example is anomalous because its feature contains a value that is
        # not in the string_domain specified in the schema.
        """
        features {
          feature {
            key: 'annotated_enum'
            value { bytes_list { value: [ 'D' ] } }
          }
        }
        """,
        # This example is anomalous because it contains a feature that is not
        # in the schema.
        """
        features {
          feature {
            key: 'annotated_enum'
            value { bytes_list { value: [ 'A' ] } }
          }
          feature {
            key: 'unknown_feature'
            value { bytes_list { value: [ 'A' ] } }
          }
        }
        """,
    ]
    schema = text_format.Parse(
        """
        string_domain {
          name: "MyAloneEnum"
          value: "A"
          value: "B"
          value: "C"
        }
        feature {
          name: "annotated_enum"
          value_count { min:1 max:1 }
          presence { min_count: 1 }
          type: BYTES
          domain: "MyAloneEnum"
        }
        """, schema_pb2.Schema())
    options = stats_options.StatsOptions(
        schema=schema,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2)
    # Serialize the two examples into a TFRecord input file.
    temp_dir_path = self.create_tempdir().full_path
    input_data_path = os.path.join(temp_dir_path, 'input_data.tfrecord')
    with tf.io.TFRecordWriter(input_data_path) as writer:
        for example in input_examples:
            example = text_format.Parse(example, tf.train.Example())
            writer.write(example.SerializeToString())
    # One dataset per detected anomaly type, each holding the stats of the
    # anomalous example(s) only.
    expected_result = text_format.Parse(
        """
        datasets {
          name: 'annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES'
          num_examples: 1
          features {
            path: { step: 'annotated_enum' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1
                num_missing: 0
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                tot_num_values: 1
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
              unique: 1
              top_values { value: "D" frequency: 1.0 }
              avg_length: 1.0
              rank_histogram {
                buckets { label: "D" sample_count: 1.0 }
              }
            }
          }
        }
        datasets {
          name: 'unknown_feature_SCHEMA_NEW_COLUMN'
          num_examples: 1
          features {
            path: { step: 'unknown_feature' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1
                num_missing: 0
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                tot_num_values: 1
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
              unique: 1
              top_values { value: "A" frequency: 1.0 }
              avg_length: 1.0
              rank_histogram {
                buckets { label: "A" sample_count: 1.0 }
              }
            }
          }
          features {
            path: { step: 'annotated_enum' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 1
                num_missing: 0
                min_num_values: 1
                max_num_values: 1
                avg_num_values: 1.0
                tot_num_values: 1
                num_values_histogram {
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  buckets { low_value: 1.0 high_value: 1.0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
              unique: 1
              top_values { value: "A" frequency: 1.0 }
              avg_length: 1.0
              rank_histogram {
                buckets { label: "A" sample_count: 1.0 }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    actual_result = validation_lib.validate_examples_in_tfrecord(
        data_location=input_data_path,
        stats_options=options,
        num_sampled_examples=num_sampled_examples)
    # With sampling enabled the function returns (stats, sampled_examples).
    if num_sampled_examples:
        actual_result, sampled_examples = actual_result
        self.assertCountEqual(
            [('annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES',
              [text_format.Parse(input_examples[0], tf.train.Example())]),
             ('unknown_feature_SCHEMA_NEW_COLUMN',
              [text_format.Parse(input_examples[1], tf.train.Example())])],
            sampled_examples.items())
    compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_result)
    compare_fn([actual_result])
def test_stats_pipeline_with_examples_with_no_values(self):
    """pa.Table variant: features a/b/c have empty value lists; w weights."""
    tables = [
        pa.Table.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.Table.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w']),
        pa.Table.from_arrays([
            pa.array([[]], type=pa.list_(pa.float32())),
            pa.array([[]], type=pa.list_(pa.binary())),
            pa.array([[]], type=pa.list_(pa.int32())),
            pa.array([[2]]),
        ], ['a', 'b', 'c', 'w'])
    ]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            path { step: 'a' }
            type: FLOAT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'b' }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
          features {
            path { step: 'c' }
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 3
                num_values_histogram {
                  buckets { sample_count: 1.5 }
                  buckets { sample_count: 1.5 }
                  type: QUANTILES
                }
                weighted_common_stats { num_non_missing: 6 }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            weight_feature='w',
            num_top_values=1,
            num_rank_histogram_buckets=1,
            num_values_histogram_buckets=2,
            num_histogram_buckets=1,
            num_quantiles_histogram_buckets=1,
            epsilon=0.001)
        result = (p
                  | beam.Create(tables)
                  | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_pipeline_with_feature_whitelist(self):
    """Only whitelisted features appear in the generated statistics.

    FIX: the original used `np.NaN`, removed in NumPy 2.0; `np.nan` is the
    canonical, version-stable spelling and is the identical float value.
    """
    # Input with three examples; only feature 'b' is whitelisted below.
    examples = [{
        'a': np.array([1.0, 2.0]),
        'b': np.array(['a', 'b', 'c', 'e']),
        'c': np.linspace(1, 500, 500, dtype=np.int32)
    }, {
        'a': np.array([3.0, 4.0, np.nan, 5.0]),
        'b': np.array(['a', 'c', 'd', 'a']),
        'c': np.linspace(501, 1250, 750, dtype=np.int32)
    }, {
        'a': np.array([1.0]),
        'b': np.array(['a', 'b', 'c', 'd']),
        'c': np.linspace(1251, 3000, 1750, dtype=np.int32)
    }]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            name: "b"
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 3
                min_num_values: 4
                max_num_values: 4
                avg_num_values: 4.0
                tot_num_values: 12
                num_values_histogram {
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                  type: QUANTILES
                }
              }
              unique: 5
              top_values { value: "a" frequency: 4.0 }
              top_values { value: "c" frequency: 3.0 }
              avg_length: 1.0
              rank_histogram {
                buckets {
                  low_rank: 0 high_rank: 0 label: "a" sample_count: 4.0
                }
                buckets {
                  low_rank: 1 high_rank: 1 label: "c" sample_count: 3.0
                }
                buckets {
                  low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0
                }
              }
            }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            feature_whitelist=['b'],
            num_top_values=2,
            num_rank_histogram_buckets=3,
            num_values_histogram_buckets=3,
            num_histogram_buckets=3,
            num_quantiles_histogram_buckets=4)
        result = (
            p | beam.Create(examples) | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_custom_generators(self):
    """Custom stats from a user PTransform are merged into the output."""

    # Dummy PTransform that returns two DatasetFeatureStatistics protos.
    class CustomPTransform(beam.PTransform):

        def expand(self, pcoll):
            # Emits one custom stat for feature 'a' and one for feature 'b';
            # the input PCollection is ignored.
            stats_proto1 = statistics_pb2.DatasetFeatureStatistics()
            proto1_feat = stats_proto1.features.add()
            proto1_feat.name = 'a'
            custom_stat1 = proto1_feat.custom_stats.add()
            custom_stat1.name = 'my_stat_a'
            custom_stat1.str = 'my_val_a'

            stats_proto2 = statistics_pb2.DatasetFeatureStatistics()
            proto2_feat = stats_proto2.features.add()
            proto2_feat.name = 'b'
            custom_stat2 = proto2_feat.custom_stats.add()
            custom_stat2.name = 'my_stat_b'
            custom_stat2.str = 'my_val_b'
            return [stats_proto1, stats_proto2]

    # One example with two empty int features.
    examples = [{
        'a': np.array([], dtype=np.int32),
        'b': np.array([], dtype=np.int32)
    }]
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 1
          features {
            name: 'a'
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 1
                num_missing: 0
                tot_num_values: 0
                num_values_histogram {
                  buckets { low_value: 0 high_value: 0 sample_count: 0.5 }
                  buckets { low_value: 0 high_value: 0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
            }
            custom_stats { name: 'my_stat_a' str: 'my_val_a' }
          }
          features {
            name: 'b'
            type: INT
            num_stats {
              common_stats {
                num_non_missing: 1
                num_missing: 0
                tot_num_values: 0
                num_values_histogram {
                  buckets { low_value: 0 high_value: 0 sample_count: 0.5 }
                  buckets { low_value: 0 high_value: 0 sample_count: 0.5 }
                  type: QUANTILES
                }
              }
            }
            custom_stats { name: 'my_stat_b' str: 'my_val_b' }
          }
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())

    # Create a transform stats generator.
    transform_stats_gen = stats_generator.TransformStatsGenerator(
        name='CustomStatsGenerator', ptransform=CustomPTransform())
    with beam.Pipeline() as p:
        options = stats_options.StatsOptions(
            generators=[transform_stats_gen], num_values_histogram_buckets=2)
        result = (
            p | beam.Create(examples) | stats_api.GenerateStatistics(options))
        util.assert_that(
            result,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_result))
def test_stats_pipeline_with_weight_feature(self):
  """Weighted statistics are computed from the 'w' weight feature.

  Each example carries a per-example weight in feature 'w'; the expected
  result contains both unweighted and weighted common/numeric/string
  stats for features 'a' and 'b'.
  """
  # Input with four examples; 'b' and 'a' are each missing once.
  examples = [{
      'a': np.array([1.0, 2.0]),
      'b': np.array(['a', 'b', 'c', 'e']),
      'w': np.array([1.0])
  }, {
      'a': np.array([3.0, 4.0, 5.0]),
      'b': None,
      'w': np.array([2.0])
  }, {
      'a': np.array([1.0]),
      'b': np.array(['d', 'e']),
      'w': np.array([3.0])
  }, {
      'a': None,
      'b': np.array(['a', 'c', 'd', 'a']),
      'w': np.array([1.0])
  }]
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 4
        features {
          name: 'a'
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 1
              min_num_values: 1
              max_num_values: 3
              avg_num_values: 2.0
              tot_num_values: 6
              num_values_histogram {
                buckets { low_value: 1.0 high_value: 2.0 sample_count: 1 }
                buckets { low_value: 2.0 high_value: 3.0 sample_count: 1 }
                buckets { low_value: 3.0 high_value: 3.0 sample_count: 1 }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6.0
                num_missing: 1.0
                avg_num_values: 1.83333333
                tot_num_values: 11.0
              }
            }
            mean: 2.66666666
            std_dev: 1.49071198
            num_zeros: 0
            min: 1.0
            max: 5.0
            median: 3.0
            histograms {
              buckets {
                low_value: 1.0 high_value: 2.3333333 sample_count: 2.9866667
              }
              buckets {
                low_value: 2.3333333 high_value: 3.6666667
                sample_count: 1.0066667
              }
              buckets {
                low_value: 3.6666667 high_value: 5.0 sample_count: 2.0066667
              }
              type: STANDARD
            }
            histograms {
              buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
              buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.5 }
              buckets { low_value: 3.0 high_value: 4.0 sample_count: 1.5 }
              buckets { low_value: 4.0 high_value: 5.0 sample_count: 1.5 }
              type: QUANTILES
            }
            weighted_numeric_stats {
              mean: 2.7272727
              std_dev: 1.5427784
              median: 3.0
              histograms {
                buckets {
                  low_value: 1.0 high_value: 2.3333333 sample_count: 4.9988889
                }
                buckets {
                  low_value: 2.3333333 high_value: 3.6666667
                  sample_count: 1.9922222
                }
                buckets {
                  low_value: 3.6666667 high_value: 5.0 sample_count: 4.0088889
                }
              }
              histograms {
                buckets { low_value: 1.0 high_value: 1.0 sample_count: 2.75 }
                buckets { low_value: 1.0 high_value: 3.0 sample_count: 2.75 }
                buckets { low_value: 3.0 high_value: 4.0 sample_count: 2.75 }
                buckets { low_value: 4.0 high_value: 5.0 sample_count: 2.75 }
                type: QUANTILES
              }
            }
          }
        }
        features {
          name: 'b'
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_missing: 1
              min_num_values: 2
              max_num_values: 4
              avg_num_values: 3.33333301544
              num_values_histogram {
                buckets { low_value: 2.0 high_value: 4.0 sample_count: 1.0 }
                buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 5.0
                num_missing: 2.0
                avg_num_values: 2.8
                tot_num_values: 14.0
              }
              tot_num_values: 10
            }
            avg_length: 1.0
            unique: 5
            top_values { value: 'a' frequency: 3.0 }
            top_values { value: 'e' frequency: 2.0 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
              buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 }
              buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
            }
            weighted_string_stats {
              top_values { value: 'e' frequency: 4.0 }
              top_values { value: 'd' frequency: 4.0 }
              rank_histogram {
                buckets {
                  low_rank: 0 high_rank: 0 label: "e" sample_count: 4.0
                }
                buckets {
                  low_rank: 1 high_rank: 1 label: "d" sample_count: 4.0
                }
                buckets {
                  low_rank: 2 high_rank: 2 label: "a" sample_count: 3.0
                }
              }
            }
          }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        weight_feature='w',
        num_top_values=2,
        num_rank_histogram_buckets=3,
        num_values_histogram_buckets=3,
        num_histogram_buckets=3,
        num_quantiles_histogram_buckets=4)
    created = p | beam.Create(examples)
    result = created | stats_api.GenerateStatistics(options)
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
def test_stats_pipeline_with_schema(self):
  """A schema marking an INT feature categorical yields string stats.

  Feature 'a' holds integers, but the supplied schema declares it
  categorical (int_domain.is_categorical), so the generated statistics
  must be string_stats rather than num_stats.
  """
  # Input with three examples of four integer values each.
  examples = [{
      'a': np.array([1, 3, 5, 7])
  }, {
      'a': np.array([2, 4, 6, 8])
  }, {
      'a': np.array([0, 3, 6, 9])
  }]
  schema = text_format.Parse(
      """
      feature {
        name: "a"
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 3
        features {
          name: "a"
          type: INT
          string_stats {
            common_stats {
              num_non_missing: 3
              min_num_values: 4
              max_num_values: 4
              avg_num_values: 4.0
              tot_num_values: 12
              num_values_histogram {
                buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
                type: QUANTILES
              }
            }
            unique: 10
            top_values { value: "6" frequency: 2.0 }
            top_values { value: "3" frequency: 2.0 }
            avg_length: 1.0
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "6" sample_count: 2.0 }
              buckets { low_rank: 1 high_rank: 1 label: "3" sample_count: 2.0 }
              buckets { low_rank: 2 high_rank: 2 label: "9" sample_count: 1.0 }
            }
          }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        schema=schema,
        num_top_values=2,
        num_rank_histogram_buckets=3,
        num_values_histogram_buckets=3)
    created = p | beam.Create(examples)
    result = created | stats_api.GenerateStatistics(options)
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))