def testGetBroadcastableColumnNotFound(self):
    """Checks that requesting a column absent from the table is rejected.

    The table only has columns "u" and "v"; asking
    `arrow_util.get_broadcastable_column` for "w" must raise a ValueError
    whose message names the missing column.
    """
    table = pa.Table.from_arrays(
        [pa.array([[1], [2]]), pa.array([[1], [3]])],
        ["u", "v"])
    missing_column_pattern = r'Column "w" not present in the input table\.'
    with self.assertRaisesRegex(ValueError, missing_column_pattern):
        arrow_util.get_broadcastable_column(table, column_name="w")
def testGetArrayBroadcastString(self):
    """Checks `get_array` with a string-valued broadcast column.

    The nested feature f.sf.ssf is looked up with column "w" supplied as
    `broadcast_column_name`; the test expects the flattened feature array
    plus the "w" values repeated as ["one", "one", "two"].
    """
    nested_feature = pa.array([
        [{"sf": [{"ssf": [[1]]}, {"ssf": [[2]]}]}],
        [{"sf": [{"ssf": [[3], [4]]}]}],
    ])
    broadcast_values = pa.array([["one"], ["two"]])
    table = pa.Table.from_arrays(
        [nested_feature, broadcast_values], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    expected_arr = pa.array([[[1]], [[2]], [[3], [4]]])
    expected_weights = np.array(["one", "one", "two"])

    actual_arr, actual_weights = arrow_util.get_array(
        table, feature, broadcast_column_name="w")

    mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(actual_arr.equals(expected_arr), mismatch_message)
    np.testing.assert_array_equal(expected_weights, actual_weights)
def test_lift_min_x_count_filters_all(self):
    """Checks LiftStatsGenerator output when min_x_count=4.

    No x value in the input occurs four times ('a' occurs three times, 'b'
    once), and the expected output is an empty list.
    """
    x_column = pa.array([['a'], ['a'], ['b'], ['a']])
    y_column = pa.array([['cat'], ['dog'], ['cat'], ['dog']])
    examples = [
        pa.Table.from_arrays(
            [x_column, y_column], ['categorical_x', 'string_y']),
    ]
    schema_text = (
        " feature { name: 'categorical_x' type: BYTES }"
        " feature { name: 'string_y' type: BYTES } ")
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    expected_result = []
    generator = lift_stats_generator.LiftStatsGenerator(
        schema=schema,
        y_path=types.FeaturePath(['string_y']),
        min_x_count=4)
    self.assertSlicingAwareTransformOutputEqual(
        examples,
        generator,
        expected_result,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
def testGetArrayReturnExampleIndices(self):
    """Checks `get_array` with `return_example_indices=True`.

    For the nested feature f.sf.ssf the test expects the flattened array
    [[1], [2], [3, 4]] together with example indices [0, 0, 1].
    """
    nested_feature = pa.array([
        [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}],
        [{"sf": [{"ssf": [3, 4]}]}],
    ])
    # Column "w" is part of the table but is not requested in this test.
    other_column = pa.array([["one"], ["two"]])
    table = pa.Table.from_arrays(
        [nested_feature, other_column], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])

    actual_arr, actual_indices = arrow_util.get_array(
        table, feature, return_example_indices=True)

    mismatch_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(actual_arr.equals(expected_arr), mismatch_message)
    np.testing.assert_array_equal(expected_indices, actual_indices)
def test_nl_generator_values_threshold_check(self):
    """Checks the values_threshold cutoff of NLStatsGenerator (fake heuristic)."""
    # Six 'MATCH' values in total; the all-None batch should be ignored.
    raw_batches = [
        [['MATCH', 'MATCH', 'MATCH'], ['MATCH']],
        [['MATCH', 'MATCH']],
        [None, None],
    ]
    input_batches = [
        pa.Column.from_array('feature', pa.array(values))
        for values in raw_batches
    ]

    # values_threshold=7 exceeds the six matches: empty stats expected.
    above_count_generator = nlsg.NLStatsGenerator(
        _FakeHeuristic(), values_threshold=7)
    self.assertCombinerOutputEqual(
        input_batches, above_count_generator,
        statistics_pb2.FeatureNameStatistics())

    # values_threshold=6 equals the six matches: stats expected.
    at_count_generator = nlsg.NLStatsGenerator(
        _FakeHeuristic(), values_threshold=6)
    domain_stat = statistics_pb2.CustomStatistic(
        name='domain_info', str='natural_language_domain {}')
    match_rate_stat = statistics_pb2.CustomStatistic(
        name='natural_language_match_rate', num=1.0)
    self.assertCombinerOutputEqual(
        input_batches, at_count_generator,
        statistics_pb2.FeatureNameStatistics(
            custom_stats=[domain_stat, match_rate_stat]))
def test_mi_with_invalid_features(self):
    """Checks that computing MI with only a multivalent feature raises.

    The batch holds a univalent label and a single feature with two values
    per example; `compute` is expected to raise a ValueError containing
    "Found array with 0 sample".
    """
    batch = pa.Table.from_arrays(
        [pa.array([[1]]), pa.array([[1, 2]])],
        ["label_key", "multivalent_feature"])
    schema_text = (
        ' feature { name: "label_key" type: INT shape { dim { size: 1 } } }'
        ' feature { name: "multivalent_feature" type: INT'
        ' value_count: { min: 2 max: 2 } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
    # which was removed in Python 3.12.
    with self.assertRaisesRegex(ValueError, "Found array with 0 sample"):
        sklearn_mutual_information.SkLearnMutualInformation(
            types.FeaturePath(["label_key"]), schema,
            TEST_SEED).compute(batch)
def test_nl_generator_avg_word_heuristic_match(self):
    """Checks the avg-word-length heuristic on mostly natural-language input.

    The generator is built without an explicit heuristic; the expected
    output carries the natural-language domain info and a match rate of
    0.8333333.
    """
    raw_batches = [
        [['This looks correct.', 'This one too, it should be text.'],
         ['xosuhddsofuhg123fdgosh']],
        [['This should be text as well', 'Here is another text']],
        [['This should also be considered good.']],
    ]
    input_batches = [
        pa.Column.from_array('feature', pa.array(values))
        for values in raw_batches
    ]
    generator = nlsg.NLStatsGenerator(values_threshold=2)

    domain_stat = statistics_pb2.CustomStatistic(
        name='domain_info', str='natural_language_domain {}')
    match_rate_stat = statistics_pb2.CustomStatistic(
        name='natural_language_match_rate', num=0.8333333)
    expected = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_stat, match_rate_stat])
    self.assertCombinerOutputEqual(input_batches, generator, expected)
def test_lift_no_categorical_features(self):
    """Checks LiftStatsGenerator output when x is a float feature.

    The only x column is the FLOAT feature 'continuous_x' and the label is
    the categorical int feature 'int_y'; the expected output is an empty
    list.
    """
    examples = [
        pa.Table.from_arrays([
            pa.array([[1.0], [2.0], [3.0], [4.0]]),
            pa.array([[1], [0], [1], [0]]),
        # The column name must match the schema's 'continuous_x'; it was
        # previously misspelled 'continous_x', so the schema entry never
        # applied to the data.
        ], ['continuous_x', 'int_y']),
    ]
    schema_text = (
        " feature { name: 'continuous_x' type: FLOAT }"
        " feature { name: 'int_y' type: INT"
        " int_domain { is_categorical: true } } ")
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    expected_result = []
    generator = lift_stats_generator.LiftStatsGenerator(
        schema=schema, y_path=types.FeaturePath(['int_y']))
    self.assertSlicingAwareTransformOutputEqual(
        examples,
        generator,
        expected_result,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
def test_batch_examples(self):
    """Checks BatchExamplesToArrowTables with desired_batch_size=2.

    Five example dicts go through the transform; the expected output is
    three Arrow tables, with None wherever an example lacks a feature that
    another example in the same expected table has.
    """
    examples = [
        {
            'a': np.array([1.0, 2.0], dtype=np.float32),
            'b': np.array(['a', 'b', 'c', 'e']),
        },
        {'a': np.array([3.0, 4.0, 5.0], dtype=np.float32)},
        {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30], dtype=np.int64),
        },
        {'b': np.array(['a', 'b', 'c'])},
        {'c': np.array(['d', 'e', 'f'])},
    ]

    first_table = pa.Table.from_arrays([
        pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                 type=pa.list_(pa.float32())),
        pa.array([['a', 'b', 'c', 'e'], None]),
    ], ['a', 'b'])
    second_table = pa.Table.from_arrays([
        pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
        pa.array([[10, 20, 30], None], type=pa.list_(pa.int64())),
    ], ['b', 'd'])
    third_table = pa.Table.from_arrays(
        [pa.array([['d', 'e', 'f']])], ['c'])
    expected_tables = [first_table, second_table, third_table]

    with beam.Pipeline() as p:
        batched = (
            p
            | beam.Create(examples)
            | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
        util.assert_that(
            batched,
            test_util.make_arrow_tables_equal_fn(self, expected_tables))
def test_mi_with_missing_label_key(self):
    """Checks that a label path missing from the schema raises ValueError.

    The schema declares "fa" and "label", but the generator is asked for
    "label_key"; the expected error message is
    "Feature label_key not found in the schema.".
    """
    batch = pa.Table.from_arrays(
        [pa.array([[1]]), pa.array([[1]])], ["label", "fa"])
    schema_text = (
        ' feature { name: "fa" type: FLOAT shape { dim { size: 1 } } }'
        ' feature { name: "label" type: FLOAT shape { dim { size: 1 } } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
    # which was removed in Python 3.12.
    with self.assertRaisesRegex(
            ValueError, "Feature label_key not found in the schema."):
        sklearn_mutual_information.SkLearnMutualInformation(
            types.FeaturePath(["label_key"]), schema,
            TEST_SEED).compute(batch)
def test_mi_with_multivalent_label(self):
    """Checks that a label with more than one value per example is rejected.

    The label column holds [1, 2] in a single example; the expected error
    is a ValueError containing "Label column contains unsupported data.".
    """
    batch = pa.Table.from_arrays(
        [pa.array([[1, 2]]), pa.array([[1]])], ["label_key", "fa"])
    schema_text = (
        ' feature { name: "fa" type: FLOAT shape { dim { size: 1 } } }'
        ' feature { name: "label_key" type: FLOAT'
        ' value_count: { min: 1 max: 2 } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
    # which was removed in Python 3.12.
    with self.assertRaisesRegex(
            ValueError, "Label column contains unsupported data."):
        sklearn_mutual_information.SkLearnMutualInformation(
            types.FeaturePath(["label_key"]), schema,
            TEST_SEED).compute(batch)
def test_time_stats_generator_match_ratio_with_same_valid_format(self):
    """Checks the match_ratio cutoff when all valid values share one format.

    Five of the ten input values are '2018-11-30' and five are 'not-valid';
    the expected time_match_ratio is 0.50.
    """
    valid_value = '2018-11-30'
    invalid_value = 'not-valid'
    input_batches = [
        pa.Column.from_array(
            'feature',
            pa.array([[valid_value] * 3, [valid_value] * 2])),
        pa.Column.from_array(
            'feature',
            pa.array([[invalid_value] * 3, [invalid_value] * 2])),
    ]

    # match_ratio=0.51 should not create stats.
    strict_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.51, values_threshold=5)
    self.assertCombinerOutputEqual(
        input_batches, strict_generator,
        statistics_pb2.FeatureNameStatistics())

    # match_ratio=0.49 should create stats.
    lenient_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.49, values_threshold=5)
    domain_stat = statistics_pb2.CustomStatistic(
        name='domain_info', str="time_domain {string_format: '%Y-%m-%d'}")
    ratio_stat = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.50)
    self.assertCombinerOutputEqual(
        input_batches, lenient_generator,
        statistics_pb2.FeatureNameStatistics(
            custom_stats=[domain_stat, ratio_stat]))
def test_time_stats_generator_values_threshold_check(self):
    """Checks the values_threshold cutoff of TimeStatsGenerator."""
    # Six values, all in the same date format, plus one all-None batch.
    date_value = '2018-11-30'
    raw_batches = [
        [[date_value] * 3, [date_value]],
        [[date_value] * 2],
        [None, None],
    ]
    input_batches = [
        pa.Column.from_array('feature', pa.array(values))
        for values in raw_batches
    ]

    # values_threshold=7 should not create stats.
    above_count_generator = time_stats_generator.TimeStatsGenerator(
        values_threshold=7)
    self.assertCombinerOutputEqual(
        input_batches, above_count_generator,
        statistics_pb2.FeatureNameStatistics())

    # values_threshold=6 should create stats.
    at_count_generator = time_stats_generator.TimeStatsGenerator(
        values_threshold=6)
    domain_stat = statistics_pb2.CustomStatistic(
        name='domain_info', str="time_domain {string_format: '%Y-%m-%d'}")
    ratio_stat = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=1.0)
    self.assertCombinerOutputEqual(
        input_batches, at_count_generator,
        statistics_pb2.FeatureNameStatistics(
            custom_stats=[domain_stat, ratio_stat]))
def test_invalid_input_type(self):
    """Checks that arrow_util helpers reject inputs of the wrong type.

    Every helper must raise RuntimeError for a plain int; the list-array
    helpers must reject a flat array, and the binary-array helper must
    reject a list-of-int array.
    """
    list_array_fns = [
        arrow_util.ListLengthsFromListArray,
        arrow_util.GetFlattenedArrayParentIndices,
    ]
    any_array_fns = [arrow_util.GetArrayNullBitmapAsByteArray]
    binary_array_fns = [arrow_util.GetBinaryArrayTotalByteSize]

    # A Python int is not an Arrow array at all.
    for fn in list_array_fns + any_array_fns + binary_array_fns:
        with self.assertRaisesRegex(RuntimeError, "Could not unwrap Array"):
            fn(1)

    flat_int_array = pa.array([1, 2, 3])
    for fn in list_array_fns:
        with self.assertRaisesRegex(
                RuntimeError, "Expected ListArray but got"):
            fn(flat_int_array)

    int_list_array = pa.array([[1, 2, 3]])
    for fn in binary_array_fns:
        with self.assertRaisesRegex(RuntimeError, "Expected BinaryArray"):
            fn(int_list_array)
def test_nl_generator_match_ratio_check(self):
    """Checks the match_ratio cutoff of NLStatsGenerator (fake heuristic).

    The expected natural_language_match_rate for the input is 0.7, so
    match_ratio=0.71 should yield no stats and match_ratio=0.69 should.
    """
    first_batch = pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH', 'Nope']])
    second_batch = pa.array([['MATCH', 'MATCH', 'MATCH']])
    third_batch = pa.array([['12345', 'No']])
    input_batches = [first_batch, second_batch, third_batch]

    # values_threshold=5 is used in both runs so that it always passes.
    strict_generator = nlsg.NLStatsGenerator(
        _FakeHeuristic(), match_ratio=0.71, values_threshold=5)
    self.assertCombinerOutputEqual(
        input_batches, strict_generator,
        statistics_pb2.FeatureNameStatistics())

    lenient_generator = nlsg.NLStatsGenerator(
        _FakeHeuristic(), match_ratio=0.69, values_threshold=5)
    domain_stat = statistics_pb2.CustomStatistic(
        name='domain_info', str='natural_language_domain {}')
    match_rate_stat = statistics_pb2.CustomStatistic(
        name='natural_language_match_rate', num=0.7)
    self.assertCombinerOutputEqual(
        input_batches, lenient_generator,
        statistics_pb2.FeatureNameStatistics(
            custom_stats=[domain_stat, match_rate_stat]))
def test_stats_pipeline_with_sample_count(self):
    """Runs GenerateStatistics with sample_count=3000 over three tables.

    The pipeline output is compared against
    `self._sampling_test_expected_result`.
    """
    # Three identical tables, each with one example of 3000 int32 values.
    tables = [
        pa.Table.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c'])
        for _ in range(3)
    ]
    options = stats_options.StatsOptions(
        sample_count=3000,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001,
        desired_batch_size=3000)
    with beam.Pipeline() as p:
        stats = (
            p
            | beam.Create(tables)
            | stats_api.GenerateStatistics(options))
        util.assert_that(
            stats,
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, self._sampling_test_expected_result))
def test_csv_decoder_with_schema(self):
    """Checks DecodeCSV when column types are taken from the schema.

    With `infer_type_from_schema=True`, the integer-looking first column is
    expected to decode as float32 because the schema declares it FLOAT.
    """
    input_lines = ['1,1,2.0,hello', '5,5,12.34,world']
    column_names = [
        'int_feature_parsed_as_float',
        'int_feature',
        'float_feature',
        'str_feature',
    ]
    schema_text = (
        ' feature { name: "int_feature_parsed_as_float" type: FLOAT }'
        ' feature { name: "int_feature" type: INT }'
        ' feature { name: "float_feature" type: FLOAT }'
        ' feature { name: "str_feature" type: BYTES } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())

    expected_columns = [
        pa.array([[1], [5]], pa.list_(pa.float32())),
        pa.array([[1], [5]], pa.list_(pa.int64())),
        pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
        pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
    ]
    # A copy keeps the expected names independent of the list handed to
    # the decoder.
    expected_result = [
        pa.Table.from_arrays(expected_columns, list(column_names))
    ]

    with beam.Pipeline() as p:
        decoded = (
            p
            | beam.Create(input_lines)
            | csv_decoder.DecodeCSV(
                column_names=column_names,
                schema=schema,
                infer_type_from_schema=True))
        util.assert_that(
            decoded,
            test_util.make_arrow_tables_equal_fn(self, expected_result))
def test_image_stats_generator_real_image(self):
    """Runs ImageStatsGenerator on files read from the testdata directory.

    The input mixes gif/png/bmp files with two non-image values; the
    expected output contains the image domain info, a format histogram and
    the maximum width and height.
    """
    test_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')

    def _load(file_name):
        # Reads one fixture file from the testdata directory.
        return _read_file(os.path.join(test_data_dir, file_name))

    batches = [
        pa.array([
            [
                _load('image1.gif'),
                _load('image2.png'),
                _load('not_a_image.abc'),
            ],
            [_load('image3.bmp'), b'not_a_image'],
        ]),
        pa.array([[_load('image4.png')]]),
    ]
    expected_text = (
        " custom_stats { name: 'domain_info' str: 'image_domain {}' }"
        " custom_stats {"
        " name: 'image_format_histogram'"
        " rank_histogram {"
        " buckets { label: 'UNKNOWN' sample_count: 2 }"
        " buckets { label: 'bmp' sample_count: 1 }"
        " buckets { label: 'gif' sample_count: 1 }"
        " buckets { label: 'png' sample_count: 2 }"
        " }"
        " }"
        " custom_stats { name: 'image_max_width' num: 51.0 }"
        " custom_stats { name: 'image_max_height' num: 26.0 } ")
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.FeatureNameStatistics())
    generator = image_stats_generator.ImageStatsGenerator(
        is_image_ratio_threshold=0.6,
        values_threshold=1,
        enable_size_stats=True)
    self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_image_stats_generator_disable_size_stats(self):
    """Checks that enable_size_stats=False omits the image size stats."""
    # Same input as test_image_stats_generator_check_is_image_ratio.
    encode = FakeImageDecoder.encode_image_metadata
    batches = [
        pa.array([
            [encode('PNG', 2, 4), encode('JPEG', 4, 2)],
            [encode('TIFF', 5, 1), encode('', -1, -1), encode('TIFF', 3, 7)],
        ]),
        pa.array([[encode('GIF', 2, 1)]]),
    ]
    # Only the domain info and the format histogram are expected; there are
    # no image size entries.
    expected_text = (
        " custom_stats { name: 'domain_info' str: 'image_domain {}' }"
        " custom_stats {"
        " name: 'image_format_histogram'"
        " rank_histogram {"
        " buckets { label: 'UNKNOWN' sample_count: 1 }"
        " buckets { label: 'GIF' sample_count: 1 }"
        " buckets { label: 'JPEG' sample_count: 1 }"
        " buckets { label: 'PNG' sample_count: 1 }"
        " buckets { label: 'TIFF' sample_count: 2 }"
        " }"
        " } ")
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.FeatureNameStatistics())
    image_decoder = FakeImageDecoder()
    generator = image_stats_generator.ImageStatsGenerator(
        image_decoder=image_decoder,
        is_image_ratio_threshold=0.8,
        values_threshold=1,
        enable_size_stats=False)
    self.assertCombinerOutputEqual(batches, generator, expected_result)
def testGetArrayEmptyPath(self):
    """Checks that `get_array` rejects an empty query path with KeyError."""
    table = pa.Table.from_arrays(
        [pa.array([[1], [2, 3]]), pa.array([[1], [2, 2]])],
        ["v", "w"])
    empty_path = types.FeaturePath([])
    with self.assertRaisesRegex(
            KeyError, r"query_path must be non-empty.*"):
        arrow_util.get_array(
            table, query_path=empty_path, broadcast_column_name="w")
def test_get_flattened_array_parent_indices(self):
    """Checks GetFlattenedArrayParentIndices on empty and non-empty input.

    For [[1.], [2.], [], [3.]] the expected parent indices are [0, 1, 3]:
    the empty list at position 2 contributes no entry.
    """
    cases = [
        (pa.array([], type=pa.list_(pa.int32())),
         pa.array([], type=pa.int32())),
        (pa.array([[1.], [2.], [], [3.]]),
         pa.array([0, 1, 3], type=pa.int32())),
    ]
    for list_array, expected_indices in cases:
        indices = arrow_util.GetFlattenedArrayParentIndices(list_array)
        self.assertTrue(indices.equals(expected_indices))
def test_flatten_list_array(self):
    """Checks FlattenListArray on an empty and a non-empty list array.

    For [[1.], [2.], [], [3.]] the expected flattened array is [1., 2., 3.].
    """
    cases = [
        (pa.array([], type=pa.list_(pa.int64())),
         pa.array([], type=pa.int64())),
        (pa.array([[1.], [2.], [], [3.]]),
         pa.array([1., 2., 3.])),
    ]
    for list_array, expected_flat in cases:
        flattened = arrow_util.FlattenListArray(list_array)
        self.assertTrue(flattened.equals(expected_flat))
def testGetBroadcastableColumnTooManyValues(self):
    """Checks that a broadcast column with two values in an example raises.

    Column "w" holds [2, 2] in its second example, so
    `get_broadcastable_column` is expected to raise a ValueError.
    """
    table = pa.Table.from_arrays(
        [pa.array([[1], [2, 3]]), pa.array([[1], [2, 2]])],
        ["v", "w"])
    too_many_values_pattern = (
        r'Column "w" must have exactly one value in each example\.')
    with self.assertRaisesRegex(ValueError, too_many_values_pattern):
        arrow_util.get_broadcastable_column(table, column_name="w")
def test_basic_stats_generator_categorical_feature(self):
    """Checks BasicStatsGenerator on an int feature marked categorical.

    The schema sets `is_categorical: true` on feature "c", and the expected
    output for it is a `string_stats` message.
    """
    batches = [
        pa.Table.from_arrays([pa.array([[1, 5, 10], [0]])], ['c']),
        pa.Table.from_arrays([pa.array([[1, 1, 1, 5, 15], [-1]])], ['c']),
    ]
    expected_stats_text = (
        " path { step: 'c' }"
        " string_stats {"
        " common_stats {"
        " num_non_missing: 4"
        " min_num_values: 1"
        " max_num_values: 5"
        " avg_num_values: 2.5"
        " num_values_histogram {"
        " buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.3333333 }"
        " buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.3333333 }"
        " buckets { low_value: 3.0 high_value: 5.0 sample_count: 1.3333333 }"
        " type: QUANTILES"
        " }"
        " tot_num_values: 10"
        " }"
        " avg_length: 1.29999995232"
        " } ")
    expected_result = {
        types.FeaturePath(['c']): text_format.Parse(
            expected_stats_text, statistics_pb2.FeatureNameStatistics()),
    }
    schema_text = (
        ' feature { name: "c" type: INT'
        ' int_domain { is_categorical: true } } ')
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    generator = basic_stats_generator.BasicStatsGenerator(
        schema=schema,
        num_values_histogram_buckets=3,
        num_histogram_buckets=3,
        num_quantiles_histogram_buckets=4)
    self.assertCombinerOutputEqual(batches, generator, expected_result)
def test_basic_stats_generator_feature_with_different_types(self):
    """Checks that one feature with two value types across batches raises.

    Feature 'a' holds floats in the first batch and ints in the second; the
    expected failure is a TypeError containing 'Cannot determine the type'.
    """
    batches = [
        pa.Table.from_arrays(
            [pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]])], ['a']),
        pa.Table.from_arrays([pa.array([[1]])], ['a']),
    ]
    generator = basic_stats_generator.BasicStatsGenerator()
    # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
    # which was removed in Python 3.12.
    with self.assertRaisesRegex(TypeError, 'Cannot determine the type'):
        self.assertCombinerOutputEqual(batches, generator, None)
def testInvalidWeightColumnStringValues(self):
    """Checks that `enumerate_arrays` rejects a string-valued weight column."""
    table = pa.Table.from_arrays(
        [pa.array([[1], [2, 3]]), pa.array([["two"], ["two"]])],
        ["v", "w"])
    non_numeric_pattern = 'Weight feature "w" must be of numeric type.*'
    with self.assertRaisesRegex(ValueError, non_numeric_pattern):
        # The result must be iterated for the error to surface; list()
        # drains it and the values are discarded.
        list(arrow_util.enumerate_arrays(
            table, weight_column="w", enumerate_leaves_only=False))
def testInvalidWeightColumn(self):
    """Checks that a weight column with an empty example is rejected.

    Column "w" has no value in its second example, so iterating
    `enumerate_arrays` is expected to raise a ValueError.
    """
    table = pa.Table.from_arrays(
        [pa.array([[1], [2, 3]]), pa.array([[1], []])],
        ["v", "w"])
    bad_weight_pattern = (
        "weight feature must have exactly one value in each example")
    with self.assertRaisesRegex(ValueError, bad_weight_pattern):
        # The result must be iterated for the error to surface; list()
        # drains it and the values are discarded.
        list(arrow_util.enumerate_arrays(
            table, weight_column="w", enumerate_leaves_only=False))
def test_all_null_mask_one_null(self):
    """Checks `all_null_mask` when only one of two paths is entirely null.

    'f1' has a value in every row while 'f2' is a null-typed column, and
    the expected mask is [False, False].
    """
    populated_column = pa.array([[1], [1]])
    null_column = pa.array([None, None], type=pa.null())
    table = pa.Table.from_arrays(
        [populated_column, null_column], ['f1', 'f2'])
    batch = input_batch.InputBatch(table)

    path1 = types.FeaturePath(['f1'])
    path2 = types.FeaturePath(['f2'])
    expected_mask = np.array([False, False])

    actual_mask = batch.all_null_mask(path1, path2)
    np.testing.assert_array_equal(actual_mask, expected_mask)
def test_nl_generator_avg_word_heuristic_non_match(self):
    """Checks the avg-word-length heuristic on mostly non-text input.

    The generator is built without an explicit heuristic, and the expected
    output is an empty FeatureNameStatistics.
    """
    first_batch = pa.array(
        [['abc' * 10, 'xxxxxxxxx'], ['xosuhddsofuhg123fdgosh']])
    second_batch = pa.array([['Only one valid text?']])
    input_batches = [first_batch, second_batch]

    generator = nlsg.NLStatsGenerator(values_threshold=2)
    empty_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, empty_stats)
def test_count_missing_generator_required_path(self):
    """Checks CountMissingGenerator when a required path is supplied.

    'index' and 'value' hold identical data ([[1], None, []]); with 'value'
    passed as the required path, the expected extracted count is 0.
    """
    table = pa.Table.from_arrays(
        [pa.array([[1], None, []]), pa.array([[1], None, []])],
        ['index', 'value'])
    batch = input_batch.InputBatch(table)

    counted_path = types.FeaturePath(['index'])
    required_path = types.FeaturePath(['value'])
    generator = count_missing_generator.CountMissingGenerator(
        counted_path, [required_path])

    accumulator = generator.add_input(generator.create_accumulator(), batch)
    self.assertEqual(0, generator.extract_output(accumulator))