def test_correct_rows_ingested(self):
    """Null entries in each column are counted and indexed correctly.

    The two column labels are deliberately the string '1' and the int 1
    to confirm both label types are profiled independently.
    """
    raw = {
        '1': ['nan', 'null', None, None, ''],
        1: ['nan', 'None', 'null', None, ''],
    }
    frame = pd.DataFrame(data=raw)
    options = ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})
    profiler = dp.Profiler(frame, len(frame),
                           profiler_options=options)

    expected_indices = {
        '1': {'': {4}, 'nan': {0}, 'None': {2, 3}, 'null': {1}},
        1: {'': {4}, 'nan': {0}, 'None': {1, 3}, 'null': {2}},
    }
    for col, expected_index in expected_indices.items():
        col_profile = profiler.profile[col]
        # All four textual null representations should be recognized.
        self.assertCountEqual(['', 'nan', 'None', 'null'],
                              col_profile.null_types)
        # Every one of the 5 rows is null in both columns.
        self.assertEqual(5, col_profile.null_count)
        self.assertEqual(expected_index, col_profile.null_types_index)
def test_correct_total_sample_size_and_counts_and_mutability(self):
    """Profiling the same frame twice updates the totals correctly and
    leaves the caller's DataFrame unmodified."""
    rows = [['test1', 1.0], ['test2', 2.0], ['test3', 3.0],
            [None, None], ['test5', 5.0], ['test6', 6.0],
            [None, None], ['test7', 7.0]]
    data = pd.DataFrame(rows, columns=['NAME', 'VALUE'])
    options = ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})

    name_len_before = len(data['NAME'])
    value_len_before = len(data['VALUE'])

    # Run the full scenario twice to prove the input frame is immutable
    # across profiling runs.
    for _ in range(2):
        # First pass over the frame indexed as rows 0-7.
        data.index = pd.RangeIndex(0, 8)
        profile = dp.Profiler(data, profiler_options=options,
                              samples_per_update=2)
        # Second pass over the same frame re-indexed as rows 8-15.
        data.index = pd.RangeIndex(8, 16)
        profile.update_profile(data)

        # rows sampled are [5, 6], [13, 14] (0 index)
        self.assertEqual(16, profile.total_samples)
        self.assertEqual(4, profile._max_col_samples_used)
        self.assertEqual(2, profile.row_has_null_count)
        self.assertEqual(0.5, profile._get_row_has_null_ratio())
        self.assertEqual(2, profile.row_is_null_count)
        self.assertEqual(0.5, profile._get_row_is_null_ratio())
        self.assertEqual(0.4375, profile._get_unique_row_ratio())
        self.assertEqual(9, profile._get_duplicate_row_count())

        # Column lengths must be untouched by the profiling runs.
        self.assertEqual(name_len_before, len(data['NAME']))
        self.assertEqual(value_len_before, len(data['VALUE']))
def test_sample_size_passed_to_profile(self, *mocks): update_mock = mocks[0] # data setup data = pd.DataFrame([0] * int(50e3)) # option setup profiler_options = ProfilerOptions() profiler_options.structured_options.multiprocess.is_enabled = False profiler_options.set({'data_labeler.is_enabled': False}) # test data size < min_sample_size = 5000 by default profiler = dp.Profiler(data[:1000], profiler_options=profiler_options) profiler._min_sample_size = 5000 profiler._sampling_ratio = 0.2 self.assertEqual(1000, update_mock.call_args[0][1]) # test data size * 0.20 < min_sample_size < data size profiler = dp.Profiler(data[:10000], profiler_options=profiler_options) profiler._min_sample_size = 5000 profiler._sampling_ratio = 0.2 self.assertEqual(5000, update_mock.call_args[0][1]) # test min_sample_size > data size * 0.20 profiler = dp.Profiler(data, profiler_options=profiler_options) profiler._min_sample_size = 5000 profiler._sampling_ratio = 0.2 self.assertEqual(10000, update_mock.call_args[0][1])
def setUpClass(cls):
    """Load the AWS honeypot CSV once and profile it for all tests."""
    cls.input_file_path = os.path.join(
        test_root_path, 'data', 'csv/aws_honeypot_marx_geo.csv')
    cls.aws_dataset = pd.read_csv(cls.input_file_path)

    # Disable the data labeler to keep the fixture lightweight.
    opts = ProfilerOptions()
    opts.set({'data_labeler.is_enabled': False})
    cls.trained_schema = dp.Profiler(
        cls.aws_dataset, len(cls.aws_dataset), profiler_options=opts)
def test_null_calculation_with_differently_sampled_cols(self):
    """Row-level null stats must respect each column's sampled subset."""
    opts = ProfilerOptions()
    opts.structured_options.multiprocess.is_enabled = False

    frame = pd.DataFrame({
        "full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "sparse": [1, None, 3, None, 5, None, 7, None, 9],
    })
    profile = dp.Profiler(frame, samples_per_update=5, min_true_samples=5,
                          profiler_options=opts)
    # Rows 2, 4, 5, 6, 7 are sampled in the first column, so only those
    # rows count toward null calculations. Within that subset the second
    # column is null only at rows 5 and 7, hence row_has_null_count == 2.
    self.assertEqual(0, profile.row_is_null_count)
    self.assertEqual(2, profile.row_has_null_count)
    # Ratios are over the 5 commonly sampled rows
    # (5 sampled in col 1, 9 in col 2).
    self.assertEqual(0, profile._get_row_is_null_ratio())
    self.assertEqual(0.4, profile._get_row_has_null_ratio())

    frame2 = pd.DataFrame({
        "sparse": [1, None, 3, None, 5, None, 7, None],
        "sparser": [1, None, None, None, None, None, None, 8],
    })
    profile2 = dp.Profiler(frame2, samples_per_update=2, min_true_samples=2,
                           profiler_options=opts)
    # Sampling order is [6, 5], [1, 4], [2, 3], [0, 7]. The first column
    # reaches its min true samples from ids 1, 4, 5, 6 while the second
    # is completely sampled (null at 1, 4, 5, 6). Rows 1 and 5 are null
    # in both columns; rows 4 and 6 only in the second.
    self.assertEqual(2, profile2.row_is_null_count)
    self.assertEqual(4, profile2.row_has_null_count)
    # Only 4 rows were commonly sampled, so ratios use that denominator.
    self.assertEqual(0.5, profile2._get_row_is_null_ratio())
    self.assertEqual(1, profile2._get_row_has_null_ratio())
def test_correct_null_row_counts(self):
    """Row-null counts and ratios are correct for two CSV fixtures."""
    options = ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})

    # Fixture 1: 2 of 8 rows are fully empty.
    empty_rows_path = os.path.join(
        test_root_path, 'data', 'csv/empty_rows.txt')
    profile = dp.Profiler(pd.read_csv(empty_rows_path),
                          profiler_options=options)
    self.assertEqual(2, profile.row_has_null_count)
    self.assertEqual(0.25, profile._get_row_has_null_ratio())
    self.assertEqual(2, profile.row_is_null_count)
    self.assertEqual(0.25, profile._get_row_is_null_ratio())

    # Fixture 2: 13 of 24 rows contain a null; 3 rows are entirely null.
    iris_path = os.path.join(
        test_root_path, 'data', 'csv/iris-with-null-rows.csv')
    profile = dp.Profiler(pd.read_csv(iris_path),
                          profiler_options=options)
    self.assertEqual(13, profile.row_has_null_count)
    self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
def test_validate(self):
    """validate() returns None when valid; otherwise it raises or
    returns a list of error strings depending on ``raise_error``."""
    option = self.get_options()
    optpth = self.get_options_path()

    # A default configuration is valid.
    self.assertEqual(None, option.validate())

    # Break every text.is_enabled option with a non-boolean value.
    for key in self.keys:
        option.set({"{}.text.is_enabled".format(key): "Hello World"})
    expected_error = {
        "{}.{}.text.is_enabled must be a "
        "Boolean.".format(optpth, key) for key in self.keys
    }

    # The expected errors must appear among those raised...
    with self.assertRaises(ValueError) as cm:
        option.validate(raise_error=True)
    raised_error = set(str(cm.exception).split("\n"))
    self.assertSetEqual(expected_error,
                        expected_error.intersection(raised_error))
    # ...and among those returned without raising.
    returned_errors = set(option.validate(raise_error=False))
    self.assertSetEqual(expected_error,
                        expected_error.intersection(returned_errors))

    # Wrong class types for the nested option groups.
    option = self.get_options()
    option.structured_options = ProfilerOptions()
    option.unstructured_options = ProfilerOptions()
    expected_error = [
        "{}.structured_options must be a StructuredOptions.".format(
            optpth),
        "{}.unstructured_options must be an UnstructuredOptions.".format(
            optpth),
    ]
    with self.assertRaisesRegex(ValueError, "\n".join(expected_error)):
        option.validate()
    self.assertListEqual(expected_error,
                         option.validate(raise_error=False))
def test_null_in_file(self):
    """null_types_index entries are stringified in the pretty report."""
    sparse_file = os.path.join(
        test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
    options = ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})

    profile = dp.Profiler(dp.Data(sparse_file), profiler_options=options)
    report = profile.report(report_options={"output_format": "pretty"})

    count_stats = report['data_stats']['COUNT']['statistics']
    self.assertEqual(count_stats['null_types_index'],
                     {'': '[2, 3, 4, 5, 7, 8]'})

    numbers_stats = report['data_stats'][' NUMBERS']['statistics']
    self.assertEqual(numbers_stats['null_types_index'],
                     {'': '[5, 6, 8]', ' ': '[2, 4]'})
def test_validate_helper(self):
    """_validate_helper() returns [] when valid, else a list of errors."""
    option = self.get_options()
    optpth = self.get_options_path()

    # A default configuration produces no errors.
    self.assertEqual([], option._validate_helper())

    # The variable path argument must be a string.
    with self.assertRaisesRegex(ValueError,
                                "The variable path must be a string."):
        option._validate_helper(1)

    # Break every text.is_enabled option with a non-boolean value.
    for key in self.keys:
        option.set({'{}.text.is_enabled'.format(key): "Hello World"})
    expected_error = {
        '{}.{}.text.is_enabled must be a '
        'Boolean.'.format(optpth, key) for key in self.keys
    }

    # The expected errors must be a subset of all reported errors.
    self.assertSetEqual(
        expected_error,
        expected_error.intersection(set(option._validate_helper())))

    # Wrong class types for the nested option groups.
    option = self.get_options()
    option.structured_options = ProfilerOptions()
    option.unstructured_options = ProfilerOptions()
    expected_error = [
        '{}.structured_options must be a StructuredOptions.'.format(
            optpth),
        '{}.unstructured_options must be an UnstructuredOptions.'.format(
            optpth)
    ]
    self.assertEqual(expected_error, option._validate_helper())
def test_validate(self):
    """validate() on the top-level options: None when valid, otherwise
    raises or returns the list of errors per ``raise_error``."""
    option = self.get_options()
    optpth = self.get_options_path()

    # A default configuration is valid.
    self.assertEqual(None, option.validate())

    # Break every structured option's is_enabled with a non-boolean.
    for key in option.structured_options.properties:
        option.set(
            {'structured_options.{}.is_enabled'.format(key): "Hello World"})
    expected_error = {
        '{}.structured_options.{}.is_enabled must be a '
        'Boolean.'.format(optpth, key)
        for key in option.structured_options.properties
    }

    # The expected errors must appear among those raised...
    with self.assertRaises(ValueError) as cm:
        option.validate(raise_error=True)
    raised_error = set(str(cm.exception).split("\n"))
    self.assertSetEqual(expected_error,
                        expected_error.intersection(raised_error))
    # ...and among those returned without raising.
    returned_errors = set(option.validate(raise_error=False))
    self.assertSetEqual(expected_error,
                        expected_error.intersection(returned_errors))

    # Wrong class type for structured_options.
    option = self.get_options()
    option.structured_options = ProfilerOptions()
    expected_error = \
        '{}.structured_options must be a StructuredOptions.'.format(optpth)
    with self.assertRaisesRegex(ValueError, expected_error):
        option.validate()
    self.assertEqual([expected_error],
                     option.validate(raise_error=False))
def get_options(self, **params):
    """Return a ProfilerOptions instance configured from ``params``."""
    opts = ProfilerOptions()
    opts.set(params)
    return opts
def test_null_row_stats_correct_after_updates(self, *mocks):
    """Row null counts, ratios, and per-column null index sets remain
    correct across multiple profile updates — both when sampling
    parameters are set explicitly and when defaults are used."""
    data1 = pd.DataFrame([[1, None], [1, 1], [None, None], [None, 1]])
    data2 = pd.DataFrame([[None, None], [1, None], [None, None], [None, 1]])
    opts = ProfilerOptions()
    opts.structured_options.multiprocess.is_enabled = False

    # When setting min true samples/samples per update
    profile = dp.Profiler(data1, min_true_samples=2, samples_per_update=2,
                          profiler_options=opts)
    # data1: rows 0, 2, 3 contain a null; row 2 is entirely null.
    self.assertEqual(3, profile.row_has_null_count)
    self.assertEqual(1, profile.row_is_null_count)
    self.assertEqual(0.75, profile._get_row_has_null_ratio())
    self.assertEqual(0.25, profile._get_row_is_null_ratio())
    self.assertEqual(4, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2},
                        profile._profile[1].null_types_index['nan'])

    # Update with data2; its rows are indexed 4-7 in the profile.
    profile.update_profile(data2, min_true_samples=2, sample_size=2)
    self.assertEqual(7, profile.row_has_null_count)
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(0.875, profile._get_row_has_null_ratio())
    self.assertEqual(0.375, profile._get_row_is_null_ratio())
    self.assertEqual(4, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3, 4, 6, 7},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2, 4, 5, 6},
                        profile._profile[1].null_types_index['nan'])

    # When not setting min true samples/samples per update
    opts = ProfilerOptions()
    opts.structured_options.multiprocess.is_enabled = False
    profile = dp.Profiler(data1, profiler_options=opts)
    self.assertEqual(3, profile.row_has_null_count)
    self.assertEqual(1, profile.row_is_null_count)
    self.assertEqual(0.75, profile._get_row_has_null_ratio())
    self.assertEqual(0.25, profile._get_row_is_null_ratio())
    self.assertEqual(4, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2},
                        profile._profile[1].null_types_index['nan'])

    profile.update_profile(data2)
    self.assertEqual(7, profile.row_has_null_count)
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(0.875, profile._get_row_has_null_ratio())
    self.assertEqual(0.375, profile._get_row_is_null_ratio())
    self.assertEqual(4, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3, 4, 6, 7},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2, 4, 5, 6},
                        profile._profile[1].null_types_index['nan'])

    # Test that update with empty data doesn't change stats
    profile.update_profile(pd.DataFrame([]))
    self.assertEqual(7, profile.row_has_null_count)
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(0.875, profile._get_row_has_null_ratio())
    self.assertEqual(0.375, profile._get_row_is_null_ratio())
    # An empty batch means nothing was sampled from it.
    self.assertEqual(0, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3, 4, 6, 7},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2, 4, 5, 6},
                        profile._profile[1].null_types_index['nan'])

    # Test one row update
    profile.update_profile(pd.DataFrame([[1, None]]))
    self.assertEqual(8, profile.row_has_null_count)
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(8 / 9, profile._get_row_has_null_ratio())
    self.assertEqual(3 / 9, profile._get_row_is_null_ratio())
    self.assertEqual(1, profile._min_sampled_from_batch)
    self.assertSetEqual({2, 3, 4, 6, 7},
                        profile._profile[0].null_types_index['nan'])
    self.assertSetEqual({0, 2, 4, 5, 6},
                        profile._profile[1].null_types_index['nan'])
    # Weird pandas behavior makes this None since this column will be
    # recognized as object, not float64
    self.assertSetEqual({8},
                        profile._profile[1].null_types_index['None'])