def test_data_labeler(self, *mocks):
    """Check that data labeler options are applied by the Profiler.

    A labeler directory path and a maximum sample size are configured
    first and verified on the built profile. An explicit labeler object is
    then supplied through the options, and the profile is expected to hold
    that exact object.

    :param mocks: mocks injected by patch decorators outside this view;
        mocks[0] is the DataLabeler object mock.
    """
    opts = ProfilerOptions()
    opts.structured_options.data_labeler.data_labeler_dirpath = "Test_Dirpath"
    opts.structured_options.data_labeler.max_sample_size = 50
    opts.structured_options.multiprocess.is_enabled = False

    profiler = Profiler(self.data, profiler_options=opts)

    # mocks[0] is the DataLabeler object mock; it must have been invoked
    # with the configured directory path.
    labeler_mock = mocks[0]
    labeler_mock.assert_called_with(
        dirpath='Test_Dirpath',
        labeler_type='structured',
        load_options=None,
    )

    # first column profile -> label compiler -> label column profiler
    compiler = profiler._profile[0].profiles['data_label_profile']
    column_profiler = compiler._profiles["data_labeler"]
    self.assertEqual(column_profiler._max_sample_size, 50)

    # Hand the options a ready-made labeler object instead of a directory.
    custom_labeler = mock.Mock(spec=BaseDataLabeler)
    custom_labeler.reverse_label_mapping = {}
    custom_labeler.model.num_labels = 0
    opts.set({'data_labeler.data_labeler_object': custom_labeler})

    expected_warning = ("The data labeler passed in will be used,"
                        " not through the directory of the default"
                        " model")
    with self.assertWarnsRegex(UserWarning, expected_warning):
        opts.validate()

    profiler = Profiler(self.data, profiler_options=opts)
    compiler = profiler._profile[0].profiles['data_label_profile']
    column_profiler = compiler._profiles["data_labeler"]
    self.assertEqual(custom_labeler, column_profiler.data_labeler)
def test_default_profiler_options(self, *mocks):
    """Check the options a Profiler ends up with when defaults are used.

    Covers both a Profiler that creates its own options and one that is
    handed a freshly constructed ProfilerOptions.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    # Let the Profiler create its own default options.
    default_profiler = Profiler(self.data)
    self.assertIsNotNone(default_profiler.options)
    self.assertTrue(default_profiler.options.data_labeler.is_enabled)

    for name in default_profiler.options.properties:
        option = default_profiler.options.properties[name]
        # TODO: remove the check for correlation option once it's updated
        # to True
        if name == 'correlation':
            self.assertFalse(option.is_enabled)
            continue
        if name == 'null_values':
            self.assertIsNone(option)
            continue
        self.assertTrue(option.is_enabled)

    always_present = (
        "histogram_and_quantiles",
        "min",
        "max",
        "sum",
        "variance",
        "is_numeric_stats_enabled",
    )
    sign_counters = ("num_zeros", "num_negatives")
    for column_type in ["int", "float", "text"]:
        type_options = default_profiler.options.properties[column_type]
        for key in always_present:
            self.assertTrue(type_options.properties[key])

        # The zero/negative counters are expected on for the numeric
        # types and off for text.
        if column_type != "text":
            expect = self.assertTrue
        else:
            expect = self.assertFalse
        for key in sign_counters:
            expect(type_options.properties[key].is_enabled)

    # Using ProfilerOptions with the default options
    explicit_options = ProfilerOptions()
    explicit_profiler = Profiler(self.data, options=explicit_options)
    # Stored in Profiler as StructuredOptions
    self.assertEqual(
        explicit_profiler.options, explicit_options.structured_options)
def test_float_precision(self, update_precision, *mocks):
    """Check that the float precision option gates the precision update.

    :param update_precision: mock injected by a patch decorator outside
        this view (presumably the float precision updater - confirm).
    :param mocks: remaining injected mocks, unused here.
    """
    # With precision disabled the mock must never be invoked.
    precision_off = ProfilerOptions()
    precision_off.structured_options.float.precision.is_enabled = False
    profiler = Profiler(self.data, profiler_options=precision_off)
    update_precision.assert_not_called()

    # With default options the mock must be invoked.
    profiler = Profiler(self.data)
    update_precision.assert_called()
def test_disabling_vocab(self, vocab_mock, *mocks):
    """Check that the text vocab option gates vocab updates.

    :param vocab_mock: mock injected by a patch decorator outside this
        view (presumably the vocab updater - confirm).
    :param mocks: remaining injected mocks, unused here.
    """
    # Disabling vocab must prevent the vocab mock from being called.
    vocab_off = ProfilerOptions()
    vocab_off.structured_options.text.vocab.is_enabled = False
    profiler = Profiler(self.data, profiler_options=vocab_off)
    vocab_mock.assert_not_called()

    # Default options leave vocab enabled, so the mock must be called.
    profiler = Profiler(self.data)
    vocab_mock.assert_called()
def test_numerical_stats_option(self, *mocks):
    """Check that ``is_numeric_stats_enabled`` toggles the numeric stats.

    The profiler is built twice from the same options object: once with
    numeric stats and bias correction disabled, once with both enabled.
    Only columns whose statistics contain a non-empty histogram entry are
    inspected.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    # Assert that the stats are disabled
    options = ProfilerOptions()
    options.set(
        {
            "*.is_numeric_stats_enabled": False,
            "bias_correction.is_enabled": False,
        }
    )
    profile = Profiler(self.data, options=options)

    for col_profiler in profile.profile:
        stats = col_profiler.profile["statistics"]
        if stats and "histogram" in stats.keys() and stats["histogram"]:
            self.assertIsNone(stats["histogram"]["bin_counts"])
            self.assertIsNone(stats["histogram"]["bin_edges"])
            self.assertIsNone(stats["min"])
            self.assertIsNone(stats["max"])
            self.assertTrue(np.isnan(stats["variance"]))
            self.assertIsNone(stats["quantiles"][0])
            self.assertTrue(np.isnan(stats["skewness"]))
            self.assertTrue(np.isnan(stats["kurtosis"]))

    # Assert that the stats are enabled
    options.set(
        {
            "*.is_numeric_stats_enabled": True,
            "bias_correction.is_enabled": True,
        }
    )
    profile = Profiler(self.data, options=options)

    for col_profiler in profile.profile:
        stats = col_profiler.profile["statistics"]
        if stats and "histogram" in stats.keys() and stats["histogram"]:
            self.assertIsNotNone(stats["histogram"]["bin_counts"])
            self.assertIsNotNone(stats["histogram"]["bin_edges"])
            self.assertIsNotNone(stats["min"])
            self.assertIsNotNone(stats["max"])
            self.assertEqual(0.5, stats["variance"])
            self.assertIsNotNone(stats["quantiles"][0])
            # NaN is checked by value, not identity: ``x is np.nan`` only
            # holds for the np.nan singleton and fails for any other NaN
            # float produced by a computation.
            self.assertTrue(np.isnan(stats["skewness"]))
            self.assertTrue(np.isnan(stats["kurtosis"]))
def test_invalid_options_type(self, *mocks):
    """Check that a wrongly typed option object is rejected.

    An IntOptions instance is placed where DataLabelerOptions and then
    FloatOptions are expected; building the Profiler must raise a
    ValueError whose message matches the given pattern.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    # Test incorrect data labeler options
    options = ProfilerOptions()
    options.structured_options.data_labeler = IntOptions()
    # Raw strings: ``\(`` is a regex escape, not a valid string escape.
    with self.assertRaisesRegex(
            ValueError,
            r"data_labeler must be a\(n\) DataLabelerOptions."):
        Profiler(self.data, profiler_options=options)

    # Test incorrect float options
    options = ProfilerOptions()
    options.structured_options.float = IntOptions()
    with self.assertRaisesRegex(
            ValueError, r"float must be a\(n\) FloatOptions."):
        Profiler(self.data, profiler_options=options)
def test_improper_profile_options(self, *mocks):
    """Check that malformed profiler options raise ValueError.

    Two cases: the options argument is not an options object at all, and
    an ``is_enabled`` flag is set to a non-boolean value.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    wrong_type_message = ("The profile options must be passed as a "
                          "ProfileOptions object.")
    with self.assertRaisesRegex(ValueError, wrong_type_message):
        profiler = Profiler(
            self.data, profiler_options="Strings are not accepted")

    non_boolean_message = ("ProfilerOptions.structured_options.text.max."
                           "is_enabled must be a Boolean.")
    with self.assertRaisesRegex(ValueError, non_boolean_message):
        # The options are built inside the context, so an error raised
        # while constructing or assigning them is captured as well.
        bad_options = ProfilerOptions()
        bad_options.structured_options.text.max.is_enabled = "String"
        profiler = Profiler(self.data, profiler_options=bad_options)
def test_invalid_options_type(self, *mocks):
    """Check that a wrongly typed option object is rejected.

    For each case an IntOptions instance replaces the expected options
    object, and building the Profiler must raise a ValueError matching
    the paired message.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    cases = (
        # incorrect data labeler options
        ("data_labeler",
         "DataLabelerColumn parameter 'options' must be of "
         "type DataLabelerOptions."),
        # incorrect float options
        ("float",
         "FloatColumn parameter 'options' must be of type "
         "FloatOptions."),
    )
    for attribute, expected_message in cases:
        bad_options = ProfilerOptions()
        setattr(bad_options.structured_options, attribute, IntOptions())
        with self.assertRaisesRegex(ValueError, expected_message):
            profiler = Profiler(self.data, profiler_options=bad_options)
def test_no_tensorflow(self):
    """Check that a Profiler built without tensorflow emits a warning.

    ``builtins.__import__`` is patched so that importing ``tensorflow``
    raises ImportError. Building a Profiler is then expected to emit a
    RuntimeWarning matching "Partial Profiler Failure".
    """
    import sys
    import pandas

    # Keep a handle on the genuine importer so that every import other
    # than tensorflow still works while the patch is active.
    real_import = __import__

    def fail_on_tensorflow(name, *args, **kwargs):
        if name == 'tensorflow':
            raise ImportError('test')
        return real_import(name, *args, **kwargs)

    tf_dependent_modules = [
        'dataprofiler.labelers.character_level_cnn_model',
    ]

    with mock.patch('builtins.__import__',
                    side_effect=fail_on_tensorflow), \
            self.assertWarnsRegex(RuntimeWarning,
                                  "Partial Profiler Failure"):
        # Drop already-loaded modules so their import runs again under
        # the patched importer.
        for module_name in tf_dependent_modules:
            sys.modules.pop(module_name, None)

        frame = pandas.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
        profiler = Profiler(frame)
def test_disabling_all_columns(self, *mocks):
    """Check the report produced when every structured option is off.

    With all listed options disabled, each column report must carry no
    data type, label, categorical or order information, and only the
    base sample/null statistics.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    options = ProfilerOptions()
    disabled_sections = (
        "text",
        "float",
        "int",
        "datetime",
        "order",
        "category",
        "chi2_homogeneity",
        "data_labeler",
    )
    for section in disabled_sections:
        getattr(options.structured_options, section).is_enabled = False

    profiler = Profiler(self.data, options=options)

    base_statistics = {
        "sample_size": 2,
        "null_count": 0,
        "null_types": [],
        "null_types_index": {},
    }
    for col_profiler in profiler.profile:
        report = col_profiler.profile
        self.assertIsNone(report["data_type"])
        self.assertFalse("data_label" in report.keys())
        self.assertIsNone(report["categorical"])
        self.assertIsNone(report["order"])
        self.assertDictEqual(base_statistics, report["statistics"])
def test_disable_labeler_in_profiler_options(self, *mocks):
    """Check that disabling the data labeler removes label probabilities.

    For every column whose statistics contain a ``data_label_probability``
    entry, that entry must be None once the labeler is disabled.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    options = ProfilerOptions()
    # The flag is ``is_enabled``; assigning to ``enable`` only creates an
    # unrelated attribute and leaves the labeler switched on.
    options.structured_options.data_labeler.is_enabled = False
    profile = Profiler(self.data, profiler_options=options)

    for column_name in profile.profile.keys():
        stats = profile.profile[column_name].profile["statistics"]
        if stats and "data_label_probability" in stats.keys():
            self.assertIsNone(stats["data_label_probability"])
def test_disabling_all_stats(self, *mocks):
    """Check that disabling every numeric statistic empties the report.

    The individual statistic flags are switched off through
    ``options.set``, verified on the text/float/int option objects, and
    then verified on the statistics of each profiled column that carries
    a non-empty histogram entry.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    options = ProfilerOptions()
    statistical_options = {
        "histogram_and_quantiles.is_enabled": False,
        "min.is_enabled": False,
        "max.is_enabled": False,
        "mode.is_enabled": False,
        "median.is_enabled": False,
        "sum.is_enabled": False,
        "variance.is_enabled": False,
        "skewness.is_enabled": False,
        "kurtosis.is_enabled": False,
        "num_zeros.is_enabled": False,
        "num_negatives.is_enabled": False,
        "median_abs_deviation.is_enabled": False,
    }
    options.set(statistical_options)

    # Assert the numerics are properly set
    text_options = options.structured_options.text.properties
    float_options = options.structured_options.float.properties
    int_options = options.structured_options.int.properties
    for option in ["histogram_and_quantiles", "min", "max", "sum", "mode",
                   "variance", "skewness", "kurtosis",
                   "median_abs_deviation", "num_zeros", "num_negatives"]:
        self.assertFalse(text_options[option].is_enabled)
        self.assertFalse(float_options[option].is_enabled)
        self.assertFalse(int_options[option].is_enabled)

    # Run the profiler
    profile = Profiler(self.data, options=options)

    # Assert that the stats are non-existent
    for col_profiler in profile.profile:
        stats = col_profiler.profile["statistics"]
        if stats and "histogram" in stats.keys() and stats["histogram"]:
            self.assertIsNone(stats["histogram"]["bin_counts"])
            self.assertIsNone(stats["histogram"]["bin_edges"])
            self.assertIsNone(stats["min"])
            self.assertIsNone(stats["max"])
            self.assertTrue(np.isnan(stats["variance"]))
            self.assertIsNone(stats["quantiles"][0])
            # NaN is checked by value, not identity: ``x is np.nan`` only
            # holds for the np.nan singleton and fails for any other NaN
            # float produced by a computation.
            self.assertTrue(np.isnan(stats["skewness"]))
            self.assertTrue(np.isnan(stats["kurtosis"]))
            self.assertTrue(np.isnan(stats["median_abs_deviation"]))
            self.assertTrue(np.isnan(stats["mode"]))
            self.assertTrue(np.isnan(stats["median"]))
def test_numerical_stats_option(self, *mocks):
    """Check that ``is_numeric_stats_enabled`` toggles the numeric stats.

    The profiler is built with the flag off and then on; only columns
    whose statistics hold a non-empty histogram entry are inspected.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    opts = ProfilerOptions()

    # Phase 1: numeric stats disabled.
    opts.set({"is_numeric_stats_enabled": False})
    profiler = Profiler(self.data, profiler_options=opts)
    for column_name in profiler.profile.keys():
        stats = profiler.profile[column_name].profile["statistics"]
        if not (stats
                and "histogram" in stats.keys()
                and stats["histogram"]):
            continue
        histogram = stats["histogram"]
        self.assertIsNone(histogram["bin_counts"])
        self.assertIsNone(histogram["bin_edges"])
        self.assertIsNone(stats["min"])
        self.assertIsNone(stats["max"])
        self.assertEqual(0, stats["variance"])
        self.assertIsNone(stats["quantiles"][0])

    # Phase 2: numeric stats enabled.
    opts.set({"is_numeric_stats_enabled": True})
    profiler = Profiler(self.data, profiler_options=opts)
    for column_name in profiler.profile.keys():
        stats = profiler.profile[column_name].profile["statistics"]
        if not (stats
                and "histogram" in stats.keys()
                and stats["histogram"]):
            continue
        histogram = stats["histogram"]
        self.assertIsNotNone(histogram["bin_counts"])
        self.assertIsNotNone(histogram["bin_edges"])
        self.assertIsNotNone(stats["min"])
        self.assertIsNotNone(stats["max"])
        self.assertNotEqual(0, stats["variance"])
        self.assertIsNotNone(stats["quantiles"][0])
def test_default_profiler_options(self, *mocks):
    """Check the options a Profiler ends up with when defaults are used.

    Covers both a Profiler that creates its own options and one that is
    handed a freshly constructed ProfilerOptions.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    # Let the Profiler create its own default options.
    default_profiler = Profiler(self.data)
    self.assertIsNotNone(default_profiler.options)
    self.assertTrue(
        default_profiler.options.structured_options.data_labeler.is_enabled)

    for name in default_profiler.options.structured_options.properties:
        option = default_profiler.options.structured_options.properties[name]
        self.assertTrue(option.is_enabled)

    expected_keys = (
        "histogram_and_quantiles",
        "min",
        "max",
        "sum",
        "variance",
        "is_numeric_stats_enabled",
    )
    for type_name in ["int", "float", "text"]:
        type_options = (
            default_profiler.options.structured_options.properties[type_name])
        for key in expected_keys:
            self.assertTrue(type_options.properties[key])

    # Using ProfilerOptions with the default options
    explicit_options = ProfilerOptions()
    explicit_profiler = Profiler(
        self.data, profiler_options=explicit_options)
    self.assertEqual(explicit_profiler.options, explicit_options)
def test_data_labeler(self, *mocks):
    """Check that labeler dirpath and sample size reach the profile.

    :param mocks: mocks injected by patch decorators outside this view;
        mocks[0] is the DataLabeler object mock.
    """
    opts = ProfilerOptions()
    opts.structured_options.data_labeler.data_labeler_dirpath = "Test_Dirpath"
    opts.structured_options.data_labeler.max_sample_size = 50

    profiler = Profiler(self.data, profiler_options=opts)

    # mocks[0] is the DataLabeler object mock; it must have been invoked
    # with the configured directory path.
    labeler_mock = mocks[0]
    labeler_mock.assert_called_with(
        dirpath='Test_Dirpath',
        labeler_type='structured',
        load_options=None,
    )

    # first column profile -> label compiler -> label column profiler
    compiler = profiler._profile[0].profiles['data_label_profile']
    column_profiler = compiler._profiles["data_labeler"]
    self.assertEqual(column_profiler._max_sample_size, 50)
def test_validate(self, *mocks):
    """Check that every invalid option is reported in a single ValueError.

    Three options are given invalid values and the raised message is
    expected to list all three, one per line.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    bad_options = ProfilerOptions()
    bad_options.structured_options.data_labeler.is_enabled = "Invalid"
    bad_options.structured_options.data_labeler.data_labeler_dirpath = 5
    bad_options.structured_options.int.max = "Invalid"

    expected_error = "\n".join([
        "ProfilerOptions.structured_options.int.max must be a "
        "BooleanOption.",
        "ProfilerOptions.structured_options.data_labeler.is_enabled "
        "must be a Boolean.",
        "ProfilerOptions.structured_options.data_labeler."
        "data_labeler_dirpath must be a string.",
    ])
    with self.assertRaisesRegex(ValueError, expected_error):
        profiler = Profiler(self.data, profiler_options=bad_options)
def test_disabling_all_columns(self, *mocks):
    """Check the report produced when every structured option is off.

    With all listed options disabled, each column report must carry no
    data type, label, categorical or order information, and an empty
    statistics dict.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    options = ProfilerOptions()
    disabled_sections = (
        "text",
        "float",
        "int",
        "datetime",
        "order",
        "category",
        "data_labeler",
    )
    for section in disabled_sections:
        getattr(options.structured_options, section).is_enabled = False

    profiler = Profiler(self.data, profiler_options=options)

    for column_name in profiler.profile.keys():
        report = profiler.profile[column_name].profile
        self.assertIsNone(report["data_type"])
        self.assertFalse("data_label" in report.keys())
        self.assertIsNone(report["categorical"])
        self.assertIsNone(report["order"])
        self.assertDictEqual({}, report["statistics"])
def test_disabling_all_stats(self, *mocks):
    """Check that disabling every numeric statistic empties the report.

    The statistic flags are switched off through ``options.set``,
    verified on the text/float/int option objects, and then verified on
    the statistics of each profiled column that carries a non-empty
    histogram entry.

    :param mocks: mocks injected by patch decorators outside this view.
    """
    opts = ProfilerOptions()
    opts.set({
        "histogram_and_quantiles.is_enabled": False,
        "min.is_enabled": False,
        "max.is_enabled": False,
        "sum.is_enabled": False,
        "variance.is_enabled": False,
    })

    # The flags must be off on each numeric-capable option group.
    text_props = opts.structured_options.text.properties
    float_props = opts.structured_options.float.properties
    int_props = opts.structured_options.int.properties
    stat_names = [
        "histogram_and_quantiles", "min", "max", "sum", "variance",
    ]
    for stat_name in stat_names:
        for props in (text_props, float_props, int_props):
            self.assertFalse(props[stat_name].is_enabled)

    # Run the profiler
    profiler = Profiler(self.data, profiler_options=opts)

    # The disabled stats must be absent from the report.
    for column_name in profiler.profile.keys():
        stats = profiler.profile[column_name].profile["statistics"]
        if not (stats
                and "histogram" in stats.keys()
                and stats["histogram"]):
            continue
        histogram = stats["histogram"]
        self.assertIsNone(histogram["bin_counts"])
        self.assertIsNone(histogram["bin_edges"])
        self.assertIsNone(stats["min"])
        self.assertIsNone(stats["max"])
        self.assertEqual(0, stats["variance"])
        self.assertIsNone(stats["quantiles"][0])
def test_data_profiling(self):
    """Check that each input file yields a profile and a report.

    Every entry of ``self.input_file_names`` is loaded through ``Data``
    using its 'path' key and then profiled.
    """
    for file_info in self.input_file_names:
        loaded = Data(file_info['path'])
        profiler = Profiler(loaded)
        self.assertIsNotNone(profiler.profile)
        self.assertIsNotNone(profiler.report())