Exemple #1
0
    def test_data_labeler(self, *mocks):
        """Check that data labeler options reach the column-level labeler.

        The labeler is first configured through a directory path and a
        maximum sample size; afterwards a pre-built labeler object is
        supplied and is expected to be the one the column profiler holds.
        """
        # Mock[0] is the Datalabeler Object mock
        labeler_mock = mocks[0]

        options = ProfilerOptions()
        labeler_options = options.structured_options.data_labeler
        labeler_options.data_labeler_dirpath = "Test_Dirpath"
        labeler_options.max_sample_size = 50
        options.structured_options.multiprocess.is_enabled = False

        profile = Profiler(self.data, profiler_options=options)

        labeler_mock.assert_called_with(
            dirpath='Test_Dirpath',
            labeler_type='structured',
            load_options=None,
        )
        # profile -> column profile -> compiler -> labeler column profiler
        labeler_column = (
            profile._profile[0].profiles['data_label_profile']
            ._profiles["data_labeler"]
        )
        self.assertEqual(labeler_column._max_sample_size, 50)

        # Supply a ready-made labeler object on top of the directory path.
        custom_labeler = mock.Mock(spec=BaseDataLabeler)
        custom_labeler.reverse_label_mapping = dict()
        custom_labeler.model.num_labels = 0
        options.set({'data_labeler.data_labeler_object': custom_labeler})

        expected_warning = ("The data labeler passed in will be used,"
                            " not through the directory of the default"
                            " model")
        with self.assertWarnsRegex(UserWarning, expected_warning):
            options.validate()

        profile = Profiler(self.data, profiler_options=options)
        labeler_column = (
            profile._profile[0].profiles['data_label_profile']
            ._profiles["data_labeler"]
        )
        self.assertEqual(custom_labeler, labeler_column.data_labeler)
Exemple #2
0
    def test_default_profiler_options(self, *mocks):
        """Check the options a Profiler builds when none are supplied.

        Every top-level option is expected to be enabled, except
        ``correlation`` (disabled) and ``null_values`` (``None``). For the
        int, float and text column options, each statistic's ``is_enabled``
        flag is checked directly: asserting on the option object itself
        would presumably always pass, since an object is truthy unless it
        defines its own truthiness.
        """
        # Allowing Profiler to create default options
        profile = Profiler(self.data)
        self.assertIsNotNone(profile.options)
        self.assertTrue(profile.options.data_labeler.is_enabled)
        for column in profile.options.properties:
            # TODO: remove the check for correlation option once it's updated to True
            if column == 'correlation':
                self.assertFalse(profile.options.properties[column].is_enabled)
            elif column == 'null_values':
                self.assertIsNone(profile.options.properties[column])
            else:
                self.assertTrue(profile.options.properties[column].is_enabled)

        for column_type in ["int", "float", "text"]:
            column = profile.options.properties[column_type]
            for stat in ["histogram_and_quantiles", "min", "max", "sum",
                         "variance"]:
                self.assertTrue(column.properties[stat].is_enabled)
            # NOTE(review): this entry is asserted by truthiness as before;
            # it looks like a plain flag rather than an option object.
            self.assertTrue(column.properties["is_numeric_stats_enabled"])
            if column_type != "text":
                self.assertTrue(column.properties["num_zeros"].is_enabled)
                self.assertTrue(column.properties["num_negatives"].is_enabled)
            else:
                self.assertFalse(column.properties["num_zeros"].is_enabled)
                self.assertFalse(column.properties["num_negatives"].is_enabled)
        # Using ProfilerOptions with the default options
        options = ProfilerOptions()
        profile2 = Profiler(self.data, options=options)
        # Stored in Profiler as StructuredOptions
        self.assertEqual(profile2.options, options.structured_options)
Exemple #3
0
    def test_float_precision(self, update_precision, *mocks):
        """Verify that the float precision option gates precision updates.

        ``update_precision`` is a mock passed in by the test harness
        (presumably from a patch decorator outside this view).
        """
        precision_off = ProfilerOptions()
        precision_off.structured_options.float.precision.is_enabled = False

        # Precision disabled: the mocked updater must stay untouched.
        Profiler(self.data, profiler_options=precision_off)
        update_precision.assert_not_called()

        # No options given: the mocked updater is expected to be invoked.
        Profiler(self.data)
        update_precision.assert_called()
Exemple #4
0
    def test_disabling_vocab(self, vocab_mock, *mocks):
        """Verify that the text vocab option controls vocab updates.

        ``vocab_mock`` is a mock passed in by the test harness (presumably
        from a patch decorator outside this view).
        """
        vocab_off = ProfilerOptions()
        vocab_off.structured_options.text.vocab.is_enabled = False

        # Check to see disabling vocab prevents vocab from updating
        Profiler(self.data, profiler_options=vocab_off)
        vocab_mock.assert_not_called()

        # Check to see default options enable vocab
        Profiler(self.data)
        vocab_mock.assert_called()
    def test_numerical_stats_option(self, *mocks):
        """Toggle ``is_numeric_stats_enabled`` and check reported statistics.

        With numeric stats (and bias correction) disabled, histogram, min,
        max and quantiles must be ``None`` and variance, skewness and
        kurtosis must be NaN. With them re-enabled, the values must be
        populated; skewness and kurtosis are still expected to be NaN for
        this fixture.

        NaN is always tested with ``np.isnan`` rather than ``is np.nan``:
        an identity check only passes for the exact ``np.nan`` singleton and
        would fail for any other NaN float object.
        """
        # Assert that the stats are disabled
        options = ProfilerOptions()
        options.set(
            {"*.is_numeric_stats_enabled": False, "bias_correction.is_enabled": False}
        )
        profile = Profiler(self.data, options=options)

        for col_profiler in profile.profile:
            profile_column = col_profiler.profile
            # Only columns that report a non-empty histogram are checked.
            if (
                profile_column["statistics"]
                and "histogram" in profile_column["statistics"].keys()
                and profile_column["statistics"]["histogram"]
            ):
                self.assertIsNone(
                    profile_column["statistics"]["histogram"]["bin_counts"]
                )
                self.assertIsNone(
                    profile_column["statistics"]["histogram"]["bin_edges"]
                )
                self.assertIsNone(profile_column["statistics"]["min"])
                self.assertIsNone(profile_column["statistics"]["max"])
                self.assertTrue(np.isnan(profile_column["statistics"]["variance"]))
                self.assertIsNone(profile_column["statistics"]["quantiles"][0])
                self.assertTrue(np.isnan(profile_column["statistics"]["skewness"]))
                self.assertTrue(np.isnan(profile_column["statistics"]["kurtosis"]))

        # Assert that the stats are enabled
        options.set(
            {"*.is_numeric_stats_enabled": True, "bias_correction.is_enabled": True}
        )
        profile = Profiler(self.data, options=options)

        for col_profiler in profile.profile:
            profile_column = col_profiler.profile
            if (
                profile_column["statistics"]
                and "histogram" in profile_column["statistics"].keys()
                and profile_column["statistics"]["histogram"]
            ):
                self.assertIsNotNone(
                    profile_column["statistics"]["histogram"]["bin_counts"]
                )
                self.assertIsNotNone(
                    profile_column["statistics"]["histogram"]["bin_edges"]
                )
                self.assertIsNotNone(profile_column["statistics"]["min"])
                self.assertIsNotNone(profile_column["statistics"]["max"])
                self.assertEqual(0.5, profile_column["statistics"]["variance"])
                self.assertIsNotNone(profile_column["statistics"]["quantiles"][0])
                self.assertTrue(np.isnan(profile_column["statistics"]["skewness"]))
                self.assertTrue(np.isnan(profile_column["statistics"]["kurtosis"]))
Exemple #6
0
 def test_invalid_options_type(self, *mocks):
     """Check that options of the wrong class are rejected with ValueError.

     An ``IntOptions`` instance is assigned where data labeler options and
     float options are expected; constructing the Profiler must then raise
     a ``ValueError`` whose message matches the given pattern.

     The patterns are raw strings so that ``\(`` reaches the regex engine
     as an escaped parenthesis instead of being an invalid string escape.
     """
     # Test incorrect data labeler options
     options = ProfilerOptions()
     options.structured_options.data_labeler = IntOptions()
     with self.assertRaisesRegex(
             ValueError, r"data_labeler must be a\(n\) DataLabelerOptions."):
         Profiler(self.data, profiler_options=options)
     # Test incorrect float options
     options = ProfilerOptions()
     options.structured_options.float = IntOptions()
     with self.assertRaisesRegex(ValueError,
                                 r"float must be a\(n\) FloatOptions."):
         Profiler(self.data, profiler_options=options)
Exemple #7
0
    def test_improper_profile_options(self, *mocks):
        """Check that malformed profiler options raise ValueError.

        Two cases are covered: passing a plain string as the options, and
        passing options in which an ``is_enabled`` flag holds a string.
        """
        wrong_type_message = ("The profile options must be passed as a "
                              "ProfileOptions object.")
        with self.assertRaisesRegex(ValueError, wrong_type_message):
            Profiler(self.data, profiler_options="Strings are not accepted")

        non_boolean_message = ("ProfilerOptions.structured_options.text.max."
                               "is_enabled must be a Boolean.")
        with self.assertRaisesRegex(ValueError, non_boolean_message):
            bad_options = ProfilerOptions()
            bad_options.structured_options.text.max.is_enabled = "String"
            Profiler(self.data, profiler_options=bad_options)
Exemple #8
0
 def test_invalid_options_type(self, *mocks):
     """Check that options of the wrong class are rejected with ValueError.

     ``IntOptions`` is substituted for the data labeler options and then
     for the float options; each Profiler construction must raise.
     """
     # Test incorrect data labeler options
     labeler_case = ProfilerOptions()
     labeler_case.structured_options.data_labeler = IntOptions()
     labeler_message = ("DataLabelerColumn parameter 'options' must be of "
                        "type DataLabelerOptions.")
     with self.assertRaisesRegex(ValueError, labeler_message):
         Profiler(self.data, profiler_options=labeler_case)

     # Test incorrect float options
     float_case = ProfilerOptions()
     float_case.structured_options.float = IntOptions()
     float_message = ("FloatColumn parameter 'options' must be of type "
                      "FloatOptions.")
     with self.assertRaisesRegex(ValueError, float_message):
         Profiler(self.data, profiler_options=float_case)
Exemple #9
0
    def test_no_tensorflow(self):
        """Check that profiling warns when tensorflow cannot be imported.

        ``builtins.__import__`` is patched so that importing ``tensorflow``
        raises ``ImportError``; every other import is delegated to the real
        import function. Profiling a small DataFrame must then emit a
        ``RuntimeWarning`` matching "Partial Profiler Failure".
        """
        import sys
        import pandas
        real_import = __import__

        def fake_import(name, *args, **kwargs):
            # Delegate everything except the module under test.
            if name != 'tensorflow':
                return real_import(name, *args, **kwargs)
            raise ImportError('test')

        tf_dependent_modules = [
            'dataprofiler.labelers.character_level_cnn_model',
        ]

        with mock.patch('builtins.__import__', side_effect=fake_import):
            with self.assertWarnsRegex(RuntimeWarning,
                                       "Partial Profiler Failure"):
                # Drop any cached copy so the import is attempted again
                # while the patch is active.
                for module_name in tf_dependent_modules:
                    sys.modules.pop(module_name, None)
                frame = pandas.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
                Profiler(frame)
 def test_disabling_all_columns(self, *mocks):
     """Disable every column-level option and check what is still reported.

     With all listed options off, each column must report no data type,
     no data label, no categorical or order information, and only the
     sample and null bookkeeping under ``statistics``.
     """
     options = ProfilerOptions()
     structured = options.structured_options
     for option_name in ("text", "float", "int", "datetime", "order",
                         "category", "chi2_homogeneity", "data_labeler"):
         getattr(structured, option_name).is_enabled = False

     expected_statistics = {
         "sample_size": 2,
         "null_count": 0,
         "null_types": [],
         "null_types_index": {},
     }

     profile = Profiler(self.data, options=options)
     for col_profiler in profile.profile:
         report = col_profiler.profile
         self.assertIsNone(report["data_type"])
         self.assertTrue("data_label" not in report.keys())
         self.assertIsNone(report["categorical"])
         self.assertIsNone(report["order"])
         self.assertDictEqual(expected_statistics, report["statistics"])
Exemple #11
0
 def test_disable_labeler_in_profiler_options(self, *mocks):
     """Check that a disabled data labeler reports no label probability.

     For every column whose statistics contain a
     ``data_label_probability`` entry, that entry must be ``None``.
     """
     options = ProfilerOptions()
     # The flag is ``is_enabled``. Assigning to ``enable`` would presumably
     # just create an unrelated attribute and leave the labeler switched
     # on, making this test pass without exercising anything.
     options.structured_options.data_labeler.is_enabled = False
     profile = Profiler(self.data, profiler_options=options)
     for column_name in profile.profile.keys():
         profile_column = profile.profile[column_name].profile
         if profile_column["statistics"] \
                 and "data_label_probability" in \
                 profile_column["statistics"].keys():
             self.assertIsNone(
                 profile_column["statistics"]["data_label_probability"])
Exemple #12
0
    def test_disabling_all_stats(self, *mocks):
        """Disable every numeric statistic and check nothing is computed.

        The options are first verified on the text, float and int option
        objects, then a Profiler is run and each column that reports a
        non-empty histogram must show ``None`` or NaN for all statistics.

        NaN is always tested with ``np.isnan`` rather than ``is np.nan``:
        an identity check only passes for the exact ``np.nan`` singleton and
        would fail for any other NaN float object.
        """
        options = ProfilerOptions()
        statistical_options = {
            "histogram_and_quantiles.is_enabled": False,
            "min.is_enabled": False,
            "max.is_enabled": False,
            "mode.is_enabled": False,
            "median.is_enabled": False,
            "sum.is_enabled": False,
            "variance.is_enabled": False,
            "skewness.is_enabled": False,
            "kurtosis.is_enabled": False,
            "num_zeros.is_enabled": False,
            "num_negatives.is_enabled": False,
            "median_abs_deviation.is_enabled": False
        }
        options.set(statistical_options)

        # Assert the numerics are properly set
        text_options = options.structured_options.text.properties
        float_options = options.structured_options.float.properties
        int_options = options.structured_options.int.properties
        for option in ["histogram_and_quantiles", "min", "max", "sum",
                       "mode", "variance", "skewness", "kurtosis",
                       "median_abs_deviation",
                       "num_zeros", "num_negatives"]:
            self.assertFalse(text_options[option].is_enabled)
            self.assertFalse(float_options[option].is_enabled)
            self.assertFalse(int_options[option].is_enabled)

        # Run the profiler
        profile = Profiler(self.data, options=options)

        # Assert that the stats are non-existent
        for col_profiler in profile.profile:
            profile_column = col_profiler.profile
            if profile_column["statistics"] \
                    and "histogram" in profile_column["statistics"].keys() \
                    and profile_column["statistics"]["histogram"]:
                self.assertIsNone(
                    profile_column["statistics"]["histogram"]["bin_counts"])
                self.assertIsNone(
                    profile_column["statistics"]["histogram"]["bin_edges"])
                self.assertIsNone(profile_column["statistics"]["min"])
                self.assertIsNone(profile_column["statistics"]["max"])
                self.assertTrue(np.isnan(profile_column["statistics"]["variance"]))
                self.assertIsNone(profile_column["statistics"]["quantiles"][0])
                self.assertTrue(np.isnan(profile_column["statistics"]["skewness"]))
                self.assertTrue(np.isnan(profile_column["statistics"]["kurtosis"]))
                self.assertTrue(
                    np.isnan(profile_column["statistics"]["median_abs_deviation"]))
                self.assertTrue(np.isnan(profile_column["statistics"]["mode"]))
                self.assertTrue(np.isnan(profile_column["statistics"]["median"]))
Exemple #13
0
    def test_numerical_stats_option(self, *mocks):
        """Toggle ``is_numeric_stats_enabled`` and check reported statistics.

        Only columns whose statistics include a non-empty histogram are
        inspected. Disabled: histogram, min, max and first quantile are
        ``None`` and variance is 0. Enabled: those values are populated and
        variance is non-zero.
        """
        options = ProfilerOptions()

        # Assert that the stats are disabled
        options.set({"is_numeric_stats_enabled": False})
        profile = Profiler(self.data, profiler_options=options)

        for column in profile.profile.values():
            stats = column.profile["statistics"]
            if not (stats and "histogram" in stats.keys()
                    and stats["histogram"]):
                continue
            self.assertIsNone(stats["histogram"]["bin_counts"])
            self.assertIsNone(stats["histogram"]["bin_edges"])
            self.assertIsNone(stats["min"])
            self.assertIsNone(stats["max"])
            self.assertEqual(0, stats["variance"])
            self.assertIsNone(stats["quantiles"][0])

        # Assert that the stats are enabled
        options.set({"is_numeric_stats_enabled": True})
        profile = Profiler(self.data, profiler_options=options)

        for column in profile.profile.values():
            stats = column.profile["statistics"]
            if not (stats and "histogram" in stats.keys()
                    and stats["histogram"]):
                continue
            self.assertIsNotNone(stats["histogram"]["bin_counts"])
            self.assertIsNotNone(stats["histogram"]["bin_edges"])
            self.assertIsNotNone(stats["min"])
            self.assertIsNotNone(stats["max"])
            self.assertNotEqual(0, stats["variance"])
            self.assertIsNotNone(stats["quantiles"][0])
Exemple #14
0
    def test_default_profiler_options(self, *mocks):
        """Check the options a Profiler builds when none are supplied.

        Every structured option is expected to be enabled. For the int,
        float and text column options, each statistic's ``is_enabled`` flag
        is checked directly: asserting on the option object itself would
        presumably always pass, since an object is truthy unless it defines
        its own truthiness.
        """
        # Allowing Profiler to create default options
        profile = Profiler(self.data)
        self.assertIsNotNone(profile.options)
        self.assertTrue(
            profile.options.structured_options.data_labeler.is_enabled)
        for column in profile.options.structured_options.properties:
            self.assertTrue(profile.options.structured_options.
                            properties[column].is_enabled)

        # A separate name is used for the looked-up options so the loop
        # variable is not rebound inside its own loop.
        for column_type in ["int", "float", "text"]:
            column = profile.options.structured_options.properties[column_type]
            for stat in ["histogram_and_quantiles", "min", "max", "sum",
                         "variance"]:
                self.assertTrue(column.properties[stat].is_enabled)
            # NOTE(review): this entry is asserted by truthiness as before;
            # it looks like a plain flag rather than an option object.
            self.assertTrue(column.properties["is_numeric_stats_enabled"])

        # Using ProfilerOptions with the default options
        options = ProfilerOptions()
        profile2 = Profiler(self.data, profiler_options=options)
        self.assertEqual(profile2.options, options)
Exemple #15
0
    def test_data_labeler(self, *mocks):
        """Check that labeler directory and sample size options are applied.

        The mocked labeler must be constructed with the configured directory
        path, and the column-level labeler profiler must carry the
        configured maximum sample size.
        """
        # Mock[0] is the Datalabeler Object mock
        labeler_mock = mocks[0]

        options = ProfilerOptions()
        labeler_options = options.structured_options.data_labeler
        labeler_options.data_labeler_dirpath = "Test_Dirpath"
        labeler_options.max_sample_size = 50

        profile = Profiler(self.data, profiler_options=options)

        labeler_mock.assert_called_with(
            dirpath='Test_Dirpath',
            labeler_type='structured',
            load_options=None,
        )
        labeler_column = (
            profile._profile[0].profiles['data_label_profile']
            ._profiles["data_labeler"]
        )
        self.assertEqual(labeler_column._max_sample_size, 50)
Exemple #16
0
    def test_validate(self, *mocks):
        """Check that several invalid options are reported in one ValueError.

        Three option values of the wrong type are set; constructing the
        Profiler must raise a ``ValueError`` whose message matches the three
        expected lines joined by newlines.
        """
        options = ProfilerOptions()
        structured = options.structured_options

        structured.data_labeler.is_enabled = "Invalid"
        structured.data_labeler.data_labeler_dirpath = 5
        structured.int.max = "Invalid"

        expected_error = "\n".join([
            "ProfilerOptions.structured_options.int.max must be a "
            "BooleanOption.",
            "ProfilerOptions.structured_options.data_labeler.is_enabled must be"
            " a Boolean.",
            "ProfilerOptions.structured_options.data_labeler."
            "data_labeler_dirpath must be a string.",
        ])
        with self.assertRaisesRegex(ValueError, expected_error):
            Profiler(self.data, profiler_options=options)
Exemple #17
0
 def test_disabling_all_columns(self, *mocks):
     """Disable every column-level option and check what is still reported.

     With all listed options off, each column must report no data type,
     no data label, no categorical or order information, and an empty
     ``statistics`` dictionary.
     """
     options = ProfilerOptions()
     structured = options.structured_options
     for option_name in ("text", "float", "int", "datetime", "order",
                         "category", "data_labeler"):
         getattr(structured, option_name).is_enabled = False

     profile = Profiler(self.data, profiler_options=options)
     for column in profile.profile.values():
         report = column.profile
         self.assertIsNone(report["data_type"])
         self.assertTrue("data_label" not in report.keys())
         self.assertIsNone(report["categorical"])
         self.assertIsNone(report["order"])
         self.assertDictEqual({}, report["statistics"])
Exemple #18
0
    def test_disabling_all_stats(self, *mocks):
        """Disable the numeric statistics and check nothing is computed.

        The flags are first verified on the text, float and int option
        objects; then a Profiler is run and every column reporting a
        non-empty histogram must show ``None`` values and a variance of 0.
        """
        stat_names = [
            "histogram_and_quantiles", "min", "max", "sum", "variance"
        ]

        options = ProfilerOptions()
        options.set({
            "histogram_and_quantiles.is_enabled": False,
            "min.is_enabled": False,
            "max.is_enabled": False,
            "sum.is_enabled": False,
            "variance.is_enabled": False
        })

        # Assert the numerics are properly set
        structured = options.structured_options
        text_options = structured.text.properties
        float_options = structured.float.properties
        int_options = structured.int.properties
        for stat in stat_names:
            self.assertFalse(text_options[stat].is_enabled)
            self.assertFalse(float_options[stat].is_enabled)
            self.assertFalse(int_options[stat].is_enabled)

        # Run the profiler
        profile = Profiler(self.data, profiler_options=options)

        # Assert that the stats are non-existent
        for column in profile.profile.values():
            stats = column.profile["statistics"]
            if not (stats and "histogram" in stats.keys()
                    and stats["histogram"]):
                continue
            self.assertIsNone(stats["histogram"]["bin_counts"])
            self.assertIsNone(stats["histogram"]["bin_edges"])
            self.assertIsNone(stats["min"])
            self.assertIsNone(stats["max"])
            self.assertEqual(0, stats["variance"])
            self.assertIsNone(stats["quantiles"][0])
Exemple #19
0
 def test_data_profiling(self):
     """Profile each configured input file and check results are produced.

     For every entry in ``self.input_file_names`` the file at its ``path``
     is loaded, profiled, and both the profile and the report must be
     non-``None``.
     """
     for input_file in self.input_file_names:
         loaded = Data(input_file['path'])
         profile = Profiler(loaded)
         self.assertIsNotNone(profile.profile)
         self.assertIsNotNone(profile.report())