def test_correct_rows_ingested(self):
        test_dict = {
            '1': ['nan', 'null', None, None, ''],
            1: ['nan', 'None', 'null', None, ''],
        }
        test_dataset = pd.DataFrame(data=test_dict)
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})
        trained_schema = dp.Profiler(test_dataset,
                                     len(test_dataset),
                                     profiler_options=profiler_options)

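        # Every value in both columns is a recognized null type ('nan',
        # 'null', 'None', None, ''), so all 5 rows are null in each column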
        self.assertCountEqual(['', 'nan', 'None', 'null'],
                              trained_schema.profile['1'].null_types)
        self.assertEqual(5, trained_schema.profile['1'].null_count)
        self.assertEqual({
            '': {4},
            'nan': {0},
            'None': {2, 3},
            'null': {1}
        }, trained_schema.profile['1'].null_types_index)
        self.assertCountEqual(['', 'nan', 'None', 'null'],
                              trained_schema.profile[1].null_types)
        self.assertEqual(5, trained_schema.profile[1].null_count)
        self.assertEqual({
            '': {4},
            'nan': {0},
            'None': {1, 3},
            'null': {2}
        }, trained_schema.profile[1].null_types_index)
    def test_correct_total_sample_size_and_counts_and_mutability(self):
        data = [['test1', 1.0], ['test2', 2.0], ['test3', 3.0], [None, None],
                ['test5', 5.0], ['test6', 6.0], [None, None], ['test7', 7.0]]
        data = pd.DataFrame(data, columns=['NAME', 'VALUE'])
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})

        col_one_len = len(data['NAME'])
        col_two_len = len(data['VALUE'])

        # Test reloading data, ensuring immutable
        for i in range(2):

            # Profile Once
            data.index = pd.RangeIndex(0, 8)
            profile = dp.Profiler(data,
                                  profiler_options=profiler_options,
                                  samples_per_update=2)

            # Profile Twice
            data.index = pd.RangeIndex(8, 16)
            profile.update_profile(data)

            # rows sampled are [5, 6], [13, 14] (0 index)
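            # 2 rows per pass over 2 passes -> 4 column samples used; the
            # fully-null rows at index 6 and 14 give the 0.5 ratios below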
            self.assertEqual(16, profile.total_samples)
            self.assertEqual(4, profile._max_col_samples_used)
            self.assertEqual(2, profile.row_has_null_count)
            self.assertEqual(0.5, profile._get_row_has_null_ratio())
            self.assertEqual(2, profile.row_is_null_count)
            self.assertEqual(0.5, profile._get_row_is_null_ratio())
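            # 8 source rows profiled twice -> 16 samples; the two
            # [None, None] rows are identical, so 7 rows are unique:
            # 7 / 16 = 0.4375 unique and 16 - 7 = 9 duplicates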
            self.assertEqual(0.4375, profile._get_unique_row_ratio())
            self.assertEqual(9, profile._get_duplicate_row_count())

        self.assertEqual(col_one_len, len(data['NAME']))
        self.assertEqual(col_two_len, len(data['VALUE']))
    def test_null_calculation_with_differently_sampled_cols(self):
        opts = ProfilerOptions()
        opts.structured_options.multiprocess.is_enabled = False
        data = pd.DataFrame({"full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                             "sparse": [1, None, 3, None, 5, None, 7, None, 9]})
        profile = dp.Profiler(data, samples_per_update=5, min_true_samples=5,
                              profiler_options=opts)
        # Rows 2, 4, 5, 6, 7 are sampled in the first column, so only those
        # rows are considered for the null calculations
        # Within that subset the second column is null only at rows 5 and 7,
        # so only 2 rows count toward row_has_null_count
        self.assertEqual(0, profile.row_is_null_count)
        self.assertEqual(2, profile.row_has_null_count)
        # The ratios use the 5 rows sampled in col 1 as the denominator
        # (5 rows sampled in col 1 vs. 9 in col 2): 0/5 = 0 and 2/5 = 0.4
        self.assertEqual(0, profile._get_row_is_null_ratio())
        self.assertEqual(0.4, profile._get_row_has_null_ratio())

        data2 = pd.DataFrame(
            {"sparse": [1, None, 3, None, 5, None, 7, None],
             "sparser": [1, None, None, None, None, None, None, 8]})
        profile2 = dp.Profiler(data2, samples_per_update=2, min_true_samples=2,
                               profiler_options=opts)
        # Rows are sampled as follows: [6, 5], [1, 4], [2, 3], [0, 7]
        # The first column samples ids 1, 4, 5, 6 before reaching min true
        # samples
        # The second column is sampled completely because it is null at each
        # of ids 1, 4, 5, 6
        # Rows 1 and 5 are entirely null; rows 4 and 6 are null only in col 2
        self.assertEqual(2, profile2.row_is_null_count)
        self.assertEqual(4, profile2.row_has_null_count)
        # Only the 4 rows sampled in the first column count toward the
        # ratios: 2/4 = 0.5 is-null and 4/4 = 1 has-null
        self.assertEqual(0.5, profile2._get_row_is_null_ratio())
        self.assertEqual(1, profile2._get_row_has_null_ratio())
    def test_sample_size_passed_to_profile(self, *mocks):

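        # `mocks` are injected by the mock.patch decorators applied to this
        # test (not shown in this snippet); mocks[0] presumably replaces an
        # internal update method whose second positional argument is the
        # sample size chosen by the profiler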
        update_mock = mocks[0]

        # data setup
        data = pd.DataFrame([0] * int(50e3))

        # option setup
        profiler_options = ProfilerOptions()
        profiler_options.structured_options.multiprocess.is_enabled = False
        profiler_options.set({'data_labeler.is_enabled': False})

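        # The assertions below are consistent with a sample size of
        # min(len(data), max(_min_sample_size, _sampling_ratio * len(data)))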
        # test data size < min_sample_size = 5000 by default
        profiler = dp.Profiler(data[:1000], profiler_options=profiler_options)
        profiler._min_sample_size = 5000
        profiler._sampling_ratio = 0.2
        self.assertEqual(1000, update_mock.call_args[0][1])

        # test data size * 0.20 < min_sample_size < data size
        profiler = dp.Profiler(data[:10000], profiler_options=profiler_options)
        profiler._min_sample_size = 5000
        profiler._sampling_ratio = 0.2
        self.assertEqual(5000, update_mock.call_args[0][1])

        # test data size * 0.20 > min_sample_size
        profiler = dp.Profiler(data, profiler_options=profiler_options)
        profiler._min_sample_size = 5000
        profiler._sampling_ratio = 0.2
        self.assertEqual(10000, update_mock.call_args[0][1])
    def test_validate(self):
        # Valid cases should return None while invalid cases
        # should return or throw a list of errors
        option = self.get_options()
        optpth = self.get_options_path()

        # Default Configuration Is Valid
        self.assertEqual(None, option.validate())

        # Option is_enabled is not a boolean
        for key in self.keys:
            option.set({"{}.text.is_enabled".format(key): "Hello World"})
        expected_error = [
            "{}.{}.text.is_enabled must be a "
            "Boolean.".format(optpth, key) for key in self.keys
        ]
        expected_error = set(expected_error)

        # Verify expected errors are a subset of all errors
        with self.assertRaises(ValueError) as cm:
            option.validate(raise_error=True)
        raised_error = set(str(cm.exception).split("\n"))
        self.assertSetEqual(expected_error,
                            expected_error.intersection(raised_error))
        self.assertSetEqual(
            expected_error,
            expected_error.intersection(set(
                option.validate(raise_error=False))),
        )

        # Wrong Class Type
        option = self.get_options()
        option.structured_options = ProfilerOptions()
        option.unstructured_options = ProfilerOptions()
        expected_error = [
            "{}.structured_options must be a StructuredOptions.".format(
                optpth),
            "{}.unstructured_options must be an UnstructuredOptions.".format(
                optpth),
        ]
        with self.assertRaisesRegex(ValueError, "\n".join(expected_error)):
            option.validate()
        self.assertListEqual(expected_error,
                             option.validate(raise_error=False))
    @classmethod
    def setUpClass(cls):

        cls.input_file_path = os.path.join(test_root_path, 'data',
                                           'csv/aws_honeypot_marx_geo.csv')
        cls.aws_dataset = pd.read_csv(cls.input_file_path)
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})
        cls.trained_schema = dp.Profiler(cls.aws_dataset,
                                         len(cls.aws_dataset),
                                         profiler_options=profiler_options)
    def test_validate_helper(self):
        # Valid cases should return [] while invalid cases
        # should return a list of errors
        option = self.get_options()
        optpth = self.get_options_path()

        # Default Configuration Is Valid
        self.assertEqual([], option._validate_helper())

        # Variable Path Is Not A String
        expected_error = "The variable path must be a string."
        with self.assertRaisesRegex(ValueError, expected_error):
            option._validate_helper(1)

        # Option is_enabled is not a boolean
        for key in self.keys:
            option.set({'{}.text.is_enabled'.format(key): "Hello World"})
        expected_error = [
            '{}.{}.text.is_enabled must be a '
            'Boolean.'.format(optpth, key) for key in self.keys
        ]
        expected_error = set(expected_error)

        # Verify expected errors are a subset of all errors
        self.assertSetEqual(
            expected_error,
            expected_error.intersection(set(option._validate_helper())))

        # Wrong Class Type
        option = self.get_options()
        option.structured_options = ProfilerOptions()
        option.unstructured_options = ProfilerOptions()
        expected_error = [
            '{}.structured_options must be a StructuredOptions.'.format(
                optpth),
            '{}.unstructured_options must be an UnstructuredOptions.'.format(
                optpth)
        ]
        self.assertEqual(expected_error, option._validate_helper())
    def test_correct_null_row_counts(self):
        file_path = os.path.join(test_root_path, 'data', 'csv/empty_rows.txt')
        data = pd.read_csv(file_path)
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})
        profile = dp.Profiler(data, profiler_options=profiler_options)
        self.assertEqual(2, profile.row_has_null_count)
        self.assertEqual(0.25, profile._get_row_has_null_ratio())
        self.assertEqual(2, profile.row_is_null_count)
        self.assertEqual(0.25, profile._get_row_is_null_ratio())

        file_path = os.path.join(test_root_path, 'data',
                                 'csv/iris-with-null-rows.csv')
        data = pd.read_csv(file_path)
        profile = dp.Profiler(data, profiler_options=profiler_options)
        self.assertEqual(13, profile.row_has_null_count)
        self.assertEqual(13/24, profile._get_row_has_null_ratio())
        self.assertEqual(3, profile.row_is_null_count)
        self.assertEqual(3/24, profile._get_row_is_null_ratio())
    def test_null_in_file(self):
        filename_null_in_file = os.path.join(
            test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
        profiler_options = ProfilerOptions()
        profiler_options.set({'data_labeler.is_enabled': False})
        data = dp.Data(filename_null_in_file)
        profile = dp.Profiler(data, profiler_options=profiler_options)

        report = profile.report(report_options={"output_format":"pretty"})
        
        self.assertEqual(
            report['data_stats']['COUNT']['statistics']['null_types_index'],
            {'': '[2, 3, 4, 5, 7, 8]'}
        )
        
        self.assertEqual(
            report['data_stats'][' NUMBERS']['statistics']['null_types_index'],
            {'': '[5, 6, 8]', ' ': '[2, 4]'}
        )
    def test_validate(self):
        # Valid cases should return None while invalid cases
        # should return or throw a list of errors
        option = self.get_options()
        optpth = self.get_options_path()

        # Default Configuration Is Valid
        self.assertEqual(None, option.validate())

        # Option is_enabled is not a boolean
        for key in option.structured_options.properties:
            option.set({'structured_options.{}.is_enabled'.format(key):
                        "Hello World"})

        expected_error = [
            '{}.structured_options.{}.is_enabled must be a '
            'Boolean.'.format(optpth, key)
            for key in option.structured_options.properties
        ]
        expected_error = set(expected_error)

        # Verify expected errors are a subset of all errors
        with self.assertRaises(ValueError) as cm:
            option.validate(raise_error=True)
        raised_error = set(str(cm.exception).split("\n"))
        self.assertSetEqual(expected_error,
                            expected_error.intersection(raised_error))
        self.assertSetEqual(
            expected_error,
            expected_error.intersection(
                set(option.validate(raise_error=False))))

        # Wrong Class Type
        option = self.get_options()
        option.structured_options = ProfilerOptions()

        expected_error = ('{}.structured_options must be a '
                          'StructuredOptions.'.format(optpth))
        with self.assertRaisesRegex(ValueError, expected_error):
            option.validate()
        self.assertEqual([expected_error], option.validate(raise_error=False))
    def get_options(self, **params):
        options = ProfilerOptions()
        options.set(params)
        return options
    def test_null_row_stats_correct_after_updates(self, *mocks):
        data1 = pd.DataFrame([[1, None], [1, 1], [None, None], [None, 1]])
        data2 = pd.DataFrame(
            [[None, None], [1, None], [None, None], [None, 1]])
        opts = ProfilerOptions()
        opts.structured_options.multiprocess.is_enabled = False

        # When setting min true samples/samples per update
        profile = dp.Profiler(data1,
                              min_true_samples=2,
                              samples_per_update=2,
                              profiler_options=opts)
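        # data1 rows 0, 2 and 3 contain a null and row 2 is entirely null,
        # so 3/4 = 0.75 of rows have a null and 1/4 = 0.25 are fully null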
        self.assertEqual(3, profile.row_has_null_count)
        self.assertEqual(1, profile.row_is_null_count)
        self.assertEqual(0.75, profile._get_row_has_null_ratio())
        self.assertEqual(0.25, profile._get_row_is_null_ratio())
        self.assertEqual(4, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2},
                            profile._profile[1].null_types_index['nan'])

        profile.update_profile(data2, min_true_samples=2, sample_size=2)
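        # data2 adds 4 more rows containing a null (2 of them entirely null),
        # giving 7/8 = 0.875 has-null and 3/8 = 0.375 is-null overall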
        self.assertEqual(7, profile.row_has_null_count)
        self.assertEqual(3, profile.row_is_null_count)
        self.assertEqual(0.875, profile._get_row_has_null_ratio())
        self.assertEqual(0.375, profile._get_row_is_null_ratio())
        self.assertEqual(4, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3, 4, 6, 7},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2, 4, 5, 6},
                            profile._profile[1].null_types_index['nan'])

        # When not setting min true samples/samples per update
        opts = ProfilerOptions()
        opts.structured_options.multiprocess.is_enabled = False
        profile = dp.Profiler(data1, profiler_options=opts)
        self.assertEqual(3, profile.row_has_null_count)
        self.assertEqual(1, profile.row_is_null_count)
        self.assertEqual(0.75, profile._get_row_has_null_ratio())
        self.assertEqual(0.25, profile._get_row_is_null_ratio())
        self.assertEqual(4, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2},
                            profile._profile[1].null_types_index['nan'])

        profile.update_profile(data2)
        self.assertEqual(7, profile.row_has_null_count)
        self.assertEqual(3, profile.row_is_null_count)
        self.assertEqual(0.875, profile._get_row_has_null_ratio())
        self.assertEqual(0.375, profile._get_row_is_null_ratio())
        self.assertEqual(4, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3, 4, 6, 7},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2, 4, 5, 6},
                            profile._profile[1].null_types_index['nan'])

        # Test that an update with empty data doesn't change stats
        profile.update_profile(pd.DataFrame([]))
        self.assertEqual(7, profile.row_has_null_count)
        self.assertEqual(3, profile.row_is_null_count)
        self.assertEqual(0.875, profile._get_row_has_null_ratio())
        self.assertEqual(0.375, profile._get_row_is_null_ratio())
        self.assertEqual(0, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3, 4, 6, 7},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2, 4, 5, 6},
                            profile._profile[1].null_types_index['nan'])

        # Test one row update
        profile.update_profile(pd.DataFrame([[1, None]]))
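        # The appended row [1, None] is null only in its second column, so
        # has-null rises to 8 of 9 rows while fully-null stays at 3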
        self.assertEqual(8, profile.row_has_null_count)
        self.assertEqual(3, profile.row_is_null_count)
        self.assertEqual(8 / 9, profile._get_row_has_null_ratio())
        self.assertEqual(3 / 9, profile._get_row_is_null_ratio())
        self.assertEqual(1, profile._min_sampled_from_batch)
        self.assertSetEqual({2, 3, 4, 6, 7},
                            profile._profile[0].null_types_index['nan'])
        self.assertSetEqual({0, 2, 4, 5, 6},
                            profile._profile[1].null_types_index['nan'])
        # pandas reads the single-row update as object dtype rather than
        # float64, so its missing value is recorded as 'None', not 'nan'
        self.assertSetEqual({8}, profile._profile[1].null_types_index['None'])